diff --git a/.github/actions/custom-build-and-push/action.yml b/.github/actions/custom-build-and-push/action.yml index 48344237059..fbee0554995 100644 --- a/.github/actions/custom-build-and-push/action.yml +++ b/.github/actions/custom-build-and-push/action.yml @@ -32,16 +32,20 @@ inputs: description: 'Cache destinations' required: false retry-wait-time: - description: 'Time to wait before retry in seconds' + description: 'Time to wait before attempt 2 in seconds' required: false - default: '5' + default: '60' + retry-wait-time-2: + description: 'Time to wait before attempt 3 in seconds' + required: false + default: '120' runs: using: "composite" steps: - - name: Build and push Docker image (First Attempt) + - name: Build and push Docker image (Attempt 1 of 3) id: buildx1 - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 continue-on-error: true with: context: ${{ inputs.context }} @@ -54,16 +58,41 @@ runs: cache-from: ${{ inputs.cache-from }} cache-to: ${{ inputs.cache-to }} - - name: Wait to retry + - name: Wait before attempt 2 if: steps.buildx1.outcome != 'success' run: | echo "First attempt failed. Waiting ${{ inputs.retry-wait-time }} seconds before retry..." sleep ${{ inputs.retry-wait-time }} shell: bash - - name: Build and push Docker image (Retry Attempt) + - name: Build and push Docker image (Attempt 2 of 3) + id: buildx2 if: steps.buildx1.outcome != 'success' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 + continue-on-error: true + with: + context: ${{ inputs.context }} + file: ${{ inputs.file }} + platforms: ${{ inputs.platforms }} + pull: ${{ inputs.pull }} + push: ${{ inputs.push }} + load: ${{ inputs.load }} + tags: ${{ inputs.tags }} + cache-from: ${{ inputs.cache-from }} + cache-to: ${{ inputs.cache-to }} + + - name: Wait before attempt 3 + if: steps.buildx1.outcome != 'success' && steps.buildx2.outcome != 'success' + run: | + echo "Second attempt failed. Waiting ${{ inputs.retry-wait-time-2 }} seconds before retry..." + sleep ${{ inputs.retry-wait-time-2 }} + shell: bash + + - name: Build and push Docker image (Attempt 3 of 3) + id: buildx3 + if: steps.buildx1.outcome != 'success' && steps.buildx2.outcome != 'success' + uses: docker/build-push-action@v6 + continue-on-error: true with: context: ${{ inputs.context }} file: ${{ inputs.file }} @@ -74,3 +103,10 @@ runs: tags: ${{ inputs.tags }} cache-from: ${{ inputs.cache-from }} cache-to: ${{ inputs.cache-to }} + + - name: Report failure + if: steps.buildx1.outcome != 'success' && steps.buildx2.outcome != 'success' && steps.buildx3.outcome != 'success' + run: | + echo "All attempts failed. Possible transient infrastructure issues? Try again later or inspect logs for details." + exit 1
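As a usage sketch (not part of this diff; image name, paths, and wait times are illustrative), a workflow step could wire up the composite action's new retry inputs roughly like this:

```yaml
# Hypothetical caller of the composite action; values are illustrative, not taken from this PR.
- name: Build Backend Docker image
  uses: ./.github/actions/custom-build-and-push
  with:
    context: ./backend
    file: ./backend/Dockerfile
    platforms: linux/amd64
    tags: danswer/danswer-backend:test
    push: false
    load: true
    retry-wait-time: "30"    # seconds to wait before attempt 2
    retry-wait-time-2: "90"  # seconds to wait before attempt 3
```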
+ shell: bash diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index e57283f0377..8287f9b5300 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -6,20 +6,24 @@ [Describe the tests you ran to verify your changes] -## Accepted Risk -[Any know risks or failure modes to point out to reviewers] +## Accepted Risk (provide if relevant) +N/A -## Related Issue(s) -[If applicable, link to the issue(s) this PR addresses] +## Related Issue(s) (provide if relevant) +N/A -## Checklist: -- [ ] All of the automated tests pass -- [ ] All PR comments are addressed and marked resolved -- [ ] If there are migrations, they have been rebased to latest main -- [ ] If there are new dependencies, they are added to the requirements -- [ ] If there are new environment variables, they are added to all of the deployment methods -- [ ] If there are new APIs that don't require auth, they are added to PUBLIC_ENDPOINT_SPECS -- [ ] Docker images build and basic functionalities work -- [ ] Author has done a final read through of the PR right before merge +## Mental Checklist: +- All of the automated tests pass +- All PR comments are addressed and marked resolved +- If there are migrations, they have been rebased to latest main +- If there are new dependencies, they are added to the requirements +- If there are new environment variables, they are added to all of the deployment methods +- If there are new APIs that don't require auth, they are added to PUBLIC_ENDPOINT_SPECS +- Docker images build and basic functionalities work +- Author has done a final read through of the PR right before merge + +## Backporting (check the box to trigger backport action) +Note: You have to check that the action passes, otherwise resolve the conflicts manually and tag the patches. 
+- [ ] This PR should be backported (make sure to check that the backport attempt succeeds) diff --git a/.github/workflows/docker-build-push-backend-container-on-tag.yml b/.github/workflows/docker-build-push-backend-container-on-tag.yml index 8b77af29df7..ef33750c271 100644 --- a/.github/workflows/docker-build-push-backend-container-on-tag.yml +++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml @@ -3,47 +3,61 @@ name: Build and Push Backend Image on Tag on: push: tags: - - '*' + - "*" env: - REGISTRY_IMAGE: danswer/danswer-backend + REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-backend-cloud' || 'danswer/danswer-backend' }} + LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} jobs: build-and-push: - # TODO: make this a matrix build like the web containers - runs-on: - group: amd64-image-builders + # TODO: investigate a matrix build like the web container + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"] steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Backend Image Docker Build and Push - uses: docker/build-push-action@v5 - with: - context: ./backend - file: ./backend/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: | - ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - ${{ env.REGISTRY_IMAGE }}:latest - build-args: | - DANSWER_VERSION=${{ github.ref_name }} - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend - image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - severity: 'CRITICAL,HIGH' - trivyignores: ./backend/.trivyignore + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Install build-essential + run: | + sudo apt-get update + sudo apt-get install -y build-essential + + - name: Backend Image Docker Build and Push + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} + build-args: | + DANSWER_VERSION=${{ github.ref_name }} + + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" + with: + # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend + image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + severity: "CRITICAL,HIGH" + trivyignores: ./backend/.trivyignore diff --git 
a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml new file mode 100644 index 00000000000..45cd5093a0c --- /dev/null +++ b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml @@ -0,0 +1,137 @@ +name: Build and Push Cloud Web Image on Tag +# Identical to the web container build, but with correct image tag and build args + +on: + push: + tags: + - "*" + +env: + REGISTRY_IMAGE: danswer/danswer-web-server-cloud + LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} + +jobs: + build: + runs-on: + - runs-on + - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }} + - run-id=${{ github.run_id }} + - tag=platform-${{ matrix.platform }} + strategy: + fail-fast: false + matrix: + platform: + - linux/amd64 + - linux/arm64 + + steps: + - name: Prepare + run: | + platform=${{ matrix.platform }} + echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV + + - name: Checkout + uses: actions/checkout@v4 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY_IMAGE }} + tags: | + type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Build and push by digest + id: build + uses: docker/build-push-action@v5 + with: + context: ./web + file: ./web/Dockerfile + platforms: ${{ matrix.platform }} + push: true + build-args: | + DANSWER_VERSION=${{ github.ref_name }} + NEXT_PUBLIC_CLOUD_ENABLED=true + NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }} + NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }} + NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }} + NEXT_PUBLIC_GTM_ENABLED=true + # needed due to weird interactions with the builds for different platforms + no-cache: true + labels: ${{ steps.meta.outputs.labels }} + outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true + + - name: Export digest + run: | + mkdir -p /tmp/digests + digest="${{ steps.build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: digests-${{ env.PLATFORM_PAIR }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + + merge: + runs-on: ubuntu-latest + needs: + - build + steps: + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digests-* + merge-multiple: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY_IMAGE }} + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Create manifest list and push + working-directory: /tmp/digests + run: | + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) 
| join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *) + + - name: Inspect image + run: | + docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }} + + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" + with: + image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + severity: "CRITICAL,HIGH" diff --git a/.github/workflows/docker-build-push-model-server-container-on-tag.yml b/.github/workflows/docker-build-push-model-server-container-on-tag.yml index 134b77d43c2..3e0445ab04a 100644 --- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml +++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml @@ -3,41 +3,53 @@ name: Build and Push Model Server Image on Tag on: push: tags: - - '*' + - "*" + +env: + REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-model-server-cloud' || 'danswer/danswer-model-server' }} + LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} jobs: build-and-push: - runs-on: - group: amd64-image-builders + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"] steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Model Server Image Docker Build and Push - uses: docker/build-push-action@v5 - with: - context: ./backend - file: ./backend/Dockerfile.model_server - platforms: linux/amd64,linux/arm64 - push: true - tags: | - danswer/danswer-model-server:${{ github.ref_name }} - danswer/danswer-model-server:latest - build-args: | - DANSWER_VERSION=${{ github.ref_name }} - - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }} - severity: 'CRITICAL,HIGH' + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Model Server Image Docker Build and Push + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile.model_server + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} + build-args: | + DANSWER_VERSION=${{ github.ref_name }} + + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 + - name: Run 
Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" + with: + image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }} + severity: "CRITICAL,HIGH" diff --git a/.github/workflows/docker-build-push-web-container-on-tag.yml b/.github/workflows/docker-build-push-web-container-on-tag.yml index 0a97a01f7c8..4f1fd804969 100644 --- a/.github/workflows/docker-build-push-web-container-on-tag.yml +++ b/.github/workflows/docker-build-push-web-container-on-tag.yml @@ -7,11 +7,15 @@ on: env: REGISTRY_IMAGE: danswer/danswer-web-server - + LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} + jobs: build: - runs-on: - group: ${{ matrix.platform == 'linux/amd64' && 'amd64-image-builders' || 'arm64-image-builders' }} + runs-on: + - runs-on + - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }} + - run-id=${{ github.run_id }} + - tag=platform-${{ matrix.platform }} strategy: fail-fast: false matrix: @@ -35,7 +39,7 @@ jobs: images: ${{ env.REGISTRY_IMAGE }} tags: | type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - type=raw,value=${{ env.REGISTRY_IMAGE }}:latest + type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -112,8 +116,16 @@ jobs: run: | docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }} + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2' + TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1' with: image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} severity: 'CRITICAL,HIGH' diff --git a/.github/workflows/docker-tag-latest.yml b/.github/workflows/docker-tag-latest.yml index c0853ff3835..e2c7c30f31e 100644 --- a/.github/workflows/docker-tag-latest.yml +++ b/.github/workflows/docker-tag-latest.yml @@ -1,3 +1,6 @@ +# This workflow is set up to be manually triggered via the GitHub Action tab. +# Given a version, it will tag those backend and webserver images as "latest". + name: Tag Latest Version on: @@ -9,7 +12,9 @@ on: jobs: tag: - runs-on: ubuntu-latest + # See https://runs-on.com/runners/linux/ + # use a lower powered instance since this just does i/o to docker hub + runs-on: [runs-on,runner=2cpu-linux-x64,"run-id=${{ github.run_id }}"] steps: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v1 diff --git a/.github/workflows/hotfix-release-branches.yml b/.github/workflows/hotfix-release-branches.yml new file mode 100644 index 00000000000..0e921f8d694 --- /dev/null +++ b/.github/workflows/hotfix-release-branches.yml @@ -0,0 +1,172 @@ +# This workflow is intended to be manually triggered via the GitHub Action tab. 
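As an aside on the tag-driven image selection added in the build-on-tag workflows above: it leans on GitHub's `a && b || c` expression idiom. A hedged sketch of how it resolves, using the backend workflow's values (the pushed tag names below are hypothetical):

```yaml
# Illustrative only; the tag names are hypothetical.
# Pushed tag "v0.9.0-cloud.1":
#   contains(github.ref_name, 'cloud')  -> true   => REGISTRY_IMAGE = danswer/danswer-backend-cloud
#   contains(github.ref_name, 'latest') -> false  => the ":latest" entry renders empty
# Pushed tag "v0.9.0-latest":
#   contains(github.ref_name, 'cloud')  -> false  => REGISTRY_IMAGE = danswer/danswer-backend
#   LATEST_TAG == 'true'                          => format('{0}:latest', env.REGISTRY_IMAGE) also tags ":latest"
env:
  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-backend-cloud' || 'danswer/danswer-backend' }}
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
```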
+# Given a hotfix branch, it will attempt to open a PR to all release branches and +# by default auto merge them + +name: Hotfix release branches + +on: + workflow_dispatch: + inputs: + hotfix_commit: + description: 'Hotfix commit hash' + required: true + hotfix_suffix: + description: 'Hotfix branch suffix (e.g. hotfix/v0.8-{suffix})' + required: true + release_branch_pattern: + description: 'Release branch pattern (regex)' + required: true + default: 'release/.*' + auto_merge: + description: 'Automatically merge the hotfix PRs' + required: true + type: choice + default: 'true' + options: + - true + - false + +jobs: + hotfix_release_branches: + permissions: write-all + # See https://runs-on.com/runners/linux/ + # use a lower powered instance since this just does i/o to docker hub + runs-on: [runs-on,runner=2cpu-linux-x64,"run-id=${{ github.run_id }}"] + steps: + + # needs RKUO_DEPLOY_KEY for write access to merge PR's + - name: Checkout Repository + uses: actions/checkout@v4 + with: + ssh-key: "${{ secrets.RKUO_DEPLOY_KEY }}" + fetch-depth: 0 + + - name: Set up Git user + run: | + git config user.name "Richard Kuo [bot]" + git config user.email "rkuo[bot]@danswer.ai" + + - name: Fetch All Branches + run: | + git fetch --all --prune + + - name: Verify Hotfix Commit Exists + run: | + git rev-parse --verify "${{ github.event.inputs.hotfix_commit }}" || { echo "Commit not found: ${{ github.event.inputs.hotfix_commit }}"; exit 1; } + + - name: Get Release Branches + id: get_release_branches + run: | + BRANCHES=$(git branch -r | grep -E "${{ github.event.inputs.release_branch_pattern }}" | sed 's|origin/||' | tr -d ' ') + if [ -z "$BRANCHES" ]; then + echo "No release branches found matching pattern '${{ github.event.inputs.release_branch_pattern }}'." + exit 1 + fi + + echo "Found release branches:" + echo "$BRANCHES" + + # Join the branches into a single line separated by commas + BRANCHES_JOINED=$(echo "$BRANCHES" | tr '\n' ',' | sed 's/,$//') + + # Set the branches as an output + echo "branches=$BRANCHES_JOINED" >> $GITHUB_OUTPUT + + # notes on all the vagaries of wiring up automated PR's + # https://github.com/peter-evans/create-pull-request/blob/main/docs/concepts-guidelines.md#triggering-further-workflow-runs + # we must use a custom token for GH_TOKEN to trigger the subsequent PR checks + - name: Create and Merge Pull Requests to Matching Release Branches + env: + HOTFIX_COMMIT: ${{ github.event.inputs.hotfix_commit }} + HOTFIX_SUFFIX: ${{ github.event.inputs.hotfix_suffix }} + AUTO_MERGE: ${{ github.event.inputs.auto_merge }} + GH_TOKEN: ${{ secrets.RKUO_PERSONAL_ACCESS_TOKEN }} + run: | + # Get the branches from the previous step + BRANCHES="${{ steps.get_release_branches.outputs.branches }}" + + # Convert BRANCHES to an array + IFS=$',' read -ra BRANCH_ARRAY <<< "$BRANCHES" + + # Loop through each release branch and create and merge a PR + for RELEASE_BRANCH in "${BRANCH_ARRAY[@]}"; do + echo "Processing $RELEASE_BRANCH..." + + # Parse out the release version by removing "release/" from the branch name + RELEASE_VERSION=${RELEASE_BRANCH#release/} + echo "Release version parsed: $RELEASE_VERSION" + + HOTFIX_BRANCH="hotfix/${RELEASE_VERSION}-${HOTFIX_SUFFIX}" + echo "Creating PR from $HOTFIX_BRANCH to $RELEASE_BRANCH" + + # Checkout the release branch + echo "Checking out $RELEASE_BRANCH" + git checkout "$RELEASE_BRANCH" + + # Create the new hotfix branch + if git rev-parse --verify "$HOTFIX_BRANCH" >/dev/null 2>&1; then + echo "Hotfix branch $HOTFIX_BRANCH already exists. 
Skipping branch creation." + else + echo "Branching $RELEASE_BRANCH to $HOTFIX_BRANCH" + git checkout -b "$HOTFIX_BRANCH" + fi + + # Check if the hotfix commit is a merge commit + if git rev-list --merges -n 1 "$HOTFIX_COMMIT" >/dev/null 2>&1; then + # -m 1 uses the target branch as the base (which is what we want) + echo "Hotfix commit $HOTFIX_COMMIT is a merge commit, using -m 1 for cherry-pick" + CHERRY_PICK_CMD="git cherry-pick -m 1 $HOTFIX_COMMIT" + else + CHERRY_PICK_CMD="git cherry-pick $HOTFIX_COMMIT" + fi + + # Perform the cherry-pick + echo "Executing: $CHERRY_PICK_CMD" + eval "$CHERRY_PICK_CMD" + + if [ $? -ne 0 ]; then + echo "Cherry-pick failed for $HOTFIX_COMMIT on $HOTFIX_BRANCH. Aborting..." + git cherry-pick --abort + continue + fi + + # Push the hotfix branch to the remote + echo "Pushing $HOTFIX_BRANCH..." + git push origin "$HOTFIX_BRANCH" + echo "Hotfix branch $HOTFIX_BRANCH created and pushed." + + # Check if PR already exists + EXISTING_PR=$(gh pr list --head "$HOTFIX_BRANCH" --base "$RELEASE_BRANCH" --state open --json number --jq '.[0].number') + + if [ -n "$EXISTING_PR" ]; then + echo "An open PR already exists: #$EXISTING_PR. Skipping..." + continue + fi + + # Create a new PR and capture the output + PR_OUTPUT=$(gh pr create --title "Merge $HOTFIX_BRANCH into $RELEASE_BRANCH" \ + --body "Automated PR to merge \`$HOTFIX_BRANCH\` into \`$RELEASE_BRANCH\`." \ + --head "$HOTFIX_BRANCH" --base "$RELEASE_BRANCH") + + # Extract the URL from the output + PR_URL=$(echo "$PR_OUTPUT" | grep -Eo 'https://github.com/[^ ]+') + echo "Pull request created: $PR_URL" + + # Extract PR number from URL + PR_NUMBER=$(basename "$PR_URL") + echo "Pull request created: $PR_NUMBER" + + if [ "$AUTO_MERGE" == "true" ]; then + echo "Attempting to merge pull request #$PR_NUMBER" + + # Attempt to merge the PR + gh pr merge "$PR_NUMBER" --merge --auto --delete-branch + + if [ $? -eq 0 ]; then + echo "Pull request #$PR_NUMBER merged successfully." + else + # Optionally, handle the error or continue + echo "Failed to merge pull request #$PR_NUMBER." + fi + fi + done \ No newline at end of file diff --git a/.github/workflows/nightly-close-stale-issues.yml b/.github/workflows/nightly-close-stale-issues.yml new file mode 100644 index 00000000000..a7d296e0a92 --- /dev/null +++ b/.github/workflows/nightly-close-stale-issues.yml @@ -0,0 +1,23 @@ +name: 'Nightly - Close stale issues and PRs' +on: + schedule: + - cron: '0 11 * * *' # Runs every day at 3 AM PST / 4 AM PDT / 11 AM UTC + +permissions: + # contents: write # only for delete-branch option + issues: write + pull-requests: write + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v9 + with: + stale-issue-message: 'This issue is stale because it has been open 75 days with no activity. Remove stale label or comment or this will be closed in 15 days.' + stale-pr-message: 'This PR is stale because it has been open 75 days with no activity. Remove stale label or comment or this will be closed in 15 days.' + close-issue-message: 'This issue was closed because it has been stalled for 90 days with no activity.' + close-pr-message: 'This PR was closed because it has been stalled for 90 days with no activity.' 
+ days-before-stale: 75 +# days-before-close: 90 # uncomment after we test stale behavior + \ No newline at end of file diff --git a/.github/workflows/nightly-scan-licenses.yml b/.github/workflows/nightly-scan-licenses.yml new file mode 100644 index 00000000000..9aa7030e0b9 --- /dev/null +++ b/.github/workflows/nightly-scan-licenses.yml @@ -0,0 +1,76 @@ +# Scan for problematic software licenses + +# trivy has their own rate limiting issues causing this action to flake +# we worked around it by hardcoding to different db repos in env +# can re-enable when they figure it out +# https://github.com/aquasecurity/trivy/discussions/7538 +# https://github.com/aquasecurity/trivy-action/issues/389 + +name: 'Nightly - Scan licenses' +on: +# schedule: +# - cron: '0 14 * * *' # Runs every day at 6 AM PST / 7 AM PDT / 2 PM UTC + workflow_dispatch: # Allows manual triggering + +permissions: + actions: read + contents: read + security-events: write + +jobs: + scan-licenses: + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=2cpu-linux-x64,"run-id=${{ github.run_id }}"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + cache-dependency-path: | + backend/requirements/default.txt + backend/requirements/dev.txt + backend/requirements/model_server.txt + + - name: Get explicit and transitive dependencies + run: | + python -m pip install --upgrade pip + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt + pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt + pip freeze > requirements-all.txt + + - name: Check python + id: license_check_report + uses: pilosus/action-pip-license-checker@v2 + with: + requirements: 'requirements-all.txt' + fail: 'Copyleft' + exclude: '(?i)^(pylint|aio[-_]*).*' + + - name: Print report + if: ${{ always() }} + run: echo "${{ steps.license_check_report.outputs.report }}" + + - name: Install npm dependencies + working-directory: ./web + run: npm ci + + - name: Run Trivy vulnerability scanner in repo mode + uses: aquasecurity/trivy-action@0.28.0 + with: + scan-type: fs + scanners: license + format: table +# format: sarif +# output: trivy-results.sarif + severity: HIGH,CRITICAL + +# - name: Upload Trivy scan results to GitHub Security tab +# uses: github/codeql-action/upload-sarif@v3 +# with: +# sarif_file: trivy-results.sarif diff --git a/.github/workflows/pr-backport-autotrigger.yml b/.github/workflows/pr-backport-autotrigger.yml new file mode 100644 index 00000000000..273f00a5c5a --- /dev/null +++ b/.github/workflows/pr-backport-autotrigger.yml @@ -0,0 +1,124 @@ +name: Backport on Merge + +# Note this workflow does not trigger the builds, be sure to manually tag the branches to trigger the builds + +on: + pull_request: + types: [closed] # Later we check for merge so only PRs that go in can get backported + +permissions: + contents: write + actions: write + +jobs: + backport: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + env: + GITHUB_TOKEN: ${{ secrets.YUHONG_GH_ACTIONS }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ssh-key: "${{ secrets.RKUO_DEPLOY_KEY }}" + fetch-depth: 0 + + - name: Set up Git user + run: | + git config user.name "Richard Kuo [bot]" + git config user.email "rkuo[bot]@danswer.ai" + git fetch --prune + + - name: Check for Backport Checkbox + id: checkbox-check 
+ run: | + PR_BODY="${{ github.event.pull_request.body }}" + if [[ "$PR_BODY" == *"[x] This PR should be backported"* ]]; then + echo "backport=true" >> $GITHUB_OUTPUT + else + echo "backport=false" >> $GITHUB_OUTPUT + fi + + - name: List and sort release branches + id: list-branches + run: | + git fetch --all --tags + BRANCHES=$(git for-each-ref --format='%(refname:short)' refs/remotes/origin/release/* | sed 's|origin/release/||' | sort -Vr) + BETA=$(echo "$BRANCHES" | head -n 1) + STABLE=$(echo "$BRANCHES" | head -n 2 | tail -n 1) + echo "beta=release/$BETA" >> $GITHUB_OUTPUT + echo "stable=release/$STABLE" >> $GITHUB_OUTPUT + # Fetch latest tags for beta and stable + LATEST_BETA_TAG=$(git tag -l "v[0-9]*.[0-9]*.[0-9]*-beta.[0-9]*" | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+-beta\.[0-9]+$" | grep -v -- "-cloud" | sort -Vr | head -n 1) + LATEST_STABLE_TAG=$(git tag -l "v[0-9]*.[0-9]*.[0-9]*" | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | sort -Vr | head -n 1) + + # Handle case where no beta tags exist + if [[ -z "$LATEST_BETA_TAG" ]]; then + NEW_BETA_TAG="v1.0.0-beta.1" + else + NEW_BETA_TAG=$(echo $LATEST_BETA_TAG | awk -F '[.-]' '{print $1 "." $2 "." $3 "-beta." ($NF+1)}') + fi + + # Increment latest stable tag + NEW_STABLE_TAG=$(echo $LATEST_STABLE_TAG | awk -F '.' '{print $1 "." $2 "." ($3+1)}') + echo "latest_beta_tag=$LATEST_BETA_TAG" >> $GITHUB_OUTPUT + echo "latest_stable_tag=$LATEST_STABLE_TAG" >> $GITHUB_OUTPUT + echo "new_beta_tag=$NEW_BETA_TAG" >> $GITHUB_OUTPUT + echo "new_stable_tag=$NEW_STABLE_TAG" >> $GITHUB_OUTPUT + + - name: Echo branch and tag information + run: | + echo "Beta branch: ${{ steps.list-branches.outputs.beta }}" + echo "Stable branch: ${{ steps.list-branches.outputs.stable }}" + echo "Latest beta tag: ${{ steps.list-branches.outputs.latest_beta_tag }}" + echo "Latest stable tag: ${{ steps.list-branches.outputs.latest_stable_tag }}" + echo "New beta tag: ${{ steps.list-branches.outputs.new_beta_tag }}" + echo "New stable tag: ${{ steps.list-branches.outputs.new_stable_tag }}" + + - name: Trigger Backport + if: steps.checkbox-check.outputs.backport == 'true' + run: | + set -e + echo "Backporting to beta ${{ steps.list-branches.outputs.beta }} and stable ${{ steps.list-branches.outputs.stable }}" + + # Echo the merge commit SHA + echo "Merge commit SHA: ${{ github.event.pull_request.merge_commit_sha }}" + + # Fetch all history for all branches and tags + git fetch --prune + + # Reset and prepare the beta branch + git checkout ${{ steps.list-branches.outputs.beta }} + echo "Last 5 commits on beta branch:" + git log -n 5 --pretty=format:"%H" + echo "" # Newline for formatting + + # Cherry-pick the merge commit from the merged PR + git cherry-pick -m 1 ${{ github.event.pull_request.merge_commit_sha }} || { + echo "Cherry-pick to beta failed due to conflicts." + exit 1 + } + + # Create new beta branch/tag + git tag ${{ steps.list-branches.outputs.new_beta_tag }} + # Push the changes and tag to the beta branch using PAT + git push origin ${{ steps.list-branches.outputs.beta }} + git push origin ${{ steps.list-branches.outputs.new_beta_tag }} + + # Reset and prepare the stable branch + git checkout ${{ steps.list-branches.outputs.stable }} + echo "Last 5 commits on stable branch:" + git log -n 5 --pretty=format:"%H" + echo "" # Newline for formatting + + # Cherry-pick the merge commit from the merged PR + git cherry-pick -m 1 ${{ github.event.pull_request.merge_commit_sha }} || { + echo "Cherry-pick to stable failed due to conflicts." 
+ exit 1 + } + + # Create new stable branch/tag + git tag ${{ steps.list-branches.outputs.new_stable_tag }} + # Push the changes and tag to the stable branch using PAT + git push origin ${{ steps.list-branches.outputs.stable }} + git push origin ${{ steps.list-branches.outputs.new_stable_tag }} diff --git a/.github/workflows/pr-chromatic-tests.yml b/.github/workflows/pr-chromatic-tests.yml new file mode 100644 index 00000000000..5d8b29ed572 --- /dev/null +++ b/.github/workflows/pr-chromatic-tests.yml @@ -0,0 +1,225 @@ +name: Run Chromatic Tests +concurrency: + group: Run-Chromatic-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }} + cancel-in-progress: true + +on: push + +env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + +jobs: + playwright-tests: + name: Playwright Tests + + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"] + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + cache-dependency-path: | + backend/requirements/default.txt + backend/requirements/dev.txt + backend/requirements/model_server.txt + - run: | + python -m pip install --upgrade pip + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt + pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt + + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install node dependencies + working-directory: ./web + run: npm ci + + - name: Install playwright browsers + working-directory: ./web + run: npx playwright install --with-deps + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + # tag every docker image with "test" so that we can spin up the correct set + # of images during testing + + # we use the runs-on cache for docker builds + # in conjunction with runs-on runners, it has better speed and unlimited caching + # https://runs-on.com/caching/s3-cache-for-github-actions/ + # https://runs-on.com/caching/docker/ + # https://github.com/moby/buildkit#s3-cache-experimental + + # images are built and run locally for testing purposes. Not pushed. 
+ + - name: Build Web Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./web + file: ./web/Dockerfile + platforms: linux/amd64 + tags: danswer/danswer-web-server:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + - name: Build Backend Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./backend + file: ./backend/Dockerfile + platforms: linux/amd64 + tags: danswer/danswer-backend:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + - name: Build Model Server Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./backend + file: ./backend/Dockerfile.model_server + platforms: linux/amd64 + tags: danswer/danswer-model-server:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + - name: Start Docker containers + run: | + cd deployment/docker_compose + ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \ + AUTH_TYPE=basic \ + REQUIRE_EMAIL_VERIFICATION=false \ + DISABLE_TELEMETRY=true \ + IMAGE_TAG=test \ + docker compose -f docker-compose.dev.yml -p danswer-stack up -d + id: start_docker + + - name: Wait for service to be ready + run: | + echo "Starting wait-for-service script..." + + docker logs -f danswer-stack-api_server-1 & + + start_time=$(date +%s) + timeout=300 # 5 minutes in seconds + + while true; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + + if [ $elapsed_time -ge $timeout ]; then + echo "Timeout reached. Service did not become ready in 5 minutes." + exit 1 + fi + + # Use curl with error handling to ignore specific exit code 56 + response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error") + + if [ "$response" = "200" ]; then + echo "Service is ready!" + break + elif [ "$response" = "curl_error" ]; then + echo "Curl encountered an error, possibly exit code 56. Continuing to retry..." + else + echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..." + fi + + sleep 5 + done + echo "Finished waiting for service." + + - name: Run pytest playwright test init + working-directory: ./backend + env: + PYTEST_IGNORE_SKIP: true + run: pytest -s tests/integration/tests/playwright/test_playwright.py + + - name: Run Playwright tests + working-directory: ./web + run: npx playwright test + + - uses: actions/upload-artifact@v4 + if: always() + with: + # Chromatic automatically defaults to the test-results directory. + # Replace with the path to your custom directory and adjust the CHROMATIC_ARCHIVE_LOCATION environment variable accordingly. 
+ name: test-results + path: ./web/test-results + retention-days: 30 + + # save before stopping the containers so the logs can be captured + - name: Save Docker logs + if: success() || failure() + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log + mv docker-compose.log ${{ github.workspace }}/docker-compose.log + + - name: Upload logs + if: success() || failure() + uses: actions/upload-artifact@v4 + with: + name: docker-logs + path: ${{ github.workspace }}/docker-compose.log + + - name: Stop Docker containers + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack down -v + + chromatic-tests: + name: Chromatic Tests + + needs: playwright-tests + runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"] + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: 22 + + - name: Install node dependencies + working-directory: ./web + run: npm ci + + - name: Download Playwright test results + uses: actions/download-artifact@v4 + with: + name: test-results + path: ./web/test-results + + - name: Run Chromatic + uses: chromaui/action@latest + with: + playwright: true + projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }} + workingDir: ./web + env: + CHROMATIC_ARCHIVE_LOCATION: ./test-results diff --git a/.github/workflows/pr-helm-chart-testing.yml b/.github/workflows/pr-helm-chart-testing.yml new file mode 100644 index 00000000000..f26ab43e780 --- /dev/null +++ b/.github/workflows/pr-helm-chart-testing.yml @@ -0,0 +1,72 @@ +name: Helm - Lint and Test Charts + +on: + merge_group: + pull_request: + branches: [ main ] + workflow_dispatch: # Allows manual triggering + +jobs: + helm-chart-check: + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,hdd=256,"run-id=${{ github.run_id }}"] + + # fetch-depth 0 is required for helm/chart-testing-action + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v4.2.0 + with: + version: v3.14.4 + + - name: Set up chart-testing + uses: helm/chart-testing-action@v2.6.1 + + # even though we specify chart-dirs in ct.yaml, it isn't used by ct for the list-changed command... + - name: Run chart-testing (list-changed) + id: list-changed + run: | + echo "default_branch: ${{ github.event.repository.default_branch }}" + changed=$(ct list-changed --remote origin --target-branch ${{ github.event.repository.default_branch }} --chart-dirs deployment/helm/charts) + echo "list-changed output: $changed" + if [[ -n "$changed" ]]; then + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + +# rkuo: I don't think we need python? 
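For context on the `--chart-dirs` flag and the ct.yaml remark above, a hypothetical ct.yaml of the shape the lint/install steps expect (the repo's actual file may differ):

```yaml
# Hypothetical ct.yaml sketch; only chart-dirs is referenced explicitly in this workflow.
chart-dirs:
  - deployment/helm/charts
target-branch: main
```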
+# - name: Set up Python +# uses: actions/setup-python@v5 +# with: +# python-version: '3.11' +# cache: 'pip' +# cache-dependency-path: | +# backend/requirements/default.txt +# backend/requirements/dev.txt +# backend/requirements/model_server.txt +# - run: | +# python -m pip install --upgrade pip +# pip install --retries 5 --timeout 30 -r backend/requirements/default.txt +# pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt +# pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt + + # lint all charts if any changes were detected + - name: Run chart-testing (lint) + if: steps.list-changed.outputs.changed == 'true' + run: ct lint --config ct.yaml --all + # the following would lint only changed charts, but linting isn't expensive + # run: ct lint --config ct.yaml --target-branch ${{ github.event.repository.default_branch }} + + - name: Create kind cluster + if: steps.list-changed.outputs.changed == 'true' + uses: helm/kind-action@v1.10.0 + + - name: Run chart-testing (install) + if: steps.list-changed.outputs.changed == 'true' + run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml + # the following would install only changed charts, but we only have one chart so + # don't worry about that for now + # run: ct install --target-branch ${{ github.event.repository.default_branch }} diff --git a/.github/workflows/pr-helm-chart-testing.yml.disabled.txt b/.github/workflows/pr-helm-chart-testing.yml.disabled.txt deleted file mode 100644 index 7c4903a07f7..00000000000 --- a/.github/workflows/pr-helm-chart-testing.yml.disabled.txt +++ /dev/null @@ -1,67 +0,0 @@ -# This workflow is intentionally disabled while we're still working on it -# It's close to ready, but a race condition needs to be fixed with -# API server and Vespa startup, and it needs to have a way to build/test against -# local containers - -name: Helm - Lint and Test Charts - -on: - merge_group: - pull_request: - branches: [ main ] - -jobs: - lint-test: - runs-on: Amd64 - - # fetch-depth 0 is required for helm/chart-testing-action - steps: - - name: Checkout code - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Helm - uses: azure/setup-helm@v4.2.0 - with: - version: v3.14.4 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - cache: 'pip' - cache-dependency-path: | - backend/requirements/default.txt - backend/requirements/dev.txt - backend/requirements/model_server.txt - - run: | - python -m pip install --upgrade pip - pip install -r backend/requirements/default.txt - pip install -r backend/requirements/dev.txt - pip install -r backend/requirements/model_server.txt - - - name: Set up chart-testing - uses: helm/chart-testing-action@v2.6.1 - - - name: Run chart-testing (list-changed) - id: list-changed - run: | - changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }}) - if [[ -n "$changed" ]]; then - echo "changed=true" >> "$GITHUB_OUTPUT" - fi - - - name: Run chart-testing (lint) -# if: steps.list-changed.outputs.changed == 'true' - run: ct lint --all --config ct.yaml --target-branch ${{ github.event.repository.default_branch }} - - - name: Create kind cluster -# if: steps.list-changed.outputs.changed == 'true' - uses: helm/kind-action@v1.10.0 - - - name: Run chart-testing (install) -# if: steps.list-changed.outputs.changed == 'true' - run: ct install --all --config ct.yaml -# run: ct install --target-branch ${{ github.event.repository.default_branch }} - \ No 
newline at end of file diff --git a/.github/workflows/pr-integration-tests.yml b/.github/workflows/pr-integration-tests.yml new file mode 100644 index 00000000000..f2dc97e75da --- /dev/null +++ b/.github/workflows/pr-integration-tests.yml @@ -0,0 +1,243 @@ +name: Run Integration Tests v2 +concurrency: + group: Run-Integration-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }} + cancel-in-progress: true + +on: + merge_group: + pull_request: + branches: + - main + - 'release/**' + +env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }} + CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }} + CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }} + +jobs: + integration-tests: + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + # tag every docker image with "test" so that we can spin up the correct set + # of images during testing + + # We don't need to build the Web Docker image since it's not yet used + # in the integration tests. We have a separate action to verify that it builds + # successfully. + - name: Pull Web Docker image + run: | + docker pull danswer/danswer-web-server:latest + docker tag danswer/danswer-web-server:latest danswer/danswer-web-server:test + + # we use the runs-on cache for docker builds + # in conjunction with runs-on runners, it has better speed and unlimited caching + # https://runs-on.com/caching/s3-cache-for-github-actions/ + # https://runs-on.com/caching/docker/ + # https://github.com/moby/buildkit#s3-cache-experimental + + # images are built and run locally for testing purposes. Not pushed. 
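To make the comments above concrete: each image below is built with `push: false` and `load: true`, so it lands in the runner's local Docker engine under a `:test` tag, and the compose stack later selects those local images. A minimal sketch of the flow, mirroring values used later in this workflow:

```yaml
# Images are loaded locally as e.g. danswer/danswer-backend:test (never pushed);
# docker compose then picks them up via the IMAGE_TAG variable:
#   IMAGE_TAG=test docker compose -f docker-compose.dev.yml -p danswer-stack up -d
```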
+ - name: Build Backend Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./backend + file: ./backend/Dockerfile + platforms: linux/amd64 + tags: danswer/danswer-backend:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + - name: Build Model Server Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./backend + file: ./backend/Dockerfile.model_server + platforms: linux/amd64 + tags: danswer/danswer-model-server:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + - name: Build integration test Docker image + uses: ./.github/actions/custom-build-and-push + with: + context: ./backend + file: ./backend/tests/integration/Dockerfile + platforms: linux/amd64 + tags: danswer/danswer-integration:test + push: false + load: true + cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }} + cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max + + # Start containers for multi-tenant tests + - name: Start Docker containers for multi-tenant tests + run: | + cd deployment/docker_compose + ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \ + MULTI_TENANT=true \ + AUTH_TYPE=basic \ + REQUIRE_EMAIL_VERIFICATION=false \ + DISABLE_TELEMETRY=true \ + IMAGE_TAG=test \ + docker compose -f docker-compose.dev.yml -p danswer-stack up -d + id: start_docker_multi_tenant + + # In practice, `cloud` Auth type would require OAUTH credentials to be set. + - name: Run Multi-Tenant Integration Tests + run: | + echo "Running multi-tenant integration tests..." + docker run --rm --network danswer-stack_default \ + --name test-runner \ + -e POSTGRES_HOST=relational_db \ + -e POSTGRES_USER=postgres \ + -e POSTGRES_PASSWORD=password \ + -e POSTGRES_DB=postgres \ + -e VESPA_HOST=index \ + -e REDIS_HOST=cache \ + -e API_SERVER_HOST=api_server \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \ + -e TEST_WEB_HOSTNAME=test-runner \ + -e AUTH_TYPE=cloud \ + -e MULTI_TENANT=true \ + danswer/danswer-integration:test \ + /app/tests/integration/multitenant_tests + continue-on-error: true + id: run_multitenant_tests + + - name: Check multi-tenant test results + run: | + if [ ${{ steps.run_multitenant_tests.outcome }} == 'failure' ]; then + echo "Multi-tenant integration tests failed. Exiting with error." + exit 1 + else + echo "All multi-tenant integration tests passed successfully."
+ fi + + - name: Stop multi-tenant Docker containers + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack down -v + + + - name: Start Docker containers + run: | + cd deployment/docker_compose + ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \ + AUTH_TYPE=basic \ + REQUIRE_EMAIL_VERIFICATION=false \ + DISABLE_TELEMETRY=true \ + IMAGE_TAG=test \ + docker compose -f docker-compose.dev.yml -p danswer-stack up -d + id: start_docker + + - name: Wait for service to be ready + run: | + echo "Starting wait-for-service script..." + + docker logs -f danswer-stack-api_server-1 & + + start_time=$(date +%s) + timeout=300 # 5 minutes in seconds + + while true; do + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + + if [ $elapsed_time -ge $timeout ]; then + echo "Timeout reached. Service did not become ready in 5 minutes." + exit 1 + fi + + # Use curl with error handling to ignore specific exit code 56 + response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error") + + if [ "$response" = "200" ]; then + echo "Service is ready!" + break + elif [ "$response" = "curl_error" ]; then + echo "Curl encountered an error, possibly exit code 56. Continuing to retry..." + else + echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..." + fi + + sleep 5 + done + echo "Finished waiting for service." + + - name: Run Standard Integration Tests + run: | + echo "Running integration tests..." + docker run --rm --network danswer-stack_default \ + --name test-runner \ + -e POSTGRES_HOST=relational_db \ + -e POSTGRES_USER=postgres \ + -e POSTGRES_PASSWORD=password \ + -e POSTGRES_DB=postgres \ + -e VESPA_HOST=index \ + -e REDIS_HOST=cache \ + -e API_SERVER_HOST=api_server \ + -e OPENAI_API_KEY=${OPENAI_API_KEY} \ + -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \ + -e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \ + -e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \ + -e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \ + -e TEST_WEB_HOSTNAME=test-runner \ + danswer/danswer-integration:test \ + /app/tests/integration/tests \ + /app/tests/integration/connector_job_tests + continue-on-error: true + id: run_tests + + - name: Check test results + run: | + if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then + echo "Integration tests failed. Exiting with error." + exit 1 + else + echo "All integration tests passed successfully." 
+ fi + + # save before stopping the containers so the logs can be captured + - name: Save Docker logs + if: success() || failure() + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log + mv docker-compose.log ${{ github.workspace }}/docker-compose.log + + - name: Stop Docker containers + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack down -v + + - name: Upload logs + if: success() || failure() + uses: actions/upload-artifact@v4 + with: + name: docker-logs + path: ${{ github.workspace }}/docker-compose.log + + - name: Stop Docker containers + run: | + cd deployment/docker_compose + docker compose -f docker-compose.dev.yml -p danswer-stack down -v diff --git a/.github/workflows/pr-python-checks.yml b/.github/workflows/pr-python-checks.yml index 9cc624fa073..db16848bd2f 100644 --- a/.github/workflows/pr-python-checks.yml +++ b/.github/workflows/pr-python-checks.yml @@ -3,18 +3,21 @@ name: Python Checks on: merge_group: pull_request: - branches: [ main ] + branches: + - main + - 'release/**' jobs: mypy-check: - runs-on: ubuntu-latest + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' cache: 'pip' @@ -24,9 +27,9 @@ jobs: backend/requirements/model_server.txt - run: | python -m pip install --upgrade pip - pip install -r backend/requirements/default.txt - pip install -r backend/requirements/dev.txt - pip install -r backend/requirements/model_server.txt + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt + pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt - name: Run MyPy run: | diff --git a/.github/workflows/pr-python-connector-tests.yml b/.github/workflows/pr-python-connector-tests.yml index 00b92c9b003..6e122860ee9 100644 --- a/.github/workflows/pr-python-connector-tests.yml +++ b/.github/workflows/pr-python-connector-tests.yml @@ -15,10 +15,20 @@ env: CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }} CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }} CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }} + # Jira + JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} + JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} + # Google + GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }} + GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }} + GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }} + GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }} + GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }} jobs: connectors-check: - runs-on: ubuntu-latest + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] env: PYTHONPATH: ./backend @@ -28,7 +38,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" cache: "pip" @@ -39,8 +49,8 @@ jobs: - name: Install Dependencies run: | python -m pip install 
--upgrade pip - pip install -r backend/requirements/default.txt - pip install -r backend/requirements/dev.txt + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt - name: Run Tests shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}" diff --git a/.github/workflows/pr-python-model-tests.yml b/.github/workflows/pr-python-model-tests.yml new file mode 100644 index 00000000000..a070eea27a7 --- /dev/null +++ b/.github/workflows/pr-python-model-tests.yml @@ -0,0 +1,58 @@ +name: Connector Tests + +on: + schedule: + # This cron expression runs the job daily at 16:00 UTC (9am PT) + - cron: "0 16 * * *" + +env: + # Bedrock + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION_NAME: ${{ secrets.AWS_REGION_NAME }} + + # OpenAI + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + +jobs: + model-check: + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] + + env: + PYTHONPATH: ./backend + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + cache-dependency-path: | + backend/requirements/default.txt + backend/requirements/dev.txt + + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt + + - name: Run Tests + shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}" + run: | + py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/llm + py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/embedding + + - name: Alert on Failure + if: failure() && github.event_name == 'schedule' + env: + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + run: | + curl -X POST \ + -H 'Content-type: application/json' \ + --data '{"text":"Scheduled Model Tests failed! 
Check the run at: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' \ + $SLACK_WEBHOOK diff --git a/.github/workflows/pr-python-tests.yml b/.github/workflows/pr-python-tests.yml index 1acdbc5ddb1..5637300615b 100644 --- a/.github/workflows/pr-python-tests.yml +++ b/.github/workflows/pr-python-tests.yml @@ -3,11 +3,14 @@ name: Python Unit Tests on: merge_group: pull_request: - branches: [ main ] + branches: + - main + - 'release/**' jobs: backend-check: - runs-on: ubuntu-latest + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] env: PYTHONPATH: ./backend @@ -18,7 +21,7 @@ jobs: uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.11' cache: 'pip' @@ -29,8 +32,8 @@ jobs: - name: Install Dependencies run: | python -m pip install --upgrade pip - pip install -r backend/requirements/default.txt - pip install -r backend/requirements/dev.txt + pip install --retries 5 --timeout 30 -r backend/requirements/default.txt + pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt - name: Run Tests shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}" diff --git a/.github/workflows/pr-quality-checks.yml b/.github/workflows/pr-quality-checks.yml index 8a42541ea5d..3ba206669a6 100644 --- a/.github/workflows/pr-quality-checks.yml +++ b/.github/workflows/pr-quality-checks.yml @@ -1,6 +1,6 @@ name: Quality Checks PR concurrency: - group: Quality-Checks-PR-${{ github.head_ref }} + group: Quality-Checks-PR-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }} cancel-in-progress: true on: @@ -9,7 +9,8 @@ on: jobs: quality-checks: - runs-on: ubuntu-latest + # See https://runs-on.com/runners/linux/ + runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] steps: - uses: actions/checkout@v4 with: @@ -17,6 +18,6 @@ jobs: - uses: actions/setup-python@v5 with: python-version: "3.11" - - uses: pre-commit/action@v3.0.0 + - uses: pre-commit/action@v3.0.1 with: extra_args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || '' }} diff --git a/.github/workflows/run-it.yml b/.github/workflows/run-it.yml deleted file mode 100644 index 75df647c462..00000000000 --- a/.github/workflows/run-it.yml +++ /dev/null @@ -1,162 +0,0 @@ -name: Run Integration Tests -concurrency: - group: Run-Integration-Tests-${{ github.head_ref }} - cancel-in-progress: true - -on: - merge_group: - pull_request: - branches: [ main ] - -env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - -jobs: - integration-tests: - runs-on: - group: 'arm64-image-builders' - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - # NOTE: we don't need to build the Web Docker image since it's not used - # during the IT for now. 
We have a separate action to verify it builds - # succesfully - - name: Pull Web Docker image - run: | - docker pull danswer/danswer-web-server:latest - docker tag danswer/danswer-web-server:latest danswer/danswer-web-server:it - - - name: Build Backend Docker image - uses: ./.github/actions/custom-build-and-push - with: - context: ./backend - file: ./backend/Dockerfile - platforms: linux/arm64 - tags: danswer/danswer-backend:it - cache-from: type=registry,ref=danswer/danswer-backend:it - cache-to: | - type=registry,ref=danswer/danswer-backend:it,mode=max - type=inline - - - name: Build Model Server Docker image - uses: ./.github/actions/custom-build-and-push - with: - context: ./backend - file: ./backend/Dockerfile.model_server - platforms: linux/arm64 - tags: danswer/danswer-model-server:it - cache-from: type=registry,ref=danswer/danswer-model-server:it - cache-to: | - type=registry,ref=danswer/danswer-model-server:it,mode=max - type=inline - - - name: Build integration test Docker image - uses: ./.github/actions/custom-build-and-push - with: - context: ./backend - file: ./backend/tests/integration/Dockerfile - platforms: linux/arm64 - tags: danswer/integration-test-runner:it - cache-from: type=registry,ref=danswer/integration-test-runner:it - cache-to: | - type=registry,ref=danswer/integration-test-runner:it,mode=max - type=inline - - - name: Start Docker containers - run: | - cd deployment/docker_compose - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \ - AUTH_TYPE=basic \ - REQUIRE_EMAIL_VERIFICATION=false \ - DISABLE_TELEMETRY=true \ - IMAGE_TAG=it \ - docker compose -f docker-compose.dev.yml -p danswer-stack up -d - id: start_docker - - - name: Wait for service to be ready - run: | - echo "Starting wait-for-service script..." - - docker logs -f danswer-stack-api_server-1 & - - start_time=$(date +%s) - timeout=300 # 5 minutes in seconds - - while true; do - current_time=$(date +%s) - elapsed_time=$((current_time - start_time)) - - if [ $elapsed_time -ge $timeout ]; then - echo "Timeout reached. Service did not become ready in 5 minutes." - exit 1 - fi - - # Use curl with error handling to ignore specific exit code 56 - response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error") - - if [ "$response" = "200" ]; then - echo "Service is ready!" - break - elif [ "$response" = "curl_error" ]; then - echo "Curl encountered an error, possibly exit code 56. Continuing to retry..." - else - echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..." - fi - - sleep 5 - done - echo "Finished waiting for service." - - - name: Run integration tests - run: | - echo "Running integration tests..." - docker run --rm --network danswer-stack_default \ - -e POSTGRES_HOST=relational_db \ - -e POSTGRES_USER=postgres \ - -e POSTGRES_PASSWORD=password \ - -e POSTGRES_DB=postgres \ - -e VESPA_HOST=index \ - -e REDIS_HOST=cache \ - -e API_SERVER_HOST=api_server \ - -e OPENAI_API_KEY=${OPENAI_API_KEY} \ - danswer/integration-test-runner:it - continue-on-error: true - id: run_tests - - - name: Check test results - run: | - if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then - echo "Integration tests failed. Exiting with error." - exit 1 - else - echo "All integration tests passed successfully." 
- fi - - - name: Save Docker logs - if: success() || failure() - run: | - cd deployment/docker_compose - docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log - mv docker-compose.log ${{ github.workspace }}/docker-compose.log - - - name: Upload logs - if: success() || failure() - uses: actions/upload-artifact@v3 - with: - name: docker-logs - path: ${{ github.workspace }}/docker-compose.log - - - name: Stop Docker containers - run: | - cd deployment/docker_compose - docker compose -f docker-compose.dev.yml -p danswer-stack down -v diff --git a/.github/workflows/tag-nightly.yml b/.github/workflows/tag-nightly.yml new file mode 100644 index 00000000000..50bb20808a3 --- /dev/null +++ b/.github/workflows/tag-nightly.yml @@ -0,0 +1,54 @@ +name: Nightly Tag Push + +on: + schedule: + - cron: '0 10 * * *' # Runs every day at 2 AM PST / 3 AM PDT / 10 AM UTC + +permissions: + contents: write # Allows pushing tags to the repository + +jobs: + create-and-push-tag: + runs-on: [runs-on,runner=2cpu-linux-x64,"run-id=${{ github.run_id }}"] + + steps: + # actions using GITHUB_TOKEN cannot trigger another workflow, but we do want this to trigger docker pushes + # see https://github.com/orgs/community/discussions/27028#discussioncomment-3254367 for the workaround we + # implement here which needs an actual user's deploy key + - name: Checkout code + uses: actions/checkout@v4 + with: + ssh-key: "${{ secrets.RKUO_DEPLOY_KEY }}" + + - name: Set up Git user + run: | + git config user.name "Richard Kuo [bot]" + git config user.email "rkuo[bot]@danswer.ai" + + - name: Check for existing nightly tag + id: check_tag + run: | + if git tag --points-at HEAD --list "nightly-latest*" | grep -q .; then + echo "A tag starting with 'nightly-latest' already exists on HEAD." + echo "tag_exists=true" >> $GITHUB_OUTPUT + else + echo "No tag starting with 'nightly-latest' exists on HEAD." 
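+            # record the result as a step output so the tag/push steps below can skip when a nightly tag already exists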
+ echo "tag_exists=false" >> $GITHUB_OUTPUT + fi + + # don't tag again if HEAD already has a nightly-latest tag on it + - name: Create Nightly Tag + if: steps.check_tag.outputs.tag_exists == 'false' + env: + DATE: ${{ github.run_id }} + run: | + TAG_NAME="nightly-latest-$(date +'%Y%m%d')" + echo "Creating tag: $TAG_NAME" + git tag $TAG_NAME + + - name: Push Tag + if: steps.check_tag.outputs.tag_exists == 'false' + run: | + TAG_NAME="nightly-latest-$(date +'%Y%m%d')" + git push origin $TAG_NAME + \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1d7e04272ad..745553495b2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ /backend/tests/regression/answer_quality/search_test_config.yaml env.sh .cursorrules -danswer_checkpoint/ \ No newline at end of file +danswer_checkpoint/ +/web/test-results/ diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 00000000000..0164457a03e --- /dev/null +++ b/.prettierignore @@ -0,0 +1 @@ +backend/tests/integration/tests/pruning/website diff --git a/.vscode/launch.template.jsonc b/.vscode/launch.template.jsonc index c733800981c..1f1faed097d 100644 --- a/.vscode/launch.template.jsonc +++ b/.vscode/launch.template.jsonc @@ -6,19 +6,69 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "compounds": [ + { + // Dummy entry used to label the group + "name": "--- Compound ---", + "configurations": [ + "--- Individual ---" + ], + "presentation": { + "group": "1", + } + }, { "name": "Run All Danswer Services", "configurations": [ "Web Server", "Model Server", "API Server", - "Indexing", - "Background Jobs", - "Slack Bot" - ] - } + "Slack Bot", + "Celery primary", + "Celery light", + "Celery heavy", + "Celery indexing", + "Celery beat", + ], + "presentation": { + "group": "1", + } + }, + { + "name": "Web / Model / API", + "configurations": [ + "Web Server", + "Model Server", + "API Server", + ], + "presentation": { + "group": "1", + } + }, + { + "name": "Celery (all)", + "configurations": [ + "Celery primary", + "Celery light", + "Celery heavy", + "Celery indexing", + "Celery beat" + ], + "presentation": { + "group": "1", + } + } ], "configurations": [ + { + // Dummy entry used to label the group + "name": "--- Individual ---", + "type": "node", + "request": "launch", + "presentation": { + "group": "2", + "order": 0 + } + }, { "name": "Web Server", "type": "node", @@ -29,7 +79,11 @@ "runtimeArgs": [ "run", "dev" ], - "console": "integratedTerminal" + "presentation": { + "group": "2", + }, + "console": "integratedTerminal", + "consoleTitle": "Web Server Console" }, { "name": "Model Server", @@ -48,7 +102,11 @@ "--reload", "--port", "9000" - ] + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Model Server Console" }, { "name": "API Server", @@ -68,57 +126,171 @@ "--reload", "--port", "8080" - ] + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "API Server Console" }, + // For the listener to access the Slack API, + // DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project { - "name": "Indexing", - "consoleName": "Indexing", + "name": "Slack Bot", + "consoleName": "Slack Bot", "type": "debugpy", "request": "launch", - "program": "danswer/background/update.py", + "program": "danswer/danswerbot/slack/listener.py", "cwd": "${workspaceFolder}/backend", "envFile": "${workspaceFolder}/.vscode/.env", "env": { - "ENABLE_MULTIPASS_INDEXING": "false", "LOG_LEVEL": "DEBUG", "PYTHONUNBUFFERED": 
"1", "PYTHONPATH": "." - } + }, + "presentation": { + "group": "2", + }, + "consoleTitle": "Slack Bot Console" }, - // Celery and all async jobs, usually would include indexing as well but this is handled separately above for dev { - "name": "Background Jobs", - "consoleName": "Background Jobs", + "name": "Celery primary", "type": "debugpy", "request": "launch", - "program": "scripts/dev_run_background_jobs.py", + "module": "celery", "cwd": "${workspaceFolder}/backend", "envFile": "${workspaceFolder}/.vscode/.env", "env": { - "LOG_DANSWER_MODEL_INTERACTIONS": "True", + "LOG_LEVEL": "INFO", + "PYTHONUNBUFFERED": "1", + "PYTHONPATH": "." + }, + "args": [ + "-A", + "danswer.background.celery.versioned_apps.primary", + "worker", + "--pool=threads", + "--concurrency=4", + "--prefetch-multiplier=1", + "--loglevel=INFO", + "--hostname=primary@%n", + "-Q", + "celery", + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Celery primary Console" + }, + { + "name": "Celery light", + "type": "debugpy", + "request": "launch", + "module": "celery", + "cwd": "${workspaceFolder}/backend", + "envFile": "${workspaceFolder}/.vscode/.env", + "env": { + "LOG_LEVEL": "INFO", + "PYTHONUNBUFFERED": "1", + "PYTHONPATH": "." + }, + "args": [ + "-A", + "danswer.background.celery.versioned_apps.light", + "worker", + "--pool=threads", + "--concurrency=64", + "--prefetch-multiplier=8", + "--loglevel=INFO", + "--hostname=light@%n", + "-Q", + "vespa_metadata_sync,connector_deletion,doc_permissions_upsert", + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Celery light Console" + }, + { + "name": "Celery heavy", + "type": "debugpy", + "request": "launch", + "module": "celery", + "cwd": "${workspaceFolder}/backend", + "envFile": "${workspaceFolder}/.vscode/.env", + "env": { + "LOG_LEVEL": "INFO", + "PYTHONUNBUFFERED": "1", + "PYTHONPATH": "." + }, + "args": [ + "-A", + "danswer.background.celery.versioned_apps.heavy", + "worker", + "--pool=threads", + "--concurrency=4", + "--prefetch-multiplier=1", + "--loglevel=INFO", + "--hostname=heavy@%n", + "-Q", + "connector_pruning,connector_doc_permissions_sync,connector_external_group_sync", + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Celery heavy Console" + }, + { + "name": "Celery indexing", + "type": "debugpy", + "request": "launch", + "module": "celery", + "cwd": "${workspaceFolder}/backend", + "envFile": "${workspaceFolder}/.vscode/.env", + "env": { + "ENABLE_MULTIPASS_INDEXING": "false", "LOG_LEVEL": "DEBUG", "PYTHONUNBUFFERED": "1", "PYTHONPATH": "." }, "args": [ - "--no-indexing" - ] + "-A", + "danswer.background.celery.versioned_apps.indexing", + "worker", + "--pool=threads", + "--concurrency=1", + "--prefetch-multiplier=1", + "--loglevel=INFO", + "--hostname=indexing@%n", + "-Q", + "connector_indexing", + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Celery indexing Console" }, - // For the listner to access the Slack API, - // DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project { - "name": "Slack Bot", - "consoleName": "Slack Bot", + "name": "Celery beat", "type": "debugpy", "request": "launch", - "program": "danswer/danswerbot/slack/listener.py", + "module": "celery", "cwd": "${workspaceFolder}/backend", "envFile": "${workspaceFolder}/.vscode/.env", "env": { "LOG_LEVEL": "DEBUG", "PYTHONUNBUFFERED": "1", "PYTHONPATH": "." 
- } + }, + "args": [ + "-A", + "danswer.background.celery.versioned_apps.beat", + "beat", + "--loglevel=INFO", + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Celery beat Console" }, { "name": "Pytest", @@ -137,8 +309,22 @@ "-v" // Specify a sepcific module/test to run or provide nothing to run all tests //"tests/unit/danswer/llm/answering/test_prune_and_merge.py" - ] + ], + "presentation": { + "group": "2", + }, + "consoleTitle": "Pytest Console" }, + { + // Dummy entry used to label the group + "name": "--- Tasks ---", + "type": "node", + "request": "launch", + "presentation": { + "group": "3", + "order": 0 + } + }, { "name": "Clear and Restart External Volumes and Containers", "type": "node", @@ -147,7 +333,27 @@ "runtimeArgs": ["${workspaceFolder}/backend/scripts/restart_containers.sh"], "cwd": "${workspaceFolder}", "console": "integratedTerminal", - "stopOnEntry": true - } + "stopOnEntry": true, + "presentation": { + "group": "3", + }, + }, + { + // Celery jobs launched through a single background script (legacy) + // Recommend using the "Celery (all)" compound launch instead. + "name": "Background Jobs", + "consoleName": "Background Jobs", + "type": "debugpy", + "request": "launch", + "program": "scripts/dev_run_background_jobs.py", + "cwd": "${workspaceFolder}/backend", + "envFile": "${workspaceFolder}/.vscode/.env", + "env": { + "LOG_DANSWER_MODEL_INTERACTIONS": "True", + "LOG_LEVEL": "DEBUG", + "PYTHONUNBUFFERED": "1", + "PYTHONPATH": "." + }, + }, ] } diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3e4415188a1..736c482252f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,7 +22,7 @@ Your input is vital to making sure that Danswer moves in the right direction. Before starting on implementation, please raise a GitHub issue. And always feel free to message us (Chris Weaver / Yuhong Sun) on -[Slack](https://join.slack.com/t/danswer/shared_invite/zt-2afut44lv-Rw3kSWu6_OmdAXRpCv80DQ) / +[Slack](https://join.slack.com/t/danswer/shared_invite/zt-2lcmqw703-071hBuZBfNEOGUsLa5PXvQ) / [Discord](https://discord.gg/TDJ59cGV2X) directly about anything at all. @@ -32,7 +32,7 @@ To contribute to this project, please follow the When opening a pull request, mention related issues and feel free to tag relevant maintainers. Before creating a pull request please make sure that the new changes conform to the formatting and linting requirements. -See the [Formatting and Linting](#-formatting-and-linting) section for how to run these checks locally. +See the [Formatting and Linting](#formatting-and-linting) section for how to run these checks locally. ### Getting Help 🙋 diff --git a/README.md b/README.md index aff3cd57d5a..1f9fbef5b2f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ +

@@ -11,7 +12,7 @@ Documentation - + Slack @@ -68,13 +69,13 @@ We also have built-in support for deployment on Kubernetes. Files for that can b ## 🚧 Roadmap * Chat/Prompt sharing with specific teammates and user groups. -* Multi-Model model support, chat with images, video etc. +* Multimodal model support, chat with images, video etc. * Choosing between LLMs and parameters during chat session. * Tool calling and agent configurations options. * Organizational understanding and ability to locate and suggest experts from your team. -## Other Noteable Benefits of Danswer +## Other Notable Benefits of Danswer * User Authentication with document level access management. * Best in class Hybrid Search across all sources (BM-25 + prefix aware embedding models). * Admin Dashboard to configure connectors, document-sets, access, etc. @@ -127,3 +128,19 @@ To try the Danswer Enterprise Edition: ## 💡 Contributing Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details. + +## ⭐Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=danswer-ai/danswer&type=Date)](https://star-history.com/#danswer-ai/danswer&Date) + +## ✨Contributors + + + contributors + + +

+ + ↑ Back to Top ↑ + +
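The backend diffs below rework alembic.ini and alembic/env.py so migrations can target a specific tenant schema (or all tenant schemas) via Alembic's `-x` arguments. As a minimal, self-contained sketch of that option parsing only, using a hypothetical `parse_schema_options` helper in place of env.py's `get_schema_options()` (the real function reads `context.get_x_argument()` and `POSTGRES_DEFAULT_SCHEMA` instead of plain arguments):

# Minimal sketch (not the shipped implementation) of how the new alembic/env.py
# interprets "-x" options; parse_schema_options and the "public" default are
# stand-ins for get_schema_options() and POSTGRES_DEFAULT_SCHEMA.
def parse_schema_options(
    x_args_raw: list[str], default_schema: str = "public"
) -> tuple[str, bool, bool]:
    # Options arrive as comma-separated key=value pairs, e.g. from
    # `alembic -x schema=tenant_abc,create_schema=false upgrade head`.
    x_args: dict[str, str] = {}
    for arg in x_args_raw:
        for pair in arg.split(","):
            if "=" in pair:
                key, value = pair.split("=", 1)
                x_args[key.strip()] = value.strip()
    schema_name = x_args.get("schema", default_schema)
    create_schema = x_args.get("create_schema", "true").lower() == "true"
    upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"
    return schema_name, create_schema, upgrade_all_tenants


if __name__ == "__main__":
    print(parse_schema_options(["schema=tenant_abc", "create_schema=false"]))
    # -> ('tenant_abc', False, False)

When multi-tenancy is enabled, the env.py changes below additionally refuse to run against the default public schema unless `upgrade_all_tenants=true` is passed.
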

diff --git a/backend/danswer/dynamic_configs/__init__.py b/backend/DELETE similarity index 100% rename from backend/danswer/dynamic_configs/__init__.py rename to backend/DELETE diff --git a/backend/Dockerfile b/backend/Dockerfile index c0432f4fe0b..ac203e398c6 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -8,10 +8,11 @@ Edition features outside of personal development or testing purposes. Please rea founders@danswer.ai for more information. Please visit https://github.com/danswer-ai/danswer" # Default DANSWER_VERSION, typically overriden during builds by GitHub Actions. -ARG DANSWER_VERSION=0.3-dev +ARG DANSWER_VERSION=0.8-dev ENV DANSWER_VERSION=${DANSWER_VERSION} \ DANSWER_RUNNING_IN_DOCKER="true" + RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}" # Install system dependencies # cmake needed for psycopg (postgres) @@ -36,6 +37,8 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* && \ apt-get clean + + # Install Python dependencies # Remove py which is pulled in by retry, py is not needed and is a CVE COPY ./requirements/default.txt /tmp/requirements.txt @@ -74,7 +77,6 @@ RUN apt-get update && \ RUN python -c "from tokenizers import Tokenizer; \ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')" - # Pre-downloading NLTK for setups with limited egress RUN python -c "import nltk; \ nltk.download('stopwords', quiet=True); \ @@ -101,6 +103,7 @@ COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf COPY ./danswer /app/danswer COPY ./shared_configs /app/shared_configs COPY ./alembic /app/alembic +COPY ./alembic_tenants /app/alembic_tenants COPY ./alembic.ini /app/alembic.ini COPY supervisord.conf /usr/etc/supervisord.conf @@ -110,7 +113,7 @@ COPY ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connect # Put logo in assets COPY ./assets /app/assets -ENV PYTHONPATH /app +ENV PYTHONPATH=/app # Default command which does nothing # This container is used by api server and background which specify their own CMD diff --git a/backend/Dockerfile.model_server b/backend/Dockerfile.model_server index 16e187f40c7..8b7f12e1307 100644 --- a/backend/Dockerfile.model_server +++ b/backend/Dockerfile.model_server @@ -7,7 +7,7 @@ You can find it at https://hub.docker.com/r/danswer/danswer-model-server. For mo visit https://github.com/danswer-ai/danswer." # Default DANSWER_VERSION, typically overriden during builds by GitHub Actions. -ARG DANSWER_VERSION=0.3-dev +ARG DANSWER_VERSION=0.8-dev ENV DANSWER_VERSION=${DANSWER_VERSION} \ DANSWER_RUNNING_IN_DOCKER="true" @@ -59,6 +59,6 @@ COPY ./shared_configs /app/shared_configs # Model Server main code COPY ./model_server /app/model_server -ENV PYTHONPATH /app +ENV PYTHONPATH=/app CMD ["uvicorn", "model_server.main:app", "--host", "0.0.0.0", "--port", "9000"] diff --git a/backend/alembic.ini b/backend/alembic.ini index 10ae5cfdd27..599c46fadd7 100644 --- a/backend/alembic.ini +++ b/backend/alembic.ini @@ -1,6 +1,6 @@ # A generic, single database configuration. -[alembic] +[DEFAULT] # path to migration scripts script_location = alembic @@ -47,7 +47,8 @@ prepend_sys_path = . # version_path_separator = : # version_path_separator = ; # version_path_separator = space -version_path_separator = os # Use os.pathsep. Default configuration used for new projects. +version_path_separator = os +# Use os.pathsep. Default configuration used for new projects. 
# set to 'true' to search source files recursively # in each "version_locations" directory @@ -106,3 +107,12 @@ formatter = generic [formatter_generic] format = %(levelname)-5.5s [%(name)s] %(message)s datefmt = %H:%M:%S + + +[alembic] +script_location = alembic +version_locations = %(script_location)s/versions + +[schema_private] +script_location = alembic_tenants +version_locations = %(script_location)s/versions diff --git a/backend/alembic/env.py b/backend/alembic/env.py index 154d6ff3d66..019ea94b836 100644 --- a/backend/alembic/env.py +++ b/backend/alembic/env.py @@ -1,107 +1,203 @@ +from sqlalchemy.engine.base import Connection +from typing import Any import asyncio from logging.config import fileConfig +import logging from alembic import context -from danswer.db.engine import build_connection_string -from danswer.db.models import Base from sqlalchemy import pool -from sqlalchemy.engine import Connection from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy.sql import text + +from shared_configs.configs import MULTI_TENANT +from danswer.db.engine import build_connection_string +from danswer.db.models import Base from celery.backends.database.session import ResultModelBase # type: ignore -from sqlalchemy.schema import SchemaItem +from danswer.db.engine import get_all_tenant_ids +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA -# this is the Alembic Config object, which provides -# access to the values within the .ini file in use. +# Alembic Config object config = context.config # Interpret the config file for Python logging. -# This line sets up loggers basically. if config.config_file_name is not None and config.attributes.get( "configure_logger", True ): fileConfig(config.config_file_name) -# add your model's MetaData object here -# for 'autogenerate' support -# from myapp import mymodel -# target_metadata = mymodel.Base.metadata +# Add your model's MetaData object here for 'autogenerate' support target_metadata = [Base.metadata, ResultModelBase.metadata] -# other values from the config, defined by the needs of env.py, -# can be acquired: -# my_important_option = config.get_main_option("my_important_option") -# ... etc. - EXCLUDE_TABLES = {"kombu_queue", "kombu_message"} +# Set up logging +logger = logging.getLogger(__name__) + def include_object( - object: SchemaItem, - name: str, - type_: str, - reflected: bool, - compare_to: SchemaItem | None, + object: Any, name: str, type_: str, reflected: bool, compare_to: Any ) -> bool: + """ + Determines whether a database object should be included in migrations. + Excludes specified tables from migrations. + """ if type_ == "table" and name in EXCLUDE_TABLES: return False return True -def run_migrations_offline() -> None: - """Run migrations in 'offline' mode. - - This configures the context with just a URL - and not an Engine, though an Engine is acceptable - here as well. By skipping the Engine creation - we don't even need a DBAPI to be available. - - Calls to context.execute() here emit the given string to the - script output. - +def get_schema_options() -> tuple[str, bool, bool]: """ - url = build_connection_string() - context.configure( - url=url, - target_metadata=target_metadata, # type: ignore - literal_binds=True, - dialect_opts={"paramstyle": "named"}, - ) + Parses command-line options passed via '-x' in Alembic commands. + Recognizes 'schema', 'create_schema', and 'upgrade_all_tenants' options. 
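+    Returns a (schema_name, create_schema, upgrade_all_tenants) tuple.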
+ """ + x_args_raw = context.get_x_argument() + x_args = {} + for arg in x_args_raw: + for pair in arg.split(","): + if "=" in pair: + key, value = pair.split("=", 1) + x_args[key.strip()] = value.strip() + schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA) + create_schema = x_args.get("create_schema", "true").lower() == "true" + upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true" + + if ( + MULTI_TENANT + and schema_name == POSTGRES_DEFAULT_SCHEMA + and not upgrade_all_tenants + ): + raise ValueError( + "Cannot run default migrations in public schema when multi-tenancy is enabled. " + "Please specify a tenant-specific schema." + ) + + return schema_name, create_schema, upgrade_all_tenants + + +def do_run_migrations( + connection: Connection, schema_name: str, create_schema: bool +) -> None: + """ + Executes migrations in the specified schema. + """ + logger.info(f"About to migrate schema: {schema_name}") - with context.begin_transaction(): - context.run_migrations() + if create_schema: + connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{schema_name}"')) + connection.execute(text("COMMIT")) + # Set search_path to the target schema + connection.execute(text(f'SET search_path TO "{schema_name}"')) -def do_run_migrations(connection: Connection) -> None: context.configure( connection=connection, target_metadata=target_metadata, # type: ignore include_object=include_object, - ) # type: ignore + version_table_schema=schema_name, + include_schemas=True, + compare_type=True, + compare_server_default=True, + script_location=config.get_main_option("script_location"), + ) with context.begin_transaction(): context.run_migrations() async def run_async_migrations() -> None: - """In this scenario we need to create an Engine - and associate a connection with the context. - """ + Determines whether to run migrations for a single schema or all schemas, + and executes migrations accordingly. + """ + schema_name, create_schema, upgrade_all_tenants = get_schema_options() - connectable = create_async_engine( + engine = create_async_engine( build_connection_string(), poolclass=pool.NullPool, ) - async with connectable.connect() as connection: - await connection.run_sync(do_run_migrations) + if upgrade_all_tenants: + # Run migrations for all tenant schemas sequentially + tenant_schemas = get_all_tenant_ids() + + for schema in tenant_schemas: + try: + logger.info(f"Migrating schema: {schema}") + async with engine.connect() as connection: + await connection.run_sync( + do_run_migrations, + schema_name=schema, + create_schema=create_schema, + ) + except Exception as e: + logger.error(f"Error migrating schema {schema}: {e}") + raise + else: + try: + logger.info(f"Migrating schema: {schema_name}") + async with engine.connect() as connection: + await connection.run_sync( + do_run_migrations, + schema_name=schema_name, + create_schema=create_schema, + ) + except Exception as e: + logger.error(f"Error migrating schema {schema_name}: {e}") + raise + + await engine.dispose() - await connectable.dispose() +def run_migrations_offline() -> None: + """ + Run migrations in 'offline' mode. 
+ """ + schema_name, _, upgrade_all_tenants = get_schema_options() + url = build_connection_string() -def run_migrations_online() -> None: - """Run migrations in 'online' mode.""" + if upgrade_all_tenants: + # Run offline migrations for all tenant schemas + engine = create_async_engine(url) + tenant_schemas = get_all_tenant_ids() + engine.sync_engine.dispose() + + for schema in tenant_schemas: + logger.info(f"Migrating schema: {schema}") + context.configure( + url=url, + target_metadata=target_metadata, # type: ignore + literal_binds=True, + include_object=include_object, + version_table_schema=schema, + include_schemas=True, + script_location=config.get_main_option("script_location"), + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + else: + logger.info(f"Migrating schema: {schema_name}") + context.configure( + url=url, + target_metadata=target_metadata, # type: ignore + literal_binds=True, + include_object=include_object, + version_table_schema=schema_name, + include_schemas=True, + script_location=config.get_main_option("script_location"), + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + +def run_migrations_online() -> None: + """ + Runs migrations in 'online' mode using an asynchronous engine. + """ asyncio.run(run_async_migrations()) diff --git a/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py b/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py new file mode 100644 index 00000000000..d622f55b2e3 --- /dev/null +++ b/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py @@ -0,0 +1,59 @@ +"""display custom llm models + +Revision ID: 177de57c21c9 +Revises: 4ee1287bd26a +Create Date: 2024-11-21 11:49:04.488677 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy import and_ + +revision = "177de57c21c9" +down_revision = "4ee1287bd26a" +branch_labels = None +depends_on = None +depends_on = None + + +def upgrade() -> None: + conn = op.get_bind() + llm_provider = sa.table( + "llm_provider", + sa.column("id", sa.Integer), + sa.column("provider", sa.String), + sa.column("model_names", postgresql.ARRAY(sa.String)), + sa.column("display_model_names", postgresql.ARRAY(sa.String)), + ) + + excluded_providers = ["openai", "bedrock", "anthropic", "azure"] + + providers_to_update = sa.select( + llm_provider.c.id, + llm_provider.c.model_names, + llm_provider.c.display_model_names, + ).where( + and_( + ~llm_provider.c.provider.in_(excluded_providers), + llm_provider.c.model_names.isnot(None), + ) + ) + + results = conn.execute(providers_to_update).fetchall() + + for provider_id, model_names, display_model_names in results: + if display_model_names is None: + display_model_names = [] + + combined_model_names = list(set(display_model_names + model_names)) + update_stmt = ( + llm_provider.update() + .where(llm_provider.c.id == provider_id) + .values(display_model_names=combined_model_names) + ) + conn.execute(update_stmt) + + +def downgrade() -> None: + pass diff --git a/backend/alembic/versions/1b10e1fda030_add_additional_data_to_notifications.py b/backend/alembic/versions/1b10e1fda030_add_additional_data_to_notifications.py new file mode 100644 index 00000000000..71c31e2c862 --- /dev/null +++ b/backend/alembic/versions/1b10e1fda030_add_additional_data_to_notifications.py @@ -0,0 +1,26 @@ +"""add additional data to notifications + +Revision ID: 1b10e1fda030 +Revises: 
6756efa39ada +Create Date: 2024-10-15 19:26:44.071259 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "1b10e1fda030" +down_revision = "6756efa39ada" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "notification", sa.Column("additional_data", postgresql.JSONB(), nullable=True) + ) + + +def downgrade() -> None: + op.drop_column("notification", "additional_data") diff --git a/backend/alembic/versions/1b8206b29c5d_add_user_delete_cascades.py b/backend/alembic/versions/1b8206b29c5d_add_user_delete_cascades.py new file mode 100644 index 00000000000..250621f74e2 --- /dev/null +++ b/backend/alembic/versions/1b8206b29c5d_add_user_delete_cascades.py @@ -0,0 +1,102 @@ +"""add_user_delete_cascades + +Revision ID: 1b8206b29c5d +Revises: 35e6853a51d5 +Create Date: 2024-09-18 11:48:59.418726 + +""" +from alembic import op + + +# revision identifiers, used by Alembic. +revision = "1b8206b29c5d" +down_revision = "35e6853a51d5" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.drop_constraint("credential_user_id_fkey", "credential", type_="foreignkey") + op.create_foreign_key( + "credential_user_id_fkey", + "credential", + "user", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + op.drop_constraint("chat_session_user_id_fkey", "chat_session", type_="foreignkey") + op.create_foreign_key( + "chat_session_user_id_fkey", + "chat_session", + "user", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + op.drop_constraint("chat_folder_user_id_fkey", "chat_folder", type_="foreignkey") + op.create_foreign_key( + "chat_folder_user_id_fkey", + "chat_folder", + "user", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + op.drop_constraint("prompt_user_id_fkey", "prompt", type_="foreignkey") + op.create_foreign_key( + "prompt_user_id_fkey", "prompt", "user", ["user_id"], ["id"], ondelete="CASCADE" + ) + + op.drop_constraint("notification_user_id_fkey", "notification", type_="foreignkey") + op.create_foreign_key( + "notification_user_id_fkey", + "notification", + "user", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + op.drop_constraint("inputprompt_user_id_fkey", "inputprompt", type_="foreignkey") + op.create_foreign_key( + "inputprompt_user_id_fkey", + "inputprompt", + "user", + ["user_id"], + ["id"], + ondelete="CASCADE", + ) + + +def downgrade() -> None: + op.drop_constraint("credential_user_id_fkey", "credential", type_="foreignkey") + op.create_foreign_key( + "credential_user_id_fkey", "credential", "user", ["user_id"], ["id"] + ) + + op.drop_constraint("chat_session_user_id_fkey", "chat_session", type_="foreignkey") + op.create_foreign_key( + "chat_session_user_id_fkey", "chat_session", "user", ["user_id"], ["id"] + ) + + op.drop_constraint("chat_folder_user_id_fkey", "chat_folder", type_="foreignkey") + op.create_foreign_key( + "chat_folder_user_id_fkey", "chat_folder", "user", ["user_id"], ["id"] + ) + + op.drop_constraint("prompt_user_id_fkey", "prompt", type_="foreignkey") + op.create_foreign_key("prompt_user_id_fkey", "prompt", "user", ["user_id"], ["id"]) + + op.drop_constraint("notification_user_id_fkey", "notification", type_="foreignkey") + op.create_foreign_key( + "notification_user_id_fkey", "notification", "user", ["user_id"], ["id"] + ) + + op.drop_constraint("inputprompt_user_id_fkey", "inputprompt", type_="foreignkey") + op.create_foreign_key( + "inputprompt_user_id_fkey", "inputprompt", "user", ["user_id"], 
["id"] + ) diff --git a/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py b/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py new file mode 100644 index 00000000000..368f74c7599 --- /dev/null +++ b/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py @@ -0,0 +1,68 @@ +"""default chosen assistants to none + +Revision ID: 26b931506ecb +Revises: 2daa494a0851 +Create Date: 2024-11-12 13:23:29.858995 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "26b931506ecb" +down_revision = "2daa494a0851" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "user", sa.Column("chosen_assistants_new", postgresql.JSONB(), nullable=True) + ) + + op.execute( + """ + UPDATE "user" + SET chosen_assistants_new = + CASE + WHEN chosen_assistants = '[-2, -1, 0]' THEN NULL + ELSE chosen_assistants + END + """ + ) + + op.drop_column("user", "chosen_assistants") + + op.alter_column( + "user", "chosen_assistants_new", new_column_name="chosen_assistants" + ) + + +def downgrade() -> None: + op.add_column( + "user", + sa.Column( + "chosen_assistants_old", + postgresql.JSONB(), + nullable=False, + server_default="[-2, -1, 0]", + ), + ) + + op.execute( + """ + UPDATE "user" + SET chosen_assistants_old = + CASE + WHEN chosen_assistants IS NULL THEN '[-2, -1, 0]'::jsonb + ELSE chosen_assistants + END + """ + ) + + op.drop_column("user", "chosen_assistants") + + op.alter_column( + "user", "chosen_assistants_old", new_column_name="chosen_assistants" + ) diff --git a/backend/alembic/versions/2daa494a0851_add_group_sync_time.py b/backend/alembic/versions/2daa494a0851_add_group_sync_time.py new file mode 100644 index 00000000000..c8a98f7693e --- /dev/null +++ b/backend/alembic/versions/2daa494a0851_add_group_sync_time.py @@ -0,0 +1,30 @@ +"""add-group-sync-time + +Revision ID: 2daa494a0851 +Revises: c0fd6e4da83a +Create Date: 2024-11-11 10:57:22.991157 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "2daa494a0851" +down_revision = "c0fd6e4da83a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "connector_credential_pair", + sa.Column( + "last_time_external_group_sync", + sa.DateTime(timezone=True), + nullable=True, + ), + ) + + +def downgrade() -> None: + op.drop_column("connector_credential_pair", "last_time_external_group_sync") diff --git a/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py b/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py new file mode 100644 index 00000000000..0cd3da444bc --- /dev/null +++ b/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py @@ -0,0 +1,50 @@ +"""single tool call per message + +Revision ID: 33cb72ea4d80 +Revises: 5b29123cd710 +Create Date: 2024-11-01 12:51:01.535003 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "33cb72ea4d80" +down_revision = "5b29123cd710" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Step 1: Delete extraneous ToolCall entries + # Keep only the ToolCall with the smallest 'id' for each 'message_id' + op.execute( + sa.text( + """ + DELETE FROM tool_call + WHERE id NOT IN ( + SELECT MIN(id) + FROM tool_call + WHERE message_id IS NOT NULL + GROUP BY message_id + ); + """ + ) + ) + + # Step 2: Add a unique constraint on message_id + op.create_unique_constraint( + constraint_name="uq_tool_call_message_id", + table_name="tool_call", + columns=["message_id"], + ) + + +def downgrade() -> None: + # Step 1: Drop the unique constraint on message_id + op.drop_constraint( + constraint_name="uq_tool_call_message_id", + table_name="tool_call", + type_="unique", + ) diff --git a/backend/alembic/versions/46b7a812670f_fix_user__external_user_group_id_fk.py b/backend/alembic/versions/46b7a812670f_fix_user__external_user_group_id_fk.py new file mode 100644 index 00000000000..437d7a97e76 --- /dev/null +++ b/backend/alembic/versions/46b7a812670f_fix_user__external_user_group_id_fk.py @@ -0,0 +1,46 @@ +"""fix_user__external_user_group_id_fk + +Revision ID: 46b7a812670f +Revises: f32615f71aeb +Create Date: 2024-09-23 12:58:03.894038 + +""" +from alembic import op + +# revision identifiers, used by Alembic. +revision = "46b7a812670f" +down_revision = "f32615f71aeb" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Drop the existing primary key + op.drop_constraint( + "user__external_user_group_id_pkey", + "user__external_user_group_id", + type_="primary", + ) + + # Add the new composite primary key + op.create_primary_key( + "user__external_user_group_id_pkey", + "user__external_user_group_id", + ["user_id", "external_user_group_id", "cc_pair_id"], + ) + + +def downgrade() -> None: + # Drop the composite primary key + op.drop_constraint( + "user__external_user_group_id_pkey", + "user__external_user_group_id", + type_="primary", + ) + # Delete all entries from the table + op.execute("DELETE FROM user__external_user_group_id") + + # Recreate the original primary key on user_id + op.create_primary_key( + "user__external_user_group_id_pkey", "user__external_user_group_id", ["user_id"] + ) diff --git a/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py b/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py new file mode 100644 index 00000000000..432e0ab42a5 --- /dev/null +++ b/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py @@ -0,0 +1,45 @@ +"""add persona categories + +Revision ID: 47e5bef3a1d7 +Revises: dfbe9e93d3c7 +Create Date: 2024-11-05 18:55:02.221064 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "47e5bef3a1d7" +down_revision = "dfbe9e93d3c7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Create the persona_category table + op.create_table( + "persona_category", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.Column("description", sa.String(), nullable=True), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("name"), + ) + + # Add category_id to persona table + op.add_column("persona", sa.Column("category_id", sa.Integer(), nullable=True)) + op.create_foreign_key( + "fk_persona_category", + "persona", + "persona_category", + ["category_id"], + ["id"], + ondelete="SET NULL", + ) + + +def downgrade() -> None: + op.drop_constraint("fk_persona_category", "persona", type_="foreignkey") + op.drop_column("persona", "category_id") + op.drop_table("persona_category") diff --git a/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py b/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py new file mode 100644 index 00000000000..e29f388beb2 --- /dev/null +++ b/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py @@ -0,0 +1,280 @@ +"""add_multiple_slack_bot_support + +Revision ID: 4ee1287bd26a +Revises: 47e5bef3a1d7 +Create Date: 2024-11-06 13:15:53.302644 + +""" +import logging +from typing import cast +from alembic import op +import sqlalchemy as sa +from sqlalchemy.orm import Session +from danswer.key_value_store.factory import get_kv_store +from danswer.db.models import SlackBot +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "4ee1287bd26a" +down_revision = "47e5bef3a1d7" +branch_labels: None = None +depends_on: None = None + +# Configure logging +logger = logging.getLogger("alembic.runtime.migration") +logger.setLevel(logging.INFO) + + +def upgrade() -> None: + logger.info(f"{revision}: create_table: slack_bot") + # Create new slack_bot table + op.create_table( + "slack_bot", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("name", sa.String(), nullable=False), + sa.Column("enabled", sa.Boolean(), nullable=False, server_default="true"), + sa.Column("bot_token", sa.LargeBinary(), nullable=False), + sa.Column("app_token", sa.LargeBinary(), nullable=False), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("bot_token"), + sa.UniqueConstraint("app_token"), + ) + + # # Create new slack_channel_config table + op.create_table( + "slack_channel_config", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("slack_bot_id", sa.Integer(), nullable=True), + sa.Column("persona_id", sa.Integer(), nullable=True), + sa.Column("channel_config", postgresql.JSONB(), nullable=False), + sa.Column("response_type", sa.String(), nullable=False), + sa.Column( + "enable_auto_filters", sa.Boolean(), nullable=False, server_default="false" + ), + sa.ForeignKeyConstraint( + ["slack_bot_id"], + ["slack_bot.id"], + ), + sa.ForeignKeyConstraint( + ["persona_id"], + ["persona.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + + # Handle existing Slack bot tokens first + logger.info(f"{revision}: Checking for existing Slack bot.") + bot_token = None + app_token = None + first_row_id = None + + try: + tokens = cast(dict, get_kv_store().load("slack_bot_tokens_config_key")) + except Exception: + logger.warning("No existing Slack bot tokens found.") + tokens = {} + + bot_token = tokens.get("bot_token") + app_token = tokens.get("app_token") + + if bot_token and app_token: + logger.info(f"{revision}: 
Found bot and app tokens.") + + session = Session(bind=op.get_bind()) + new_slack_bot = SlackBot( + name="Slack Bot (Migrated)", + enabled=True, + bot_token=bot_token, + app_token=app_token, + ) + session.add(new_slack_bot) + session.commit() + first_row_id = new_slack_bot.id + + # Create a default bot if none exists + # This is in case there are no slack tokens but there are channels configured + op.execute( + sa.text( + """ + INSERT INTO slack_bot (name, enabled, bot_token, app_token) + SELECT 'Default Bot', true, '', '' + WHERE NOT EXISTS (SELECT 1 FROM slack_bot) + RETURNING id; + """ + ) + ) + + # Get the bot ID to use (either from existing migration or newly created) + bot_id_query = sa.text( + """ + SELECT COALESCE( + :first_row_id, + (SELECT id FROM slack_bot ORDER BY id ASC LIMIT 1) + ) as bot_id; + """ + ) + result = op.get_bind().execute(bot_id_query, {"first_row_id": first_row_id}) + bot_id = result.scalar() + + # CTE (Common Table Expression) that transforms the old slack_bot_config table data + # This splits up the channel_names into their own rows + channel_names_cte = """ + WITH channel_names AS ( + SELECT + sbc.id as config_id, + sbc.persona_id, + sbc.response_type, + sbc.enable_auto_filters, + jsonb_array_elements_text(sbc.channel_config->'channel_names') as channel_name, + sbc.channel_config->>'respond_tag_only' as respond_tag_only, + sbc.channel_config->>'respond_to_bots' as respond_to_bots, + sbc.channel_config->'respond_member_group_list' as respond_member_group_list, + sbc.channel_config->'answer_filters' as answer_filters, + sbc.channel_config->'follow_up_tags' as follow_up_tags + FROM slack_bot_config sbc + ) + """ + + # Insert the channel names into the new slack_channel_config table + insert_statement = """ + INSERT INTO slack_channel_config ( + slack_bot_id, + persona_id, + channel_config, + response_type, + enable_auto_filters + ) + SELECT + :bot_id, + channel_name.persona_id, + jsonb_build_object( + 'channel_name', channel_name.channel_name, + 'respond_tag_only', + COALESCE((channel_name.respond_tag_only)::boolean, false), + 'respond_to_bots', + COALESCE((channel_name.respond_to_bots)::boolean, false), + 'respond_member_group_list', + COALESCE(channel_name.respond_member_group_list, '[]'::jsonb), + 'answer_filters', + COALESCE(channel_name.answer_filters, '[]'::jsonb), + 'follow_up_tags', + COALESCE(channel_name.follow_up_tags, '[]'::jsonb) + ), + channel_name.response_type, + channel_name.enable_auto_filters + FROM channel_names channel_name; + """ + + op.execute(sa.text(channel_names_cte + insert_statement).bindparams(bot_id=bot_id)) + + # Clean up old tokens if they existed + try: + if bot_token and app_token: + logger.info(f"{revision}: Removing old bot and app tokens.") + get_kv_store().delete("slack_bot_tokens_config_key") + except Exception: + logger.warning("tried to delete tokens in dynamic config but failed") + # Rename the table + op.rename_table( + "slack_bot_config__standard_answer_category", + "slack_channel_config__standard_answer_category", + ) + + # Rename the column + op.alter_column( + "slack_channel_config__standard_answer_category", + "slack_bot_config_id", + new_column_name="slack_channel_config_id", + ) + + # Drop the table with CASCADE to handle dependent objects + op.execute("DROP TABLE slack_bot_config CASCADE") + + logger.info(f"{revision}: Migration complete.") + + +def downgrade() -> None: + # Recreate the old slack_bot_config table + op.create_table( + "slack_bot_config", + sa.Column("id", sa.Integer(), nullable=False), + 
sa.Column("persona_id", sa.Integer(), nullable=True), + sa.Column("channel_config", postgresql.JSONB(), nullable=False), + sa.Column("response_type", sa.String(), nullable=False), + sa.Column("enable_auto_filters", sa.Boolean(), nullable=False), + sa.ForeignKeyConstraint( + ["persona_id"], + ["persona.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + + # Migrate data back to the old format + # Group by persona_id to combine channel names back into arrays + op.execute( + sa.text( + """ + INSERT INTO slack_bot_config ( + persona_id, + channel_config, + response_type, + enable_auto_filters + ) + SELECT DISTINCT ON (persona_id) + persona_id, + jsonb_build_object( + 'channel_names', ( + SELECT jsonb_agg(c.channel_config->>'channel_name') + FROM slack_channel_config c + WHERE c.persona_id = scc.persona_id + ), + 'respond_tag_only', (channel_config->>'respond_tag_only')::boolean, + 'respond_to_bots', (channel_config->>'respond_to_bots')::boolean, + 'respond_member_group_list', channel_config->'respond_member_group_list', + 'answer_filters', channel_config->'answer_filters', + 'follow_up_tags', channel_config->'follow_up_tags' + ), + response_type, + enable_auto_filters + FROM slack_channel_config scc + WHERE persona_id IS NOT NULL; + """ + ) + ) + + # Rename the table back + op.rename_table( + "slack_channel_config__standard_answer_category", + "slack_bot_config__standard_answer_category", + ) + + # Rename the column back + op.alter_column( + "slack_bot_config__standard_answer_category", + "slack_channel_config_id", + new_column_name="slack_bot_config_id", + ) + + # Try to save the first bot's tokens back to KV store + try: + first_bot = ( + op.get_bind() + .execute( + sa.text( + "SELECT bot_token, app_token FROM slack_bot ORDER BY id LIMIT 1" + ) + ) + .first() + ) + if first_bot and first_bot.bot_token and first_bot.app_token: + tokens = { + "bot_token": first_bot.bot_token, + "app_token": first_bot.app_token, + } + get_kv_store().store("slack_bot_tokens_config_key", tokens) + except Exception: + logger.warning("Failed to save tokens back to KV store") + + # Drop the new tables in reverse order + op.drop_table("slack_channel_config") + op.drop_table("slack_bot") diff --git a/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py b/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py index f284c7b4bf1..068342095b6 100644 --- a/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py +++ b/backend/alembic/versions/52a219fb5233_add_last_synced_and_last_modified_to_document_table.py @@ -1,7 +1,7 @@ """Add last synced and last modified to document table Revision ID: 52a219fb5233 -Revises: f17bf3b0d9f1 +Revises: f7e58d357687 Create Date: 2024-08-28 17:40:46.077470 """ diff --git a/backend/alembic/versions/55546a7967ee_assistant_rework.py b/backend/alembic/versions/55546a7967ee_assistant_rework.py new file mode 100644 index 00000000000..a027321a7c6 --- /dev/null +++ b/backend/alembic/versions/55546a7967ee_assistant_rework.py @@ -0,0 +1,79 @@ +"""assistant_rework + +Revision ID: 55546a7967ee +Revises: 61ff3651add4 +Create Date: 2024-09-18 17:00:23.755399 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + + +# revision identifiers, used by Alembic. 
+revision = "55546a7967ee" +down_revision = "61ff3651add4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Reworking persona and user tables for new assistant features + # keep track of user's chosen assistants separate from their `ordering` + op.add_column("persona", sa.Column("builtin_persona", sa.Boolean(), nullable=True)) + op.execute("UPDATE persona SET builtin_persona = default_persona") + op.alter_column("persona", "builtin_persona", nullable=False) + op.drop_index("_default_persona_name_idx", table_name="persona") + op.create_index( + "_builtin_persona_name_idx", + "persona", + ["name"], + unique=True, + postgresql_where=sa.text("builtin_persona = true"), + ) + + op.add_column( + "user", sa.Column("visible_assistants", postgresql.JSONB(), nullable=True) + ) + op.add_column( + "user", sa.Column("hidden_assistants", postgresql.JSONB(), nullable=True) + ) + op.execute( + "UPDATE \"user\" SET visible_assistants = '[]'::jsonb, hidden_assistants = '[]'::jsonb" + ) + op.alter_column( + "user", + "visible_assistants", + nullable=False, + server_default=sa.text("'[]'::jsonb"), + ) + op.alter_column( + "user", + "hidden_assistants", + nullable=False, + server_default=sa.text("'[]'::jsonb"), + ) + op.drop_column("persona", "default_persona") + op.add_column( + "persona", sa.Column("is_default_persona", sa.Boolean(), nullable=True) + ) + + +def downgrade() -> None: + # Reverting changes made in upgrade + op.drop_column("user", "hidden_assistants") + op.drop_column("user", "visible_assistants") + op.drop_index("_builtin_persona_name_idx", table_name="persona") + + op.drop_column("persona", "is_default_persona") + op.add_column("persona", sa.Column("default_persona", sa.Boolean(), nullable=True)) + op.execute("UPDATE persona SET default_persona = builtin_persona") + op.alter_column("persona", "default_persona", nullable=False) + op.drop_column("persona", "builtin_persona") + op.create_index( + "_default_persona_name_idx", + "persona", + ["name"], + unique=True, + postgresql_where=sa.text("default_persona = true"), + ) diff --git a/backend/alembic/versions/5b29123cd710_nullable_search_settings_for_historic_.py b/backend/alembic/versions/5b29123cd710_nullable_search_settings_for_historic_.py new file mode 100644 index 00000000000..58164cd4c14 --- /dev/null +++ b/backend/alembic/versions/5b29123cd710_nullable_search_settings_for_historic_.py @@ -0,0 +1,70 @@ +"""nullable search settings for historic index attempts + +Revision ID: 5b29123cd710 +Revises: 949b4a92a401 +Create Date: 2024-10-30 19:37:59.630704 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "5b29123cd710" +down_revision = "949b4a92a401" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Drop the existing foreign key constraint + op.drop_constraint( + "fk_index_attempt_search_settings", "index_attempt", type_="foreignkey" + ) + + # Modify the column to be nullable + op.alter_column( + "index_attempt", "search_settings_id", existing_type=sa.INTEGER(), nullable=True + ) + + # Add back the foreign key with ON DELETE SET NULL + op.create_foreign_key( + "fk_index_attempt_search_settings", + "index_attempt", + "search_settings", + ["search_settings_id"], + ["id"], + ondelete="SET NULL", + ) + + +def downgrade() -> None: + # Warning: This will delete all index attempts that don't have search settings + op.execute( + """ + DELETE FROM index_attempt + WHERE search_settings_id IS NULL + """ + ) + + # Drop foreign key constraint + op.drop_constraint( + "fk_index_attempt_search_settings", "index_attempt", type_="foreignkey" + ) + + # Modify the column to be not nullable + op.alter_column( + "index_attempt", + "search_settings_id", + existing_type=sa.INTEGER(), + nullable=False, + ) + + # Add back the foreign key without ON DELETE SET NULL + op.create_foreign_key( + "fk_index_attempt_search_settings", + "index_attempt", + "search_settings", + ["search_settings_id"], + ["id"], + ) diff --git a/backend/alembic/versions/5d12a446f5c0_add_api_version_and_deployment_name_to_.py b/backend/alembic/versions/5d12a446f5c0_add_api_version_and_deployment_name_to_.py new file mode 100644 index 00000000000..85b5431ecc3 --- /dev/null +++ b/backend/alembic/versions/5d12a446f5c0_add_api_version_and_deployment_name_to_.py @@ -0,0 +1,30 @@ +"""add api_version and deployment_name to search settings + +Revision ID: 5d12a446f5c0 +Revises: e4334d5b33ba +Create Date: 2024-10-08 15:56:07.975636 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "5d12a446f5c0" +down_revision = "e4334d5b33ba" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "embedding_provider", sa.Column("api_version", sa.String(), nullable=True) + ) + op.add_column( + "embedding_provider", sa.Column("deployment_name", sa.String(), nullable=True) + ) + + +def downgrade() -> None: + op.drop_column("embedding_provider", "deployment_name") + op.drop_column("embedding_provider", "api_version") diff --git a/backend/alembic/versions/61ff3651add4_add_permission_syncing.py b/backend/alembic/versions/61ff3651add4_add_permission_syncing.py new file mode 100644 index 00000000000..697e1060e0b --- /dev/null +++ b/backend/alembic/versions/61ff3651add4_add_permission_syncing.py @@ -0,0 +1,162 @@ +"""Add Permission Syncing + +Revision ID: 61ff3651add4 +Revises: 1b8206b29c5d +Create Date: 2024-09-05 13:57:11.770413 + +""" +import fastapi_users_db_sqlalchemy + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "61ff3651add4" +down_revision = "1b8206b29c5d" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Admin user who set up connectors will lose access to the docs temporarily + # only way currently to give back access is to rerun from beginning + op.add_column( + "connector_credential_pair", + sa.Column( + "access_type", + sa.String(), + nullable=True, + ), + ) + op.execute( + "UPDATE connector_credential_pair SET access_type = 'PUBLIC' WHERE is_public = true" + ) + op.execute( + "UPDATE connector_credential_pair SET access_type = 'PRIVATE' WHERE is_public = false" + ) + op.alter_column("connector_credential_pair", "access_type", nullable=False) + + op.add_column( + "connector_credential_pair", + sa.Column( + "auto_sync_options", + postgresql.JSONB(astext_type=sa.Text()), + nullable=True, + ), + ) + op.add_column( + "connector_credential_pair", + sa.Column("last_time_perm_sync", sa.DateTime(timezone=True), nullable=True), + ) + op.drop_column("connector_credential_pair", "is_public") + + op.add_column( + "document", + sa.Column("external_user_emails", postgresql.ARRAY(sa.String()), nullable=True), + ) + op.add_column( + "document", + sa.Column( + "external_user_group_ids", postgresql.ARRAY(sa.String()), nullable=True + ), + ) + op.add_column( + "document", + sa.Column("is_public", sa.Boolean(), nullable=True), + ) + + op.create_table( + "user__external_user_group_id", + sa.Column( + "user_id", fastapi_users_db_sqlalchemy.generics.GUID(), nullable=False + ), + sa.Column("external_user_group_id", sa.String(), nullable=False), + sa.Column("cc_pair_id", sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint("user_id"), + ) + + op.drop_column("external_permission", "user_id") + op.drop_column("email_to_external_user_cache", "user_id") + op.drop_table("permission_sync_run") + op.drop_table("external_permission") + op.drop_table("email_to_external_user_cache") + + +def downgrade() -> None: + op.add_column( + "connector_credential_pair", + sa.Column("is_public", sa.BOOLEAN(), nullable=True), + ) + op.execute( + "UPDATE connector_credential_pair SET is_public = (access_type = 'PUBLIC')" + ) + op.alter_column("connector_credential_pair", "is_public", nullable=False) + + op.drop_column("connector_credential_pair", "auto_sync_options") + op.drop_column("connector_credential_pair", "access_type") + op.drop_column("connector_credential_pair", "last_time_perm_sync") + op.drop_column("document", "external_user_emails") + op.drop_column("document", "external_user_group_ids") + op.drop_column("document", "is_public") + + op.drop_table("user__external_user_group_id") + + # Drop the enum type at the end of the downgrade + op.create_table( + "permission_sync_run", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column( + "source_type", + sa.String(), + nullable=False, + ), + sa.Column("update_type", sa.String(), nullable=False), + sa.Column("cc_pair_id", sa.Integer(), nullable=True), + sa.Column( + "status", + sa.String(), + nullable=False, + ), + sa.Column("error_msg", sa.Text(), nullable=True), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["cc_pair_id"], + ["connector_credential_pair.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_table( + "external_permission", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("user_id", sa.UUID(), nullable=True), + sa.Column("user_email", sa.String(), nullable=False), + sa.Column( + "source_type", + sa.String(), 
+ nullable=False, + ), + sa.Column("external_permission_group", sa.String(), nullable=False), + sa.ForeignKeyConstraint( + ["user_id"], + ["user.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_table( + "email_to_external_user_cache", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("external_user_id", sa.String(), nullable=False), + sa.Column("user_id", sa.UUID(), nullable=True), + sa.Column("user_email", sa.String(), nullable=False), + sa.ForeignKeyConstraint( + ["user_id"], + ["user.id"], + ), + sa.PrimaryKeyConstraint("id"), + ) diff --git a/backend/alembic/versions/6756efa39ada_id_uuid_for_chat_session.py b/backend/alembic/versions/6756efa39ada_id_uuid_for_chat_session.py new file mode 100644 index 00000000000..083fececd87 --- /dev/null +++ b/backend/alembic/versions/6756efa39ada_id_uuid_for_chat_session.py @@ -0,0 +1,153 @@ +"""Migrate chat_session and chat_message tables to use UUID primary keys + +Revision ID: 6756efa39ada +Revises: 5d12a446f5c0 +Create Date: 2024-10-15 17:47:44.108537 + +""" +from alembic import op +import sqlalchemy as sa + +revision = "6756efa39ada" +down_revision = "5d12a446f5c0" +branch_labels = None +depends_on = None + +""" +This script: +1. Adds UUID columns to chat_session and chat_message +2. Populates new columns with UUIDs +3. Updates foreign key relationships +4. Removes old integer ID columns + +Note: Downgrade will assign new integer IDs, not restore original ones. +""" + + +def upgrade() -> None: + op.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto;") + + op.add_column( + "chat_session", + sa.Column( + "new_id", + sa.UUID(as_uuid=True), + server_default=sa.text("gen_random_uuid()"), + nullable=False, + ), + ) + + op.execute("UPDATE chat_session SET new_id = gen_random_uuid();") + + op.add_column( + "chat_message", + sa.Column("new_chat_session_id", sa.UUID(as_uuid=True), nullable=True), + ) + + op.execute( + """ + UPDATE chat_message + SET new_chat_session_id = cs.new_id + FROM chat_session cs + WHERE chat_message.chat_session_id = cs.id; + """ + ) + + op.drop_constraint( + "chat_message_chat_session_id_fkey", "chat_message", type_="foreignkey" + ) + + op.drop_column("chat_message", "chat_session_id") + op.alter_column( + "chat_message", "new_chat_session_id", new_column_name="chat_session_id" + ) + + op.drop_constraint("chat_session_pkey", "chat_session", type_="primary") + op.drop_column("chat_session", "id") + op.alter_column("chat_session", "new_id", new_column_name="id") + + op.create_primary_key("chat_session_pkey", "chat_session", ["id"]) + + op.create_foreign_key( + "chat_message_chat_session_id_fkey", + "chat_message", + "chat_session", + ["chat_session_id"], + ["id"], + ondelete="CASCADE", + ) + + +def downgrade() -> None: + op.drop_constraint( + "chat_message_chat_session_id_fkey", "chat_message", type_="foreignkey" + ) + + op.add_column( + "chat_session", + sa.Column("old_id", sa.Integer, autoincrement=True, nullable=True), + ) + + op.execute("CREATE SEQUENCE chat_session_old_id_seq OWNED BY chat_session.old_id;") + op.execute( + "ALTER TABLE chat_session ALTER COLUMN old_id SET DEFAULT nextval('chat_session_old_id_seq');" + ) + + op.execute( + "UPDATE chat_session SET old_id = nextval('chat_session_old_id_seq') WHERE old_id IS NULL;" + ) + + op.alter_column("chat_session", "old_id", nullable=False) + + op.drop_constraint("chat_session_pkey", "chat_session", type_="primary") + op.create_primary_key("chat_session_pkey", "chat_session", ["old_id"]) + + op.add_column( + "chat_message", + 
sa.Column("old_chat_session_id", sa.Integer, nullable=True), + ) + + op.execute( + """ + UPDATE chat_message + SET old_chat_session_id = cs.old_id + FROM chat_session cs + WHERE chat_message.chat_session_id = cs.id; + """ + ) + + op.drop_column("chat_message", "chat_session_id") + op.alter_column( + "chat_message", "old_chat_session_id", new_column_name="chat_session_id" + ) + + op.create_foreign_key( + "chat_message_chat_session_id_fkey", + "chat_message", + "chat_session", + ["chat_session_id"], + ["old_id"], + ondelete="CASCADE", + ) + + op.drop_column("chat_session", "id") + op.alter_column("chat_session", "old_id", new_column_name="id") + + op.alter_column( + "chat_session", + "id", + type_=sa.Integer(), + existing_type=sa.Integer(), + existing_nullable=False, + existing_server_default=False, + ) + + # Rename the sequence + op.execute("ALTER SEQUENCE chat_session_old_id_seq RENAME TO chat_session_id_seq;") + + # Update the default value to use the renamed sequence + op.alter_column( + "chat_session", + "id", + server_default=sa.text("nextval('chat_session_id_seq'::regclass)"), + ) diff --git a/backend/alembic/versions/6d562f86c78b_remove_default_bot.py b/backend/alembic/versions/6d562f86c78b_remove_default_bot.py new file mode 100644 index 00000000000..3e7097b87bc --- /dev/null +++ b/backend/alembic/versions/6d562f86c78b_remove_default_bot.py @@ -0,0 +1,45 @@ +"""remove default bot + +Revision ID: 6d562f86c78b +Revises: 177de57c21c9 +Create Date: 2024-11-22 11:51:29.331336 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "6d562f86c78b" +down_revision = "177de57c21c9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + sa.text( + """ + DELETE FROM slack_bot + WHERE name = 'Default Bot' + AND bot_token = '' + AND app_token = '' + AND NOT EXISTS ( + SELECT 1 FROM slack_channel_config + WHERE slack_channel_config.slack_bot_id = slack_bot.id + ) + """ + ) + ) + + +def downgrade() -> None: + op.execute( + sa.text( + """ + INSERT INTO slack_bot (name, enabled, bot_token, app_token) + SELECT 'Default Bot', true, '', '' + WHERE NOT EXISTS (SELECT 1 FROM slack_bot) + RETURNING id; + """ + ) + ) diff --git a/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py b/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py index ed1993efed3..9e1fdf3cb9e 100644 --- a/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py +++ b/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py @@ -9,7 +9,7 @@ from typing import cast from alembic import op import sqlalchemy as sa -from danswer.dynamic_configs.factory import get_dynamic_config_store +from danswer.key_value_store.factory import get_kv_store # revision identifiers, used by Alembic. 
revision = "703313b75876" @@ -54,9 +54,7 @@ def upgrade() -> None: ) try: - settings_json = cast( - str, get_dynamic_config_store().load("token_budget_settings") - ) + settings_json = cast(str, get_kv_store().load("token_budget_settings")) settings = json.loads(settings_json) is_enabled = settings.get("enable_token_budget", False) @@ -71,7 +69,7 @@ def upgrade() -> None: ) # Delete the dynamic config - get_dynamic_config_store().delete("token_budget_settings") + get_kv_store().delete("token_budget_settings") except Exception: # Ignore if the dynamic config is not found diff --git a/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py b/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py index c2ba10b3875..09c0f7209d4 100644 --- a/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py +++ b/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py @@ -9,8 +9,8 @@ import sqlalchemy as sa from danswer.db.models import IndexModelStatus -from danswer.search.enums import RecencyBiasSetting -from danswer.search.enums import SearchType +from danswer.context.search.enums import RecencyBiasSetting +from danswer.context.search.enums import SearchType # revision identifiers, used by Alembic. revision = "776b3bbe9092" diff --git a/backend/alembic/versions/797089dfb4d2_persona_start_date.py b/backend/alembic/versions/797089dfb4d2_persona_start_date.py new file mode 100644 index 00000000000..52ade3dea4e --- /dev/null +++ b/backend/alembic/versions/797089dfb4d2_persona_start_date.py @@ -0,0 +1,27 @@ +"""persona_start_date + +Revision ID: 797089dfb4d2 +Revises: 55546a7967ee +Create Date: 2024-09-11 14:51:49.785835 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "797089dfb4d2" +down_revision = "55546a7967ee" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "persona", + sa.Column("search_start_date", sa.DateTime(timezone=True), nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("persona", "search_start_date") diff --git a/backend/alembic/versions/93560ba1b118_add_web_ui_option_to_slack_config.py b/backend/alembic/versions/93560ba1b118_add_web_ui_option_to_slack_config.py new file mode 100644 index 00000000000..ab084aee314 --- /dev/null +++ b/backend/alembic/versions/93560ba1b118_add_web_ui_option_to_slack_config.py @@ -0,0 +1,35 @@ +"""add web ui option to slack config + +Revision ID: 93560ba1b118 +Revises: 6d562f86c78b +Create Date: 2024-11-24 06:36:17.490612 + +""" +from alembic import op + +# revision identifiers, used by Alembic. +revision = "93560ba1b118" +down_revision = "6d562f86c78b" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Add show_continue_in_web_ui with default False to all existing channel_configs + op.execute( + """ + UPDATE slack_channel_config + SET channel_config = channel_config || '{"show_continue_in_web_ui": false}'::jsonb + WHERE NOT channel_config ? 
'show_continue_in_web_ui' + """ + ) + + +def downgrade() -> None: + # Remove show_continue_in_web_ui from all channel_configs + op.execute( + """ + UPDATE slack_channel_config + SET channel_config = channel_config - 'show_continue_in_web_ui' + """ + ) diff --git a/backend/alembic/versions/949b4a92a401_remove_rt.py b/backend/alembic/versions/949b4a92a401_remove_rt.py new file mode 100644 index 00000000000..5d4f63ff87e --- /dev/null +++ b/backend/alembic/versions/949b4a92a401_remove_rt.py @@ -0,0 +1,72 @@ +"""remove rt + +Revision ID: 949b4a92a401 +Revises: 1b10e1fda030 +Create Date: 2024-10-26 13:06:06.937969 + +""" +from alembic import op +from sqlalchemy.orm import Session +from sqlalchemy import text + +# Import your models and constants +from danswer.db.models import ( + Connector, + ConnectorCredentialPair, + Credential, + IndexAttempt, +) + + +# revision identifiers, used by Alembic. +revision = "949b4a92a401" +down_revision = "1b10e1fda030" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Deletes all RequestTracker connectors and associated data + bind = op.get_bind() + session = Session(bind=bind) + + # Get connectors using raw SQL + result = bind.execute( + text("SELECT id FROM connector WHERE source = 'requesttracker'") + ) + connector_ids = [row[0] for row in result] + + if connector_ids: + cc_pairs_to_delete = ( + session.query(ConnectorCredentialPair) + .filter(ConnectorCredentialPair.connector_id.in_(connector_ids)) + .all() + ) + + cc_pair_ids = [cc_pair.id for cc_pair in cc_pairs_to_delete] + + if cc_pair_ids: + session.query(IndexAttempt).filter( + IndexAttempt.connector_credential_pair_id.in_(cc_pair_ids) + ).delete(synchronize_session=False) + + session.query(ConnectorCredentialPair).filter( + ConnectorCredentialPair.id.in_(cc_pair_ids) + ).delete(synchronize_session=False) + + credential_ids = [cc_pair.credential_id for cc_pair in cc_pairs_to_delete] + if credential_ids: + session.query(Credential).filter(Credential.id.in_(credential_ids)).delete( + synchronize_session=False + ) + + session.query(Connector).filter(Connector.id.in_(connector_ids)).delete( + synchronize_session=False + ) + + session.commit() + + +def downgrade() -> None: + # No-op downgrade as we cannot restore deleted data + pass diff --git a/backend/alembic/versions/9cf5c00f72fe_add_creator_to_cc_pair.py b/backend/alembic/versions/9cf5c00f72fe_add_creator_to_cc_pair.py new file mode 100644 index 00000000000..f4ed3a67909 --- /dev/null +++ b/backend/alembic/versions/9cf5c00f72fe_add_creator_to_cc_pair.py @@ -0,0 +1,30 @@ +"""add creator to cc pair + +Revision ID: 9cf5c00f72fe +Revises: 26b931506ecb +Create Date: 2024-11-12 15:16:42.682902 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. 
+revision = "9cf5c00f72fe" +down_revision = "26b931506ecb" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "connector_credential_pair", + sa.Column( + "creator_id", + sa.UUID(as_uuid=True), + nullable=True, + ), + ) + + +def downgrade() -> None: + op.drop_column("connector_credential_pair", "creator_id") diff --git a/backend/alembic/versions/ac5eaac849f9_add_last_pruned_to_connector_table.py b/backend/alembic/versions/ac5eaac849f9_add_last_pruned_to_connector_table.py new file mode 100644 index 00000000000..b2c33e1688d --- /dev/null +++ b/backend/alembic/versions/ac5eaac849f9_add_last_pruned_to_connector_table.py @@ -0,0 +1,27 @@ +"""add last_pruned to the connector_credential_pair table + +Revision ID: ac5eaac849f9 +Revises: 52a219fb5233 +Create Date: 2024-09-10 15:04:26.437118 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "ac5eaac849f9" +down_revision = "46b7a812670f" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # last pruned represents the last time the connector was pruned + op.add_column( + "connector_credential_pair", + sa.Column("last_pruned", sa.DateTime(timezone=True), nullable=True), + ) + + +def downgrade() -> None: + op.drop_column("connector_credential_pair", "last_pruned") diff --git a/backend/alembic/versions/b082fec533f0_make_last_attempt_status_nullable.py b/backend/alembic/versions/b082fec533f0_make_last_attempt_status_nullable.py index a6938e365c6..db7b330c3e0 100644 --- a/backend/alembic/versions/b082fec533f0_make_last_attempt_status_nullable.py +++ b/backend/alembic/versions/b082fec533f0_make_last_attempt_status_nullable.py @@ -31,6 +31,12 @@ def upgrade() -> None: def downgrade() -> None: + # First, update any null values to a default value + op.execute( + "UPDATE connector_credential_pair SET last_attempt_status = 'NOT_STARTED' WHERE last_attempt_status IS NULL" + ) + + # Then, make the column non-nullable op.alter_column( "connector_credential_pair", "last_attempt_status", diff --git a/backend/alembic/versions/b156fa702355_chat_reworked.py b/backend/alembic/versions/b156fa702355_chat_reworked.py index c80ab6a0fb1..a6d75fb508b 100644 --- a/backend/alembic/versions/b156fa702355_chat_reworked.py +++ b/backend/alembic/versions/b156fa702355_chat_reworked.py @@ -288,6 +288,15 @@ def upgrade() -> None: def downgrade() -> None: + # NOTE: you will lose all chat history. This is to satisfy the non-nullable constraints + # below + op.execute("DELETE FROM chat_feedback") + op.execute("DELETE FROM chat_message__search_doc") + op.execute("DELETE FROM document_retrieval_feedback") + op.execute("DELETE FROM document_retrieval_feedback") + op.execute("DELETE FROM chat_message") + op.execute("DELETE FROM chat_session") + op.drop_constraint( "chat_feedback__chat_message_fk", "chat_feedback", type_="foreignkey" ) diff --git a/backend/alembic/versions/b72ed7a5db0e_remove_description_from_starter_messages.py b/backend/alembic/versions/b72ed7a5db0e_remove_description_from_starter_messages.py new file mode 100644 index 00000000000..96dfa964eac --- /dev/null +++ b/backend/alembic/versions/b72ed7a5db0e_remove_description_from_starter_messages.py @@ -0,0 +1,48 @@ +"""remove description from starter messages + +Revision ID: b72ed7a5db0e +Revises: 33cb72ea4d80 +Create Date: 2024-11-03 15:55:28.944408 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = "b72ed7a5db0e" +down_revision = "33cb72ea4d80" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + sa.text( + """ + UPDATE persona + SET starter_messages = ( + SELECT jsonb_agg(elem - 'description') + FROM jsonb_array_elements(starter_messages) elem + ) + WHERE starter_messages IS NOT NULL + AND jsonb_typeof(starter_messages) = 'array' + """ + ) + ) + + +def downgrade() -> None: + op.execute( + sa.text( + """ + UPDATE persona + SET starter_messages = ( + SELECT jsonb_agg(elem || '{"description": ""}') + FROM jsonb_array_elements(starter_messages) elem + ) + WHERE starter_messages IS NOT NULL + AND jsonb_typeof(starter_messages) = 'array' + """ + ) + ) diff --git a/backend/alembic/versions/bd2921608c3a_non_nullable_default_persona.py b/backend/alembic/versions/bd2921608c3a_non_nullable_default_persona.py new file mode 100644 index 00000000000..834d3f6731c --- /dev/null +++ b/backend/alembic/versions/bd2921608c3a_non_nullable_default_persona.py @@ -0,0 +1,43 @@ +"""non nullable default persona + +Revision ID: bd2921608c3a +Revises: 797089dfb4d2 +Create Date: 2024-09-20 10:28:37.992042 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "bd2921608c3a" +down_revision = "797089dfb4d2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Set existing NULL values to False + op.execute( + "UPDATE persona SET is_default_persona = FALSE WHERE is_default_persona IS NULL" + ) + + # Alter the column to be not nullable with a default value of False + op.alter_column( + "persona", + "is_default_persona", + existing_type=sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ) + + +def downgrade() -> None: + # Revert the changes + op.alter_column( + "persona", + "is_default_persona", + existing_type=sa.Boolean(), + nullable=True, + server_default=None, + ) diff --git a/backend/alembic/versions/c0fd6e4da83a_add_recent_assistants.py b/backend/alembic/versions/c0fd6e4da83a_add_recent_assistants.py new file mode 100644 index 00000000000..8ad1f007756 --- /dev/null +++ b/backend/alembic/versions/c0fd6e4da83a_add_recent_assistants.py @@ -0,0 +1,29 @@ +"""add recent assistants + +Revision ID: c0fd6e4da83a +Revises: b72ed7a5db0e +Create Date: 2024-11-03 17:28:54.916618 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "c0fd6e4da83a" +down_revision = "b72ed7a5db0e" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "user", + sa.Column( + "recent_assistants", postgresql.JSONB(), server_default="[]", nullable=False + ), + ) + + +def downgrade() -> None: + op.drop_column("user", "recent_assistants") diff --git a/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py b/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py index 58fcf482c85..222605189fe 100644 --- a/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py +++ b/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py @@ -23,6 +23,56 @@ def upgrade() -> None: def downgrade() -> None: + # Delete chat messages and feedback first since they reference chat sessions + # Get chat messages from sessions with null persona_id + chat_messages_query = """ + SELECT id + FROM chat_message + WHERE chat_session_id IN ( + SELECT id + FROM chat_session + WHERE persona_id IS NULL + ) + """ + + # Delete dependent records first + op.execute( + f""" + DELETE FROM document_retrieval_feedback + WHERE chat_message_id IN ( + {chat_messages_query} + ) + """ + ) + op.execute( + f""" + DELETE FROM chat_message__search_doc + WHERE chat_message_id IN ( + {chat_messages_query} + ) + """ + ) + + # Delete chat messages + op.execute( + """ + DELETE FROM chat_message + WHERE chat_session_id IN ( + SELECT id + FROM chat_session + WHERE persona_id IS NULL + ) + """ + ) + + # Now we can safely delete the chat sessions + op.execute( + """ + DELETE FROM chat_session + WHERE persona_id IS NULL + """ + ) + op.alter_column( "chat_session", "persona_id", diff --git a/backend/alembic/versions/da4c21c69164_chosen_assistants_changed_to_jsonb.py b/backend/alembic/versions/da4c21c69164_chosen_assistants_changed_to_jsonb.py index 95b53cbeb41..8e0a8e6072d 100644 --- a/backend/alembic/versions/da4c21c69164_chosen_assistants_changed_to_jsonb.py +++ b/backend/alembic/versions/da4c21c69164_chosen_assistants_changed_to_jsonb.py @@ -20,7 +20,7 @@ def upgrade() -> None: conn = op.get_bind() existing_ids_and_chosen_assistants = conn.execute( - sa.text("select id, chosen_assistants from public.user") + sa.text('select id, chosen_assistants from "user"') ) op.drop_column( "user", @@ -37,7 +37,7 @@ def upgrade() -> None: for id, chosen_assistants in existing_ids_and_chosen_assistants: conn.execute( sa.text( - "update public.user set chosen_assistants = :chosen_assistants where id = :id" + 'update "user" set chosen_assistants = :chosen_assistants where id = :id' ), {"chosen_assistants": json.dumps(chosen_assistants), "id": id}, ) @@ -46,7 +46,7 @@ def upgrade() -> None: def downgrade() -> None: conn = op.get_bind() existing_ids_and_chosen_assistants = conn.execute( - sa.text("select id, chosen_assistants from public.user") + sa.text('select id, chosen_assistants from "user"') ) op.drop_column( "user", @@ -59,7 +59,7 @@ def downgrade() -> None: for id, chosen_assistants in existing_ids_and_chosen_assistants: conn.execute( sa.text( - "update public.user set chosen_assistants = :chosen_assistants where id = :id" + 'update "user" set chosen_assistants = :chosen_assistants where id = :id' ), {"chosen_assistants": chosen_assistants, "id": id}, ) diff --git a/backend/alembic/versions/dfbe9e93d3c7_extended_role_for_non_web.py b/backend/alembic/versions/dfbe9e93d3c7_extended_role_for_non_web.py new file mode 100644 index 00000000000..3f717ff09f1 --- /dev/null +++ 
b/backend/alembic/versions/dfbe9e93d3c7_extended_role_for_non_web.py @@ -0,0 +1,42 @@ +"""extended_role_for_non_web + +Revision ID: dfbe9e93d3c7 +Revises: 9cf5c00f72fe +Create Date: 2024-11-16 07:54:18.727906 + +""" +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "dfbe9e93d3c7" +down_revision = "9cf5c00f72fe" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute( + """ + UPDATE "user" + SET role = 'EXT_PERM_USER' + WHERE has_web_login = false + """ + ) + op.drop_column("user", "has_web_login") + + +def downgrade() -> None: + op.add_column( + "user", + sa.Column("has_web_login", sa.Boolean(), nullable=False, server_default="true"), + ) + + op.execute( + """ + UPDATE "user" + SET has_web_login = false, + role = 'BASIC' + WHERE role IN ('SLACK_USER', 'EXT_PERM_USER') + """ + ) diff --git a/backend/alembic/versions/e4334d5b33ba_add_deployment_name_to_llmprovider.py b/backend/alembic/versions/e4334d5b33ba_add_deployment_name_to_llmprovider.py new file mode 100644 index 00000000000..e837b87e3e0 --- /dev/null +++ b/backend/alembic/versions/e4334d5b33ba_add_deployment_name_to_llmprovider.py @@ -0,0 +1,26 @@ +"""add_deployment_name_to_llmprovider + +Revision ID: e4334d5b33ba +Revises: ac5eaac849f9 +Create Date: 2024-10-04 09:52:34.896867 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "e4334d5b33ba" +down_revision = "ac5eaac849f9" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "llm_provider", sa.Column("deployment_name", sa.String(), nullable=True) + ) + + +def downgrade() -> None: + op.drop_column("llm_provider", "deployment_name") diff --git a/backend/alembic/versions/efb35676026c_standard_answer_match_regex_flag.py b/backend/alembic/versions/efb35676026c_standard_answer_match_regex_flag.py index c85bb68a3b9..e67d31b81ff 100644 --- a/backend/alembic/versions/efb35676026c_standard_answer_match_regex_flag.py +++ b/backend/alembic/versions/efb35676026c_standard_answer_match_regex_flag.py @@ -1,7 +1,7 @@ """standard answer match_regex flag Revision ID: efb35676026c -Revises: 52a219fb5233 +Revises: 0ebb1d516877 Create Date: 2024-09-11 13:55:46.101149 """ diff --git a/backend/alembic/versions/f32615f71aeb_add_custom_headers_to_tools.py b/backend/alembic/versions/f32615f71aeb_add_custom_headers_to_tools.py new file mode 100644 index 00000000000..904059e6ee3 --- /dev/null +++ b/backend/alembic/versions/f32615f71aeb_add_custom_headers_to_tools.py @@ -0,0 +1,26 @@ +"""add custom headers to tools + +Revision ID: f32615f71aeb +Revises: bd2921608c3a +Create Date: 2024-09-12 20:26:38.932377 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = "f32615f71aeb" +down_revision = "bd2921608c3a" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + "tool", sa.Column("custom_headers", postgresql.JSONB(), nullable=True) + ) + + +def downgrade() -> None: + op.drop_column("tool", "custom_headers") diff --git a/backend/alembic/versions/f7e58d357687_add_has_web_column_to_user.py b/backend/alembic/versions/f7e58d357687_add_has_web_column_to_user.py index 2d8e7402e48..c2a131d6002 100644 --- a/backend/alembic/versions/f7e58d357687_add_has_web_column_to_user.py +++ b/backend/alembic/versions/f7e58d357687_add_has_web_column_to_user.py @@ -1,7 +1,7 @@ """add has_web_login column to user Revision ID: f7e58d357687 -Revises: bceb1e139447 +Revises: ba98eba0f66a Create Date: 2024-09-07 20:20:54.522620 """ diff --git a/backend/alembic_tenants/README.md b/backend/alembic_tenants/README.md new file mode 100644 index 00000000000..f075b958305 --- /dev/null +++ b/backend/alembic_tenants/README.md @@ -0,0 +1,3 @@ +These files are for public table migrations when operating with multi tenancy. + +If you are not a Danswer developer, you can ignore this directory entirely. \ No newline at end of file diff --git a/backend/alembic_tenants/env.py b/backend/alembic_tenants/env.py new file mode 100644 index 00000000000..f0f1178ce09 --- /dev/null +++ b/backend/alembic_tenants/env.py @@ -0,0 +1,111 @@ +import asyncio +from logging.config import fileConfig + +from sqlalchemy import pool +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy.schema import SchemaItem + +from alembic import context +from danswer.db.engine import build_connection_string +from danswer.db.models import PublicBase + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +if config.config_file_name is not None and config.attributes.get( + "configure_logger", True +): + fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +# from myapp import mymodel +# target_metadata = mymodel.Base.metadata +target_metadata = [PublicBase.metadata] + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + +EXCLUDE_TABLES = {"kombu_queue", "kombu_message"} + + +def include_object( + object: SchemaItem, + name: str, + type_: str, + reflected: bool, + compare_to: SchemaItem | None, +) -> bool: + if type_ == "table" and name in EXCLUDE_TABLES: + return False + return True + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. 
+ + """ + url = build_connection_string() + context.configure( + url=url, + target_metadata=target_metadata, # type: ignore + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection: Connection) -> None: + context.configure( + connection=connection, + target_metadata=target_metadata, # type: ignore + include_object=include_object, + ) # type: ignore + + with context.begin_transaction(): + context.run_migrations() + + +async def run_async_migrations() -> None: + """In this scenario we need to create an Engine + and associate a connection with the context. + + """ + + connectable = create_async_engine( + build_connection_string(), + poolclass=pool.NullPool, + ) + + async with connectable.connect() as connection: + await connection.run_sync(do_run_migrations) + + await connectable.dispose() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode.""" + + asyncio.run(run_async_migrations()) + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/backend/alembic_tenants/script.py.mako b/backend/alembic_tenants/script.py.mako new file mode 100644 index 00000000000..55df2863d20 --- /dev/null +++ b/backend/alembic_tenants/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/backend/alembic_tenants/versions/14a83a331951_create_usertenantmapping_table.py b/backend/alembic_tenants/versions/14a83a331951_create_usertenantmapping_table.py new file mode 100644 index 00000000000..f8f3016bab1 --- /dev/null +++ b/backend/alembic_tenants/versions/14a83a331951_create_usertenantmapping_table.py @@ -0,0 +1,24 @@ +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "14a83a331951" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "user_tenant_mapping", + sa.Column("email", sa.String(), nullable=False), + sa.Column("tenant_id", sa.String(), nullable=False), + sa.UniqueConstraint("email", "tenant_id", name="uq_user_tenant"), + sa.UniqueConstraint("email", name="uq_email"), + schema="public", + ) + + +def downgrade() -> None: + op.drop_table("user_tenant_mapping", schema="public") diff --git a/backend/danswer/__init__.py b/backend/danswer/__init__.py index e2d480be4e6..ea33fd83292 100644 --- a/backend/danswer/__init__.py +++ b/backend/danswer/__init__.py @@ -1,3 +1,3 @@ import os -__version__ = os.environ.get("DANSWER_VERSION", "") or "0.3-dev" +__version__ = os.environ.get("DANSWER_VERSION", "") or "Development" diff --git a/backend/danswer/access/access.py b/backend/danswer/access/access.py index 9088ddf8425..7c879099594 100644 --- a/backend/danswer/access/access.py +++ b/backend/danswer/access/access.py @@ -1,7 +1,7 @@ from sqlalchemy.orm import Session from danswer.access.models import DocumentAccess -from danswer.access.utils import prefix_user +from danswer.access.utils import prefix_user_email from danswer.configs.constants import PUBLIC_DOC_PAT from danswer.db.document import get_access_info_for_document from danswer.db.document import get_access_info_for_documents @@ -18,10 +18,13 @@ def _get_access_for_document( document_id=document_id, ) - if not info: - return DocumentAccess.build(user_ids=[], user_groups=[], is_public=False) - - return DocumentAccess.build(user_ids=info[1], user_groups=[], is_public=info[2]) + return DocumentAccess.build( + user_emails=info[1] if info and info[1] else [], + user_groups=[], + external_user_emails=[], + external_user_group_ids=[], + is_public=info[2] if info else False, + ) def get_access_for_document( @@ -34,6 +37,16 @@ def get_access_for_document( return versioned_get_access_for_document_fn(document_id, db_session) # type: ignore +def get_null_document_access() -> DocumentAccess: + return DocumentAccess( + user_emails=set(), + user_groups=set(), + is_public=False, + external_user_emails=set(), + external_user_group_ids=set(), + ) + + def _get_access_for_documents( document_ids: list[str], db_session: Session, @@ -42,13 +55,27 @@ def _get_access_for_documents( db_session=db_session, document_ids=document_ids, ) - return { - document_id: DocumentAccess.build( - user_ids=user_ids, user_groups=[], is_public=is_public + doc_access = { + document_id: DocumentAccess( + user_emails=set([email for email in user_emails if email]), + # MIT version will wipe all groups and external groups on update + user_groups=set(), + is_public=is_public, + external_user_emails=set(), + external_user_group_ids=set(), ) - for document_id, user_ids, is_public in document_access_info + for document_id, user_emails, is_public in document_access_info } + # Sometimes the document has not be indexed by the indexing job yet, in those cases + # the document does not exist and so we use least permissive. Specifically the EE version + # checks the MIT version permissions and creates a superset. This ensures that this flow + # does not fail even if the Document has not yet been indexed. 
+ for doc_id in document_ids: + if doc_id not in doc_access: + doc_access[doc_id] = get_null_document_access() + return doc_access + def get_access_for_documents( document_ids: list[str], @@ -70,7 +97,7 @@ def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]: matches one entry in the returned set. """ if user: - return {prefix_user(str(user.id)), PUBLIC_DOC_PAT} + return {prefix_user_email(user.email), PUBLIC_DOC_PAT} return {PUBLIC_DOC_PAT} diff --git a/backend/danswer/access/models.py b/backend/danswer/access/models.py index a87e2d94f25..126648eb41e 100644 --- a/backend/danswer/access/models.py +++ b/backend/danswer/access/models.py @@ -1,30 +1,116 @@ from dataclasses import dataclass -from uuid import UUID -from danswer.access.utils import prefix_user +from danswer.access.utils import prefix_external_group +from danswer.access.utils import prefix_user_email from danswer.access.utils import prefix_user_group from danswer.configs.constants import PUBLIC_DOC_PAT @dataclass(frozen=True) -class DocumentAccess: - user_ids: set[str] # stringified UUIDs - user_groups: set[str] # names of user groups associated with this document +class ExternalAccess: + # Emails of external users with access to the doc externally + external_user_emails: set[str] + # Names or external IDs of groups with access to the doc + external_user_group_ids: set[str] + # Whether the document is public in the external system or Danswer is_public: bool - def to_acl(self) -> list[str]: - return ( - [prefix_user(user_id) for user_id in self.user_ids] + +@dataclass(frozen=True) +class DocExternalAccess: + external_access: ExternalAccess + # The document ID + doc_id: str + + def to_dict(self) -> dict: + return { + "external_access": { + "external_user_emails": list(self.external_access.external_user_emails), + "external_user_group_ids": list( + self.external_access.external_user_group_ids + ), + "is_public": self.external_access.is_public, + }, + "doc_id": self.doc_id, + } + + @classmethod + def from_dict(cls, data: dict) -> "DocExternalAccess": + external_access = ExternalAccess( + external_user_emails=set( + data["external_access"].get("external_user_emails", []) + ), + external_user_group_ids=set( + data["external_access"].get("external_user_group_ids", []) + ), + is_public=data["external_access"]["is_public"], + ) + return cls( + external_access=external_access, + doc_id=data["doc_id"], + ) + + +@dataclass(frozen=True) +class DocumentAccess(ExternalAccess): + # User emails for Danswer users, None indicates admin + user_emails: set[str | None] + # Names of user groups associated with this document + user_groups: set[str] + + def to_acl(self) -> set[str]: + return set( + [ + prefix_user_email(user_email) + for user_email in self.user_emails + if user_email + ] + [prefix_user_group(group_name) for group_name in self.user_groups] + + [ + prefix_user_email(user_email) + for user_email in self.external_user_emails + ] + + [ + # The group names are already prefixed by the source type + # This adds an additional prefix of "external_group:" + prefix_external_group(group_name) + for group_name in self.external_user_group_ids + ] + ([PUBLIC_DOC_PAT] if self.is_public else []) ) @classmethod def build( - cls, user_ids: list[UUID | None], user_groups: list[str], is_public: bool + cls, + user_emails: list[str | None], + user_groups: list[str], + external_user_emails: list[str], + external_user_group_ids: list[str], + is_public: bool, ) -> "DocumentAccess": return cls( - user_ids={str(user_id) for user_id in user_ids 
if user_id}, + external_user_emails={ + prefix_user_email(external_email) + for external_email in external_user_emails + }, + external_user_group_ids={ + prefix_external_group(external_group_id) + for external_group_id in external_user_group_ids + }, + user_emails={ + prefix_user_email(user_email) + for user_email in user_emails + if user_email + }, user_groups=set(user_groups), is_public=is_public, ) + + +default_public_access = DocumentAccess( + external_user_emails=set(), + external_user_group_ids=set(), + user_emails=set(), + user_groups=set(), + is_public=True, +) diff --git a/backend/danswer/access/utils.py b/backend/danswer/access/utils.py index 060560eaedc..82abf9785f8 100644 --- a/backend/danswer/access/utils.py +++ b/backend/danswer/access/utils.py @@ -1,10 +1,24 @@ -def prefix_user(user_id: str) -> str: - """Prefixes a user ID to eliminate collision with group names. - This assumes that groups are prefixed with a different prefix.""" - return f"user_id:{user_id}" +from danswer.configs.constants import DocumentSource + + +def prefix_user_email(user_email: str) -> str: + """Prefixes a user email to eliminate collision with group names. + This applies to both a Danswer user and an External user, this is to make the query time + more efficient""" + return f"user_email:{user_email}" def prefix_user_group(user_group_name: str) -> str: - """Prefixes a user group name to eliminate collision with user IDs. + """Prefixes a user group name to eliminate collision with user emails. This assumes that user ids are prefixed with a different prefix.""" return f"group:{user_group_name}" + + +def prefix_external_group(ext_group_name: str) -> str: + """Prefixes an external group name to eliminate collision with user emails / Danswer groups.""" + return f"external_group:{ext_group_name}" + + +def prefix_group_w_source(ext_group_name: str, source: DocumentSource) -> str: + """External groups may collide across sources, every source needs its own prefix.""" + return f"{source.value.upper()}_{ext_group_name}" diff --git a/backend/danswer/auth/api_key.py b/backend/danswer/auth/api_key.py new file mode 100644 index 00000000000..aef557960f6 --- /dev/null +++ b/backend/danswer/auth/api_key.py @@ -0,0 +1,89 @@ +import secrets +import uuid +from urllib.parse import quote +from urllib.parse import unquote + +from fastapi import Request +from passlib.hash import sha256_crypt +from pydantic import BaseModel + +from danswer.auth.schemas import UserRole +from danswer.configs.app_configs import API_KEY_HASH_ROUNDS + + +_API_KEY_HEADER_NAME = "Authorization" +# NOTE for others who are curious: In the context of a header, "X-" often refers +# to non-standard, experimental, or custom headers in HTTP or other protocols. It +# indicates that the header is not part of the official standards defined by +# organizations like the Internet Engineering Task Force (IETF). 
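# [Editor's aside -- illustrative sketch, not part of the diff.] The key format this new
# api_key module works with is "dn_" + URL-encoded tenant id + "." + urlsafe random token,
# with the tenant segment omitted for legacy single-tenant keys. A simplified, stdlib-only
# round trip of that format (the function names and short token length are assumptions for
# the example, not the module's real constants):
import secrets
from urllib.parse import quote, unquote

def make_key(tenant_id: str | None = None, token_len: int = 32) -> str:
    token = secrets.token_urlsafe(token_len)
    if not tenant_id:
        return "dn_" + token                      # legacy key, no tenant segment
    return f"dn_{quote(tenant_id)}.{token}"       # tenant-scoped key

def read_tenant(key: str) -> str | None:
    # Recover the tenant id from a tenant-scoped key; legacy keys yield None.
    if not key.startswith("dn_"):
        return None
    tenant_part, sep, _token = key[len("dn_"):].partition(".")
    return unquote(tenant_part) if sep else None

# e.g. read_tenant(make_key("acme corp")) == "acme corp"; read_tenant(make_key()) is None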
+_API_KEY_HEADER_ALTERNATIVE_NAME = "X-Danswer-Authorization" +_BEARER_PREFIX = "Bearer " +_API_KEY_PREFIX = "dn_" +_API_KEY_LEN = 192 + + +class ApiKeyDescriptor(BaseModel): + api_key_id: int + api_key_display: str + api_key: str | None = None # only present on initial creation + api_key_name: str | None = None + api_key_role: UserRole + + user_id: uuid.UUID + + +def generate_api_key(tenant_id: str | None = None) -> str: + # For backwards compatibility, if no tenant_id, generate old style key + if not tenant_id: + return _API_KEY_PREFIX + secrets.token_urlsafe(_API_KEY_LEN) + + encoded_tenant = quote(tenant_id) # URL encode the tenant ID + return f"{_API_KEY_PREFIX}{encoded_tenant}.{secrets.token_urlsafe(_API_KEY_LEN)}" + + +def extract_tenant_from_api_key_header(request: Request) -> str | None: + """Extract tenant ID from request. Returns None if auth is disabled or invalid format.""" + raw_api_key_header = request.headers.get( + _API_KEY_HEADER_ALTERNATIVE_NAME + ) or request.headers.get(_API_KEY_HEADER_NAME) + + if not raw_api_key_header or not raw_api_key_header.startswith(_BEARER_PREFIX): + return None + + api_key = raw_api_key_header[len(_BEARER_PREFIX) :].strip() + + if not api_key.startswith(_API_KEY_PREFIX): + return None + + parts = api_key[len(_API_KEY_PREFIX) :].split(".", 1) + if len(parts) != 2: + return None + + tenant_id = parts[0] + return unquote(tenant_id) if tenant_id else None + + +def hash_api_key(api_key: str) -> str: + # NOTE: no salt is needed, as the API key is randomly generated + # and overlaps are impossible + return sha256_crypt.hash(api_key, salt="", rounds=API_KEY_HASH_ROUNDS) + + +def build_displayable_api_key(api_key: str) -> str: + if api_key.startswith(_API_KEY_PREFIX): + api_key = api_key[len(_API_KEY_PREFIX) :] + + return _API_KEY_PREFIX + api_key[:4] + "********" + api_key[-4:] + + +def get_hashed_api_key_from_request(request: Request) -> str | None: + raw_api_key_header = request.headers.get( + _API_KEY_HEADER_ALTERNATIVE_NAME + ) or request.headers.get(_API_KEY_HEADER_NAME) + if raw_api_key_header is None: + return None + + if raw_api_key_header.startswith(_BEARER_PREFIX): + raw_api_key_header = raw_api_key_header[len(_BEARER_PREFIX) :].strip() + + return hash_api_key(raw_api_key_header) diff --git a/backend/danswer/auth/invited_users.py b/backend/danswer/auth/invited_users.py index efce858f265..fb30332afd9 100644 --- a/backend/danswer/auth/invited_users.py +++ b/backend/danswer/auth/invited_users.py @@ -1,20 +1,21 @@ from typing import cast from danswer.configs.constants import KV_USER_STORE_KEY -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError -from danswer.dynamic_configs.interface import JSON_ro +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.utils.special_types import JSON_ro def get_invited_users() -> list[str]: try: - store = get_dynamic_config_store() + store = get_kv_store() + return cast(list, store.load(KV_USER_STORE_KEY)) - except ConfigNotFoundError: + except KvKeyNotFoundError: return list() def write_invited_users(emails: list[str]) -> int: - store = get_dynamic_config_store() + store = get_kv_store() store.store(KV_USER_STORE_KEY, cast(JSON_ro, emails)) return len(emails) diff --git a/backend/danswer/auth/noauth_user.py b/backend/danswer/auth/noauth_user.py index 9520ef41c23..9eb589dbb25 100644 --- a/backend/danswer/auth/noauth_user.py +++ 
b/backend/danswer/auth/noauth_user.py @@ -4,29 +4,29 @@ from danswer.auth.schemas import UserRole from danswer.configs.constants import KV_NO_AUTH_USER_PREFERENCES_KEY -from danswer.dynamic_configs.store import ConfigNotFoundError -from danswer.dynamic_configs.store import DynamicConfigStore +from danswer.key_value_store.store import KeyValueStore +from danswer.key_value_store.store import KvKeyNotFoundError from danswer.server.manage.models import UserInfo from danswer.server.manage.models import UserPreferences def set_no_auth_user_preferences( - store: DynamicConfigStore, preferences: UserPreferences + store: KeyValueStore, preferences: UserPreferences ) -> None: store.store(KV_NO_AUTH_USER_PREFERENCES_KEY, preferences.model_dump()) -def load_no_auth_user_preferences(store: DynamicConfigStore) -> UserPreferences: +def load_no_auth_user_preferences(store: KeyValueStore) -> UserPreferences: try: preferences_data = cast( Mapping[str, Any], store.load(KV_NO_AUTH_USER_PREFERENCES_KEY) ) return UserPreferences(**preferences_data) - except ConfigNotFoundError: + except KvKeyNotFoundError: return UserPreferences(chosen_assistants=None, default_model=None) -def fetch_no_auth_user(store: DynamicConfigStore) -> UserInfo: +def fetch_no_auth_user(store: KeyValueStore) -> UserInfo: return UserInfo( id="__no_auth_user__", email="anonymous@danswer.ai", diff --git a/backend/danswer/auth/schemas.py b/backend/danswer/auth/schemas.py index db8a97ceb04..e943504c584 100644 --- a/backend/danswer/auth/schemas.py +++ b/backend/danswer/auth/schemas.py @@ -13,12 +13,24 @@ class UserRole(str, Enum): groups they are curators of - Global Curator can perform admin actions for all groups they are a member of + - Limited can access a limited set of basic api endpoints + - Slack are users that have used danswer via slack but dont have a web login + - External permissioned users that have been picked up during the external permissions sync process but don't have a web login """ + LIMITED = "limited" BASIC = "basic" ADMIN = "admin" CURATOR = "curator" GLOBAL_CURATOR = "global_curator" + SLACK_USER = "slack_user" + EXT_PERM_USER = "ext_perm_user" + + def is_web_login(self) -> bool: + return self not in [ + UserRole.SLACK_USER, + UserRole.EXT_PERM_USER, + ] class UserStatus(str, Enum): @@ -33,9 +45,8 @@ class UserRead(schemas.BaseUser[uuid.UUID]): class UserCreate(schemas.BaseUserCreate): role: UserRole = UserRole.BASIC - has_web_login: bool | None = True + tenant_id: str | None = None class UserUpdate(schemas.BaseUserUpdate): role: UserRole - has_web_login: bool | None = True diff --git a/backend/danswer/auth/users.py b/backend/danswer/auth/users.py index ac02d125850..cf3de018f4b 100644 --- a/backend/danswer/auth/users.py +++ b/backend/danswer/auth/users.py @@ -5,17 +5,23 @@ from datetime import timezone from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText +from typing import Dict +from typing import List from typing import Optional from typing import Tuple +import jwt from email_validator import EmailNotValidError +from email_validator import EmailUndeliverableError from email_validator import validate_email from fastapi import APIRouter from fastapi import Depends from fastapi import HTTPException +from fastapi import Query from fastapi import Request from fastapi import Response from fastapi import status +from fastapi.responses import RedirectResponse from fastapi.security import OAuth2PasswordRequestForm from fastapi_users import BaseUserManager from fastapi_users import exceptions @@ 
-25,19 +31,34 @@ from fastapi_users import UUIDIDMixin from fastapi_users.authentication import AuthenticationBackend from fastapi_users.authentication import CookieTransport +from fastapi_users.authentication import JWTStrategy from fastapi_users.authentication import Strategy from fastapi_users.authentication.strategy.db import AccessTokenDatabase from fastapi_users.authentication.strategy.db import DatabaseStrategy +from fastapi_users.exceptions import UserAlreadyExists +from fastapi_users.jwt import decode_jwt +from fastapi_users.jwt import generate_jwt +from fastapi_users.jwt import SecretType +from fastapi_users.manager import UserManagerDependency from fastapi_users.openapi import OpenAPIResponseType +from fastapi_users.router.common import ErrorCode +from fastapi_users.router.common import ErrorModel from fastapi_users_db_sqlalchemy import SQLAlchemyUserDatabase -from sqlalchemy.orm import Session - +from httpx_oauth.integrations.fastapi import OAuth2AuthorizeCallback +from httpx_oauth.oauth2 import BaseOAuth2 +from httpx_oauth.oauth2 import OAuth2Token +from pydantic import BaseModel +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncSession + +from danswer.auth.api_key import get_hashed_api_key_from_request from danswer.auth.invited_users import get_invited_users from danswer.auth.schemas import UserCreate from danswer.auth.schemas import UserRole from danswer.auth.schemas import UserUpdate from danswer.configs.app_configs import AUTH_TYPE from danswer.configs.app_configs import DISABLE_AUTH +from danswer.configs.app_configs import DISABLE_VERIFICATION from danswer.configs.app_configs import EMAIL_FROM from danswer.configs.app_configs import REQUIRE_EMAIL_VERIFICATION from danswer.configs.app_configs import SESSION_EXPIRE_TIME_SECONDS @@ -53,23 +74,36 @@ from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN from danswer.configs.constants import DANSWER_API_KEY_PREFIX from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER +from danswer.db.api_key import fetch_user_for_api_key from danswer.db.auth import get_access_token_db from danswer.db.auth import get_default_admin_user_emails from danswer.db.auth import get_user_count from danswer.db.auth import get_user_db -from danswer.db.engine import get_session -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.auth import SQLAlchemyUserAdminDB +from danswer.db.engine import get_async_session +from danswer.db.engine import get_async_session_with_tenant +from danswer.db.engine import get_session_with_tenant from danswer.db.models import AccessToken +from danswer.db.models import OAuthAccount from danswer.db.models import User from danswer.db.users import get_user_by_email from danswer.utils.logger import setup_logger from danswer.utils.telemetry import optional_telemetry from danswer.utils.telemetry import RecordType +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop from danswer.utils.variable_functionality import fetch_versioned_implementation +from shared_configs.configs import async_return_default_schema +from shared_configs.configs import MULTI_TENANT +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR logger = setup_logger() +class BasicAuthenticationError(HTTPException): + def __init__(self, detail: str): + super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail) + + def is_user_admin(user: User | None) -> bool: if AUTH_TYPE == AuthType.DISABLED: return True @@ -104,7 +138,9 @@ def get_display_email(email: 
str | None, space_less: bool = False) -> str: def user_needs_to_be_verified() -> bool: # all other auth types besides basic should require users to be # verified - return AUTH_TYPE != AuthType.BASIC or REQUIRE_EMAIL_VERIFICATION + return not DISABLE_VERIFICATION and ( + AUTH_TYPE != AuthType.BASIC or REQUIRE_EMAIL_VERIFICATION + ) def verify_email_is_invited(email: str) -> None: @@ -115,7 +151,10 @@ def verify_email_is_invited(email: str) -> None: if not email: raise PermissionError("Email must be specified") - email_info = validate_email(email) # can raise EmailNotValidError + try: + email_info = validate_email(email) + except EmailUndeliverableError: + raise PermissionError("Email is not valid") for email_whitelist in whitelist: try: @@ -133,8 +172,8 @@ def verify_email_is_invited(email: str) -> None: raise PermissionError("User not on allowed user whitelist") -def verify_email_in_whitelist(email: str) -> None: - with Session(get_sqlalchemy_engine()) as db_session: +def verify_email_in_whitelist(email: str, tenant_id: str | None = None) -> None: + with get_session_with_tenant(tenant_id) as db_session: if not get_user_by_email(email, db_session): verify_email_is_invited(email) @@ -182,44 +221,71 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]): reset_password_token_secret = USER_AUTH_SECRET verification_token_secret = USER_AUTH_SECRET + user_db: SQLAlchemyUserDatabase[User, uuid.UUID] + async def create( self, user_create: schemas.UC | UserCreate, safe: bool = False, request: Optional[Request] = None, ) -> User: - verify_email_is_invited(user_create.email) - verify_email_domain(user_create.email) - if hasattr(user_create, "role"): - user_count = await get_user_count() - if user_count == 0 or user_create.email in get_default_admin_user_emails(): - user_create.role = UserRole.ADMIN - else: - user_create.role = UserRole.BASIC - user = None - try: - user = await super().create(user_create, safe=safe, request=request) # type: ignore - except exceptions.UserAlreadyExists: - user = await self.get_by_email(user_create.email) - # Handle case where user has used product outside of web and is now creating an account through web - if ( - not user.has_web_login - and hasattr(user_create, "has_web_login") - and user_create.has_web_login - ): - user_update = UserUpdate( - password=user_create.password, - has_web_login=True, - role=user_create.role, - is_verified=user_create.is_verified, + referral_source = None + if request is not None: + referral_source = request.cookies.get("referral_source", None) + + tenant_id = await fetch_ee_implementation_or_noop( + "danswer.server.tenants.provisioning", + "get_or_create_tenant_id", + async_return_default_schema, + )( + email=user_create.email, + referral_source=referral_source, + ) + + async with get_async_session_with_tenant(tenant_id) as db_session: + token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + verify_email_is_invited(user_create.email) + verify_email_domain(user_create.email) + if MULTI_TENANT: + tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID]( + db_session, User, OAuthAccount ) - user = await self.update(user_update, user) - else: - raise exceptions.UserAlreadyExists() - return user + self.user_db = tenant_user_db + self.database = tenant_user_db + + if hasattr(user_create, "role"): + user_count = await get_user_count() + if ( + user_count == 0 + or user_create.email in get_default_admin_user_emails() + ): + user_create.role = UserRole.ADMIN + else: + user_create.role = UserRole.BASIC + + try: + user = await 
super().create(user_create, safe=safe, request=request) # type: ignore + except exceptions.UserAlreadyExists: + user = await self.get_by_email(user_create.email) + # Handle case where user has used product outside of web and is now creating an account through web + if not user.role.is_web_login() and user_create.role.is_web_login(): + user_update = UserUpdate( + password=user_create.password, + role=user_create.role, + is_verified=user_create.is_verified, + ) + user = await self.update(user_update, user) + else: + raise exceptions.UserAlreadyExists() + + finally: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + + return user async def oauth_callback( - self: "BaseUserManager[models.UOAP, models.ID]", + self, oauth_name: str, access_token: str, account_id: str, @@ -230,46 +296,128 @@ async def oauth_callback( *, associate_by_email: bool = False, is_verified_by_default: bool = False, - ) -> models.UOAP: - verify_email_in_whitelist(account_email) - verify_email_domain(account_email) - - user = await super().oauth_callback( # type: ignore - oauth_name=oauth_name, - access_token=access_token, - account_id=account_id, - account_email=account_email, - expires_at=expires_at, - refresh_token=refresh_token, - request=request, - associate_by_email=associate_by_email, - is_verified_by_default=is_verified_by_default, + ) -> User: + referral_source = None + if request: + referral_source = getattr(request.state, "referral_source", None) + + tenant_id = await fetch_ee_implementation_or_noop( + "danswer.server.tenants.provisioning", + "get_or_create_tenant_id", + async_return_default_schema, + )( + email=account_email, + referral_source=referral_source, ) - # NOTE: Most IdPs have very short expiry times, and we don't want to force the user to - # re-authenticate that frequently, so by default this is disabled - if expires_at and TRACK_EXTERNAL_IDP_EXPIRY: - oidc_expiry = datetime.fromtimestamp(expires_at, tz=timezone.utc) - await self.user_db.update(user, update_dict={"oidc_expiry": oidc_expiry}) - - # this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false` - # otherwise, the oidc expiry will always be old, and the user will never be able to login - if user.oidc_expiry and not TRACK_EXTERNAL_IDP_EXPIRY: - await self.user_db.update(user, update_dict={"oidc_expiry": None}) - - # Handle case where user has used product outside of web and is now creating an account through web - if not user.has_web_login: - await self.user_db.update( - user, - update_dict={ - "is_verified": is_verified_by_default, - "has_web_login": True, - }, - ) - user.is_verified = is_verified_by_default - user.has_web_login = True + if not tenant_id: + raise HTTPException(status_code=401, detail="User not found") + + # Proceed with the tenant context + token = None + async with get_async_session_with_tenant(tenant_id) as db_session: + token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + verify_email_in_whitelist(account_email, tenant_id) + verify_email_domain(account_email) + + if MULTI_TENANT: + tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID]( + db_session, User, OAuthAccount + ) + self.user_db = tenant_user_db + self.database = tenant_user_db + + oauth_account_dict = { + "oauth_name": oauth_name, + "access_token": access_token, + "account_id": account_id, + "account_email": account_email, + "expires_at": expires_at, + "refresh_token": refresh_token, + } + + try: + # Attempt to get user by OAuth account + user = await self.get_by_oauth_account(oauth_name, account_id) + + except 
exceptions.UserNotExists: + try: + # Attempt to get user by email + user = await self.get_by_email(account_email) + if not associate_by_email: + raise exceptions.UserAlreadyExists() + + user = await self.user_db.add_oauth_account( + user, oauth_account_dict + ) + + # If user not found by OAuth account or email, create a new user + except exceptions.UserNotExists: + password = self.password_helper.generate() + user_dict = { + "email": account_email, + "hashed_password": self.password_helper.hash(password), + "is_verified": is_verified_by_default, + } + + user = await self.user_db.create(user_dict) + + # Explicitly set the Postgres schema for this session to ensure + # OAuth account creation happens in the correct tenant schema + await db_session.execute(text(f'SET search_path = "{tenant_id}"')) + + # Add OAuth account + await self.user_db.add_oauth_account(user, oauth_account_dict) + await self.on_after_register(user, request) + + else: + for existing_oauth_account in user.oauth_accounts: + if ( + existing_oauth_account.account_id == account_id + and existing_oauth_account.oauth_name == oauth_name + ): + user = await self.user_db.update_oauth_account( + user, + # NOTE: OAuthAccount DOES implement the OAuthAccountProtocol + # but the type checker doesn't know that :( + existing_oauth_account, # type: ignore + oauth_account_dict, + ) + + # NOTE: Most IdPs have very short expiry times, and we don't want to force the user to + # re-authenticate that frequently, so by default this is disabled + + if expires_at and TRACK_EXTERNAL_IDP_EXPIRY: + oidc_expiry = datetime.fromtimestamp(expires_at, tz=timezone.utc) + await self.user_db.update( + user, update_dict={"oidc_expiry": oidc_expiry} + ) + + # Handle case where user has used product outside of web and is now creating an account through web + if not user.role.is_web_login(): + await self.user_db.update( + user, + { + "is_verified": is_verified_by_default, + "role": UserRole.BASIC, + }, + ) + user.is_verified = is_verified_by_default + + # this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false` + # otherwise, the oidc expiry will always be old, and the user will never be able to login + if ( + user.oidc_expiry is not None # type: ignore + and not TRACK_EXTERNAL_IDP_EXPIRY + ): + await self.user_db.update(user, {"oidc_expiry": None}) + user.oidc_expiry = None # type: ignore + + if token: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) - return user + return user async def on_after_register( self, user: User, request: Optional[Request] = None @@ -300,28 +448,53 @@ async def on_after_request_verify( async def authenticate( self, credentials: OAuth2PasswordRequestForm ) -> Optional[User]: - try: - user = await self.get_by_email(credentials.username) - except exceptions.UserNotExists: + email = credentials.username + + # Get tenant_id from mapping table + tenant_id = await fetch_ee_implementation_or_noop( + "danswer.server.tenants.provisioning", + "get_or_create_tenant_id", + async_return_default_schema, + )( + email=email, + ) + if not tenant_id: + # User not found in mapping self.password_helper.hash(credentials.password) return None - if not user.has_web_login: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, - detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD", + # Create a tenant-specific session + async with get_async_session_with_tenant(tenant_id) as tenant_session: + tenant_user_db: SQLAlchemyUserDatabase = SQLAlchemyUserDatabase( + tenant_session, User ) + self.user_db = tenant_user_db - verified, 
updated_password_hash = self.password_helper.verify_and_update( - credentials.password, user.hashed_password - ) - if not verified: - return None + # Proceed with authentication + try: + user = await self.get_by_email(email) + + except exceptions.UserNotExists: + self.password_helper.hash(credentials.password) + return None + + if not user.role.is_web_login(): + raise BasicAuthenticationError( + detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD", + ) - if updated_password_hash is not None: - await self.user_db.update(user, {"hashed_password": updated_password_hash}) + verified, updated_password_hash = self.password_helper.verify_and_update( + credentials.password, user.hashed_password + ) + if not verified: + return None + + if updated_password_hash is not None: + await self.user_db.update( + user, {"hashed_password": updated_password_hash} + ) - return user + return user async def get_user_manager( @@ -336,21 +509,51 @@ async def get_user_manager( ) +# This strategy is used to add tenant_id to the JWT token +class TenantAwareJWTStrategy(JWTStrategy): + async def _create_token_data(self, user: User, impersonate: bool = False) -> dict: + tenant_id = await fetch_ee_implementation_or_noop( + "danswer.server.tenants.provisioning", + "get_or_create_tenant_id", + async_return_default_schema, + )( + email=user.email, + ) + + data = { + "sub": str(user.id), + "aud": self.token_audience, + "tenant_id": tenant_id, + } + return data + + async def write_token(self, user: User) -> str: + data = await self._create_token_data(user) + return generate_jwt( + data, self.encode_key, self.lifetime_seconds, algorithm=self.algorithm + ) + + +def get_jwt_strategy() -> TenantAwareJWTStrategy: + return TenantAwareJWTStrategy( + secret=USER_AUTH_SECRET, + lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS, + ) + + def get_database_strategy( access_token_db: AccessTokenDatabase[AccessToken] = Depends(get_access_token_db), ) -> DatabaseStrategy: - strategy = DatabaseStrategy( + return DatabaseStrategy( access_token_db, lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS # type: ignore ) - return strategy - auth_backend = AuthenticationBackend( - name="database", + name="jwt" if MULTI_TENANT else "database", transport=cookie_transport, - get_strategy=get_database_strategy, -) + get_strategy=get_jwt_strategy if MULTI_TENANT else get_database_strategy, # type: ignore +) # type: ignore class FastAPIUserWithLogoutRouter(FastAPIUsers[models.UP, models.ID]): @@ -364,9 +567,11 @@ def get_logout_router( This way the login router does not need to be included """ router = APIRouter() + get_current_user_token = self.authenticator.current_user_token( active=True, verified=requires_verification ) + logout_responses: OpenAPIResponseType = { **{ status.HTTP_401_UNAUTHORIZED: { @@ -404,7 +609,7 @@ async def logout( async def optional_user_( request: Request, user: User | None, - db_session: Session, + async_db_session: AsyncSession, ) -> User | None: """NOTE: `request` and `db_session` are not used here, but are included for the EE version of this function.""" @@ -413,13 +618,21 @@ async def optional_user_( async def optional_user( request: Request, + async_db_session: AsyncSession = Depends(get_async_session), user: User | None = Depends(optional_fastapi_current_user), - db_session: Session = Depends(get_session), ) -> User | None: versioned_fetch_user = fetch_versioned_implementation( "danswer.auth.users", "optional_user_" ) - return await versioned_fetch_user(request, user, db_session) + user = await versioned_fetch_user(request, user, 
async_db_session) + + # check if an API key is present + if user is None: + hashed_api_key = get_hashed_api_key_from_request(request) + if hashed_api_key: + user = await fetch_user_for_api_key(hashed_api_key, async_db_session) + + return user async def double_check_user( @@ -431,14 +644,12 @@ async def double_check_user( return None if user is None: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. User is not authenticated.", ) if user_needs_to_be_verified() and not user.is_verified: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. User is not verified.", ) @@ -447,8 +658,7 @@ async def double_check_user( and user.oidc_expiry < datetime.now(timezone.utc) and not include_expired ): - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. User's OIDC token has expired.", ) @@ -461,12 +671,26 @@ async def current_user_with_expired_token( return await double_check_user(user, include_expired=True) -async def current_user( +async def current_limited_user( user: User | None = Depends(optional_user), ) -> User | None: return await double_check_user(user) +async def current_user( + user: User | None = Depends(optional_user), +) -> User | None: + user = await double_check_user(user) + if not user: + return None + + if user.role == UserRole.LIMITED: + raise BasicAuthenticationError( + detail="Access denied. User role is LIMITED. BASIC or higher permissions are required.", + ) + return user + + async def current_curator_or_admin_user( user: User | None = Depends(current_user), ) -> User | None: @@ -474,15 +698,13 @@ async def current_curator_or_admin_user( return None if not user or not hasattr(user, "role"): - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. User is not authenticated or lacks role information.", ) allowed_roles = {UserRole.GLOBAL_CURATOR, UserRole.CURATOR, UserRole.ADMIN} if user.role not in allowed_roles: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. User is not a curator or admin.", ) @@ -494,8 +716,7 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User return None if not user or not hasattr(user, "role") or user.role != UserRole.ADMIN: - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. 
User must be an admin to perform this action.", ) @@ -505,3 +726,212 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User def get_default_admin_user_emails_() -> list[str]: # No default seeding available for Danswer MIT return [] + + +STATE_TOKEN_AUDIENCE = "fastapi-users:oauth-state" + + +class OAuth2AuthorizeResponse(BaseModel): + authorization_url: str + + +def generate_state_token( + data: Dict[str, str], secret: SecretType, lifetime_seconds: int = 3600 +) -> str: + data["aud"] = STATE_TOKEN_AUDIENCE + + return generate_jwt(data, secret, lifetime_seconds) + + +# refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91 +def create_danswer_oauth_router( + oauth_client: BaseOAuth2, + backend: AuthenticationBackend, + state_secret: SecretType, + redirect_url: Optional[str] = None, + associate_by_email: bool = False, + is_verified_by_default: bool = False, +) -> APIRouter: + return get_oauth_router( + oauth_client, + backend, + get_user_manager, + state_secret, + redirect_url, + associate_by_email, + is_verified_by_default, + ) + + +def get_oauth_router( + oauth_client: BaseOAuth2, + backend: AuthenticationBackend, + get_user_manager: UserManagerDependency[models.UP, models.ID], + state_secret: SecretType, + redirect_url: Optional[str] = None, + associate_by_email: bool = False, + is_verified_by_default: bool = False, +) -> APIRouter: + """Generate a router with the OAuth routes.""" + router = APIRouter() + callback_route_name = f"oauth:{oauth_client.name}.{backend.name}.callback" + + if redirect_url is not None: + oauth2_authorize_callback = OAuth2AuthorizeCallback( + oauth_client, + redirect_url=redirect_url, + ) + else: + oauth2_authorize_callback = OAuth2AuthorizeCallback( + oauth_client, + route_name=callback_route_name, + ) + + @router.get( + "/authorize", + name=f"oauth:{oauth_client.name}.{backend.name}.authorize", + response_model=OAuth2AuthorizeResponse, + ) + async def authorize( + request: Request, + scopes: List[str] = Query(None), + ) -> OAuth2AuthorizeResponse: + referral_source = request.cookies.get("referral_source", None) + + if redirect_url is not None: + authorize_redirect_url = redirect_url + else: + authorize_redirect_url = str(request.url_for(callback_route_name)) + + next_url = request.query_params.get("next", "/") + + state_data: Dict[str, str] = { + "next_url": next_url, + "referral_source": referral_source or "default_referral", + } + state = generate_state_token(state_data, state_secret) + authorization_url = await oauth_client.get_authorization_url( + authorize_redirect_url, + state, + scopes, + ) + + return OAuth2AuthorizeResponse(authorization_url=authorization_url) + + @router.get( + "/callback", + name=callback_route_name, + description="The response varies based on the authentication backend used.", + responses={ + status.HTTP_400_BAD_REQUEST: { + "model": ErrorModel, + "content": { + "application/json": { + "examples": { + "INVALID_STATE_TOKEN": { + "summary": "Invalid state token.", + "value": None, + }, + ErrorCode.LOGIN_BAD_CREDENTIALS: { + "summary": "User is inactive.", + "value": {"detail": ErrorCode.LOGIN_BAD_CREDENTIALS}, + }, + } + } + }, + }, + }, + ) + async def callback( + request: Request, + access_token_state: Tuple[OAuth2Token, str] = Depends( + oauth2_authorize_callback + ), + user_manager: BaseUserManager[models.UP, models.ID] = Depends(get_user_manager), + strategy: Strategy[models.UP, models.ID] = Depends(backend.get_strategy), + 
) -> RedirectResponse: + token, state = access_token_state + account_id, account_email = await oauth_client.get_id_email( + token["access_token"] + ) + + if account_email is None: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ErrorCode.OAUTH_NOT_AVAILABLE_EMAIL, + ) + + try: + state_data = decode_jwt(state, state_secret, [STATE_TOKEN_AUDIENCE]) + except jwt.DecodeError: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST) + + next_url = state_data.get("next_url", "/") + referral_source = state_data.get("referral_source", None) + + request.state.referral_source = referral_source + + # Proceed to authenticate or create the user + try: + user = await user_manager.oauth_callback( + oauth_client.name, + token["access_token"], + account_id, + account_email, + token.get("expires_at"), + token.get("refresh_token"), + request, + associate_by_email=associate_by_email, + is_verified_by_default=is_verified_by_default, + ) + except UserAlreadyExists: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ErrorCode.OAUTH_USER_ALREADY_EXISTS, + ) + + if not user.is_active: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ErrorCode.LOGIN_BAD_CREDENTIALS, + ) + + # Login user + response = await backend.login(strategy, user) + await user_manager.on_after_login(user, request, response) + + # Prepare redirect response + redirect_response = RedirectResponse(next_url, status_code=302) + + # Copy headers and other attributes from 'response' to 'redirect_response' + for header_name, header_value in response.headers.items(): + redirect_response.headers[header_name] = header_value + + if hasattr(response, "body"): + redirect_response.body = response.body + if hasattr(response, "status_code"): + redirect_response.status_code = response.status_code + if hasattr(response, "media_type"): + redirect_response.media_type = response.media_type + return redirect_response + + return router + + +async def api_key_dep( + request: Request, async_db_session: AsyncSession = Depends(get_async_session) +) -> User | None: + if AUTH_TYPE == AuthType.DISABLED: + return None + + hashed_api_key = get_hashed_api_key_from_request(request) + if not hashed_api_key: + raise HTTPException(status_code=401, detail="Missing API key") + + if hashed_api_key: + user = await fetch_user_for_api_key(hashed_api_key, async_db_session) + + if user is None: + raise HTTPException(status_code=401, detail="Invalid API key") + + return user diff --git a/backend/danswer/background/celery/apps/app_base.py b/backend/danswer/background/celery/apps/app_base.py new file mode 100644 index 00000000000..d041ce0d2bc --- /dev/null +++ b/backend/danswer/background/celery/apps/app_base.py @@ -0,0 +1,402 @@ +import logging +import multiprocessing +import time +from typing import Any + +import requests +import sentry_sdk +from celery import Task +from celery.app import trace +from celery.exceptions import WorkerShutdown +from celery.states import READY_STATES +from celery.utils.log import get_task_logger +from celery.worker import strategy # type: ignore +from sentry_sdk.integrations.celery import CeleryIntegration +from sqlalchemy import text +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.task_formatters import CeleryTaskColoredFormatter +from danswer.background.celery.apps.task_formatters import CeleryTaskPlainFormatter +from danswer.background.celery.celery_utils import celery_is_worker_primary +from danswer.configs.constants import DanswerRedisLocks +from 
danswer.db.engine import get_sqlalchemy_engine +from danswer.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair +from danswer.redis.redis_connector_delete import RedisConnectorDelete +from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync +from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync +from danswer.redis.redis_connector_prune import RedisConnectorPrune +from danswer.redis.redis_document_set import RedisDocumentSet +from danswer.redis.redis_pool import get_redis_client +from danswer.redis.redis_usergroup import RedisUserGroup +from danswer.utils.logger import ColoredFormatter +from danswer.utils.logger import PlainFormatter +from danswer.utils.logger import setup_logger +from shared_configs.configs import SENTRY_DSN + + +logger = setup_logger() + +task_logger = get_task_logger(__name__) + +if SENTRY_DSN: + sentry_sdk.init( + dsn=SENTRY_DSN, + integrations=[CeleryIntegration()], + traces_sample_rate=0.1, + ) + logger.info("Sentry initialized") +else: + logger.debug("Sentry DSN not provided, skipping Sentry initialization") + + +def on_task_prerun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + **kwds: Any, +) -> None: + pass + + +def on_task_postrun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict[str, Any] | None = None, + retval: Any | None = None, + state: str | None = None, + **kwds: Any, +) -> None: + """We handle this signal in order to remove completed tasks + from their respective tasksets. This allows us to track the progress of document set + and user group syncs. + + This function runs after any task completes (both success and failure) + Note that this signal does not fire on a task that failed to complete and is going + to be retried. 
+ + This also does not fire if a worker with acks_late=False crashes (which all of our + long running workers are) + """ + if not task: + return + + task_logger.debug(f"Task {task.name} (ID: {task_id}) completed with state: {state}") + + if state not in READY_STATES: + return + + if not task_id: + return + + # Get tenant_id directly from kwargs- each celery task has a tenant_id kwarg + if not kwargs: + logger.error(f"Task {task.name} (ID: {task_id}) is missing kwargs") + tenant_id = None + else: + tenant_id = kwargs.get("tenant_id") + + task_logger.debug( + f"Task {task.name} (ID: {task_id}) completed with state: {state} " + f"{f'for tenant_id={tenant_id}' if tenant_id else ''}" + ) + + r = get_redis_client(tenant_id=tenant_id) + + if task_id.startswith(RedisConnectorCredentialPair.PREFIX): + r.srem(RedisConnectorCredentialPair.get_taskset_key(), task_id) + return + + if task_id.startswith(RedisDocumentSet.PREFIX): + document_set_id = RedisDocumentSet.get_id_from_task_id(task_id) + if document_set_id is not None: + rds = RedisDocumentSet(tenant_id, int(document_set_id)) + r.srem(rds.taskset_key, task_id) + return + + if task_id.startswith(RedisUserGroup.PREFIX): + usergroup_id = RedisUserGroup.get_id_from_task_id(task_id) + if usergroup_id is not None: + rug = RedisUserGroup(tenant_id, int(usergroup_id)) + r.srem(rug.taskset_key, task_id) + return + + if task_id.startswith(RedisConnectorDelete.PREFIX): + cc_pair_id = RedisConnector.get_id_from_task_id(task_id) + if cc_pair_id is not None: + RedisConnectorDelete.remove_from_taskset(int(cc_pair_id), task_id, r) + return + + if task_id.startswith(RedisConnectorPrune.SUBTASK_PREFIX): + cc_pair_id = RedisConnector.get_id_from_task_id(task_id) + if cc_pair_id is not None: + RedisConnectorPrune.remove_from_taskset(int(cc_pair_id), task_id, r) + return + + if task_id.startswith(RedisConnectorPermissionSync.SUBTASK_PREFIX): + cc_pair_id = RedisConnector.get_id_from_task_id(task_id) + if cc_pair_id is not None: + RedisConnectorPermissionSync.remove_from_taskset( + int(cc_pair_id), task_id, r + ) + return + + if task_id.startswith(RedisConnectorExternalGroupSync.SUBTASK_PREFIX): + cc_pair_id = RedisConnector.get_id_from_task_id(task_id) + if cc_pair_id is not None: + RedisConnectorExternalGroupSync.remove_from_taskset( + int(cc_pair_id), task_id, r + ) + return + + +def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None: + """The first signal sent on celery worker startup""" + multiprocessing.set_start_method("spawn") # fork is unsafe, set to spawn + + +def wait_for_redis(sender: Any, **kwargs: Any) -> None: + """Waits for redis to become ready subject to a hardcoded timeout. + Will raise WorkerShutdown to kill the celery worker if the timeout is reached.""" + + r = get_redis_client(tenant_id=None) + + WAIT_INTERVAL = 5 + WAIT_LIMIT = 60 + + ready = False + time_start = time.monotonic() + logger.info("Redis: Readiness probe starting.") + while True: + try: + if r.ping(): + ready = True + break + except Exception: + pass + + time_elapsed = time.monotonic() - time_start + if time_elapsed > WAIT_LIMIT: + break + + logger.info( + f"Redis: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}" + ) + + time.sleep(WAIT_INTERVAL) + + if not ready: + msg = ( + f"Redis: Readiness probe did not succeed within the timeout " + f"({WAIT_LIMIT} seconds). Exiting..." + ) + logger.error(msg) + raise WorkerShutdown(msg) + + logger.info("Redis: Readiness probe succeeded. 
Continuing...") + return + + +def wait_for_db(sender: Any, **kwargs: Any) -> None: + """Waits for the db to become ready subject to a hardcoded timeout. + Will raise WorkerShutdown to kill the celery worker if the timeout is reached.""" + + WAIT_INTERVAL = 5 + WAIT_LIMIT = 60 + + ready = False + time_start = time.monotonic() + logger.info("Database: Readiness probe starting.") + while True: + try: + with Session(get_sqlalchemy_engine()) as db_session: + result = db_session.execute(text("SELECT NOW()")).scalar() + if result: + ready = True + break + except Exception: + pass + + time_elapsed = time.monotonic() - time_start + if time_elapsed > WAIT_LIMIT: + break + + logger.info( + f"Database: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}" + ) + + time.sleep(WAIT_INTERVAL) + + if not ready: + msg = ( + f"Database: Readiness probe did not succeed within the timeout " + f"({WAIT_LIMIT} seconds). Exiting..." + ) + logger.error(msg) + raise WorkerShutdown(msg) + + logger.info("Database: Readiness probe succeeded. Continuing...") + return + + +def wait_for_vespa(sender: Any, **kwargs: Any) -> None: + """Waits for Vespa to become ready subject to a hardcoded timeout. + Will raise WorkerShutdown to kill the celery worker if the timeout is reached.""" + + WAIT_INTERVAL = 5 + WAIT_LIMIT = 60 + + ready = False + time_start = time.monotonic() + logger.info("Vespa: Readiness probe starting.") + while True: + try: + response = requests.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health") + response.raise_for_status() + + response_dict = response.json() + if response_dict["status"]["code"] == "up": + ready = True + break + except Exception: + pass + + time_elapsed = time.monotonic() - time_start + if time_elapsed > WAIT_LIMIT: + break + + logger.info( + f"Vespa: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}" + ) + + time.sleep(WAIT_INTERVAL) + + if not ready: + msg = ( + f"Vespa: Readiness probe did not succeed within the timeout " + f"({WAIT_LIMIT} seconds). Exiting..." + ) + logger.error(msg) + raise WorkerShutdown(msg) + + logger.info("Vespa: Readiness probe succeeded. Continuing...") + return + + +def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None: + logger.info("Running as a secondary celery worker.") + + # Set up variables for waiting on primary worker + WAIT_INTERVAL = 5 + WAIT_LIMIT = 60 + r = get_redis_client(tenant_id=None) + time_start = time.monotonic() + + logger.info("Waiting for primary worker to be ready...") + while True: + if r.exists(DanswerRedisLocks.PRIMARY_WORKER): + break + + time_elapsed = time.monotonic() - time_start + logger.info( + f"Primary worker is not ready yet. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}" + ) + if time_elapsed > WAIT_LIMIT: + msg = ( + f"Primary worker was not ready within the timeout. " + f"({WAIT_LIMIT} seconds). Exiting..." + ) + logger.error(msg) + raise WorkerShutdown(msg) + + time.sleep(WAIT_INTERVAL) + + logger.info("Wait for primary worker completed successfully. 
Continuing...") + return + + +def on_worker_ready(sender: Any, **kwargs: Any) -> None: + task_logger.info("worker_ready signal received.") + + +def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: + if not celery_is_worker_primary(sender): + return + + if not sender.primary_worker_lock: + return + + logger.info("Releasing primary worker lock.") + lock = sender.primary_worker_lock + try: + if lock.owned(): + try: + lock.release() + sender.primary_worker_lock = None + except Exception as e: + logger.error(f"Failed to release primary worker lock: {e}") + except Exception as e: + logger.error(f"Failed to check if primary worker lock is owned: {e}") + + +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + # TODO: could unhardcode format and colorize and accept these as options from + # celery's config + + # reformats the root logger + root_logger = logging.getLogger() + + root_handler = logging.StreamHandler() # Set up a handler for the root logger + root_formatter = ColoredFormatter( + "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + ) + root_handler.setFormatter(root_formatter) + root_logger.addHandler(root_handler) # Apply the handler to the root logger + + if logfile: + root_file_handler = logging.FileHandler(logfile) + root_file_formatter = PlainFormatter( + "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + ) + root_file_handler.setFormatter(root_file_formatter) + root_logger.addHandler(root_file_handler) + + root_logger.setLevel(loglevel) + + # reformats celery's task logger + task_formatter = CeleryTaskColoredFormatter( + "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + ) + task_handler = logging.StreamHandler() # Set up a handler for the task logger + task_handler.setFormatter(task_formatter) + task_logger.addHandler(task_handler) # Apply the handler to the task logger + + if logfile: + task_file_handler = logging.FileHandler(logfile) + task_file_formatter = CeleryTaskPlainFormatter( + "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + ) + task_file_handler.setFormatter(task_file_formatter) + task_logger.addHandler(task_file_handler) + + task_logger.setLevel(loglevel) + task_logger.propagate = False + + # hide celery task received spam + # e.g. "Task check_for_pruning[a1e96171-0ba8-4e00-887b-9fbf7442eab3] received" + strategy.logger.setLevel(logging.WARNING) + + # hide celery task succeeded/failed spam + # e.g. 
"Task check_for_pruning[a1e96171-0ba8-4e00-887b-9fbf7442eab3] succeeded in 0.03137450001668185s: None" + trace.logger.setLevel(logging.WARNING) diff --git a/backend/danswer/background/celery/apps/beat.py b/backend/danswer/background/celery/apps/beat.py new file mode 100644 index 00000000000..f7ae3ec2655 --- /dev/null +++ b/backend/danswer/background/celery/apps/beat.py @@ -0,0 +1,172 @@ +from datetime import timedelta +from typing import Any + +from celery import Celery +from celery import signals +from celery.beat import PersistentScheduler # type: ignore +from celery.signals import beat_init + +import danswer.background.celery.apps.app_base as app_base +from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME +from danswer.db.engine import get_all_tenant_ids +from danswer.db.engine import SqlEngine +from danswer.utils.logger import setup_logger +from danswer.utils.variable_functionality import fetch_versioned_implementation +from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST +from shared_configs.configs import MULTI_TENANT + +logger = setup_logger(__name__) + +celery_app = Celery(__name__) +celery_app.config_from_object("danswer.background.celery.configs.beat") + + +class DynamicTenantScheduler(PersistentScheduler): + def __init__(self, *args: Any, **kwargs: Any) -> None: + logger.info("Initializing DynamicTenantScheduler") + super().__init__(*args, **kwargs) + self._reload_interval = timedelta(minutes=2) + self._last_reload = self.app.now() - self._reload_interval + # Let the parent class handle store initialization + self.setup_schedule() + self._update_tenant_tasks() + logger.info(f"Set reload interval to {self._reload_interval}") + + def setup_schedule(self) -> None: + logger.info("Setting up initial schedule") + super().setup_schedule() + logger.info("Initial schedule setup complete") + + def tick(self) -> float: + retval = super().tick() + now = self.app.now() + if ( + self._last_reload is None + or (now - self._last_reload) > self._reload_interval + ): + logger.info("Reload interval reached, initiating tenant task update") + self._update_tenant_tasks() + self._last_reload = now + logger.info("Tenant task update completed, reset reload timer") + return retval + + def _update_tenant_tasks(self) -> None: + logger.info("Starting tenant task update process") + try: + logger.info("Fetching all tenant IDs") + tenant_ids = get_all_tenant_ids() + logger.info(f"Found {len(tenant_ids)} tenants") + + logger.info("Fetching tasks to schedule") + tasks_to_schedule = fetch_versioned_implementation( + "danswer.background.celery.tasks.beat_schedule", "get_tasks_to_schedule" + ) + + new_beat_schedule: dict[str, dict[str, Any]] = {} + + current_schedule = self.schedule.items() + + existing_tenants = set() + for task_name, _ in current_schedule: + if "-" in task_name: + existing_tenants.add(task_name.split("-")[-1]) + logger.info(f"Found {len(existing_tenants)} existing tenants in schedule") + + for tenant_id in tenant_ids: + if ( + IGNORED_SYNCING_TENANT_LIST + and tenant_id in IGNORED_SYNCING_TENANT_LIST + ): + logger.info( + f"Skipping tenant {tenant_id} as it is in the ignored syncing list" + ) + continue + + if tenant_id not in existing_tenants: + logger.info(f"Processing new tenant: {tenant_id}") + + for task in tasks_to_schedule(): + task_name = f"{task['name']}-{tenant_id}" + logger.debug(f"Creating task configuration for {task_name}") + new_task = { + "task": task["task"], + "schedule": task["schedule"], + "kwargs": {"tenant_id": tenant_id}, + } + if options := 
task.get("options"): + logger.debug(f"Adding options to task {task_name}: {options}") + new_task["options"] = options + new_beat_schedule[task_name] = new_task + + if self._should_update_schedule(current_schedule, new_beat_schedule): + logger.info( + "Schedule update required", + extra={ + "new_tasks": len(new_beat_schedule), + "current_tasks": len(current_schedule), + }, + ) + + # Create schedule entries + entries = {} + for name, entry in new_beat_schedule.items(): + entries[name] = self.Entry( + name=name, + app=self.app, + task=entry["task"], + schedule=entry["schedule"], + options=entry.get("options", {}), + kwargs=entry.get("kwargs", {}), + ) + + # Update the schedule using the scheduler's methods + self.schedule.clear() + self.schedule.update(entries) + + # Ensure changes are persisted + self.sync() + + logger.info("Schedule update completed successfully") + else: + logger.info("Schedule is up to date, no changes needed") + + except (AttributeError, KeyError): + logger.exception("Failed to process task configuration") + except Exception: + logger.exception("Unexpected error updating tenant tasks") + + def _should_update_schedule( + self, current_schedule: dict, new_schedule: dict + ) -> bool: + """Compare schedules to determine if an update is needed.""" + logger.debug("Comparing current and new schedules") + current_tasks = set(name for name, _ in current_schedule) + new_tasks = set(new_schedule.keys()) + needs_update = current_tasks != new_tasks + logger.debug(f"Schedule update needed: {needs_update}") + return needs_update + + +@beat_init.connect +def on_beat_init(sender: Any, **kwargs: Any) -> None: + logger.info("beat_init signal received.") + + # Celery beat shouldn't touch the db at all. But just setting a low minimum here. + SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME) + SqlEngine.init_engine(pool_size=2, max_overflow=0) + + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + + app_base.wait_for_redis(sender, **kwargs) + + +@signals.setup_logging.connect +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs) + + +celery_app.conf.beat_scheduler = DynamicTenantScheduler diff --git a/backend/danswer/background/celery/apps/heavy.py b/backend/danswer/background/celery/apps/heavy.py new file mode 100644 index 00000000000..714c91ee421 --- /dev/null +++ b/backend/danswer/background/celery/apps/heavy.py @@ -0,0 +1,97 @@ +import multiprocessing +from typing import Any + +from celery import Celery +from celery import signals +from celery import Task +from celery.signals import celeryd_init +from celery.signals import worker_init +from celery.signals import worker_ready +from celery.signals import worker_shutdown + +import danswer.background.celery.apps.app_base as app_base +from danswer.configs.constants import POSTGRES_CELERY_WORKER_HEAVY_APP_NAME +from danswer.db.engine import SqlEngine +from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT + + +logger = setup_logger() + +celery_app = Celery(__name__) +celery_app.config_from_object("danswer.background.celery.configs.heavy") + + +@signals.task_prerun.connect +def on_task_prerun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + **kwds: Any, +) -> None: + app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds) + + 
+@signals.task_postrun.connect +def on_task_postrun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + retval: Any | None = None, + state: str | None = None, + **kwds: Any, +) -> None: + app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds) + + +@celeryd_init.connect +def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None: + app_base.on_celeryd_init(sender, conf, **kwargs) + + +@worker_init.connect +def on_worker_init(sender: Any, **kwargs: Any) -> None: + logger.info("worker_init signal received.") + logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}") + + SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME) + SqlEngine.init_engine(pool_size=4, max_overflow=12) + + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + + app_base.wait_for_redis(sender, **kwargs) + app_base.wait_for_db(sender, **kwargs) + app_base.wait_for_vespa(sender, **kwargs) + app_base.on_secondary_worker_init(sender, **kwargs) + + +@worker_ready.connect +def on_worker_ready(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_ready(sender, **kwargs) + + +@worker_shutdown.connect +def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_shutdown(sender, **kwargs) + + +@signals.setup_logging.connect +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs) + + +celery_app.autodiscover_tasks( + [ + "danswer.background.celery.tasks.pruning", + "danswer.background.celery.tasks.doc_permission_syncing", + "danswer.background.celery.tasks.external_group_syncing", + ] +) diff --git a/backend/danswer/background/celery/apps/indexing.py b/backend/danswer/background/celery/apps/indexing.py new file mode 100644 index 00000000000..9cd8d42af5e --- /dev/null +++ b/backend/danswer/background/celery/apps/indexing.py @@ -0,0 +1,101 @@ +import multiprocessing +from typing import Any + +from celery import Celery +from celery import signals +from celery import Task +from celery.signals import celeryd_init +from celery.signals import worker_init +from celery.signals import worker_process_init +from celery.signals import worker_ready +from celery.signals import worker_shutdown + +import danswer.background.celery.apps.app_base as app_base +from danswer.configs.constants import POSTGRES_CELERY_WORKER_INDEXING_APP_NAME +from danswer.db.engine import SqlEngine +from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT + + +logger = setup_logger() + +celery_app = Celery(__name__) +celery_app.config_from_object("danswer.background.celery.configs.indexing") + + +@signals.task_prerun.connect +def on_task_prerun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + **kwds: Any, +) -> None: + app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds) + + +@signals.task_postrun.connect +def on_task_postrun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + retval: Any | None = None, + state: str | None = None, + **kwds: Any, +) -> None: + app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds) + + +@celeryd_init.connect 
+def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None: + app_base.on_celeryd_init(sender, conf, **kwargs) + + +@worker_init.connect +def on_worker_init(sender: Any, **kwargs: Any) -> None: + logger.info("worker_init signal received.") + logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}") + + SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME) + SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency) + + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + + app_base.wait_for_redis(sender, **kwargs) + app_base.wait_for_db(sender, **kwargs) + app_base.wait_for_vespa(sender, **kwargs) + app_base.on_secondary_worker_init(sender, **kwargs) + + +@worker_ready.connect +def on_worker_ready(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_ready(sender, **kwargs) + + +@worker_shutdown.connect +def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_shutdown(sender, **kwargs) + + +@worker_process_init.connect +def init_worker(**kwargs: Any) -> None: + SqlEngine.reset_engine() + + +@signals.setup_logging.connect +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs) + + +celery_app.autodiscover_tasks( + [ + "danswer.background.celery.tasks.indexing", + ] +) diff --git a/backend/danswer/background/celery/apps/light.py b/backend/danswer/background/celery/apps/light.py new file mode 100644 index 00000000000..17292743f9d --- /dev/null +++ b/backend/danswer/background/celery/apps/light.py @@ -0,0 +1,97 @@ +import multiprocessing +from typing import Any + +from celery import Celery +from celery import signals +from celery import Task +from celery.signals import celeryd_init +from celery.signals import worker_init +from celery.signals import worker_ready +from celery.signals import worker_shutdown + +import danswer.background.celery.apps.app_base as app_base +from danswer.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME +from danswer.db.engine import SqlEngine +from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT + + +logger = setup_logger() + +celery_app = Celery(__name__) +celery_app.config_from_object("danswer.background.celery.configs.light") + + +@signals.task_prerun.connect +def on_task_prerun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + **kwds: Any, +) -> None: + app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds) + + +@signals.task_postrun.connect +def on_task_postrun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + retval: Any | None = None, + state: str | None = None, + **kwds: Any, +) -> None: + app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds) + + +@celeryd_init.connect +def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None: + app_base.on_celeryd_init(sender, conf, **kwargs) + + +@worker_init.connect +def on_worker_init(sender: Any, **kwargs: Any) -> None: + logger.info("worker_init signal received.") + logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}") + + SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME) + 
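+    # Unlike the heavy worker (fixed pool_size=4, max_overflow=12) and the primary
+    # worker (pool_size=8, max_overflow=0), the light and indexing workers scale the
+    # SQLAlchemy pool with the worker's configured concurrency.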
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8) + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + + app_base.wait_for_redis(sender, **kwargs) + app_base.wait_for_db(sender, **kwargs) + app_base.wait_for_vespa(sender, **kwargs) + app_base.on_secondary_worker_init(sender, **kwargs) + + +@worker_ready.connect +def on_worker_ready(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_ready(sender, **kwargs) + + +@worker_shutdown.connect +def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_shutdown(sender, **kwargs) + + +@signals.setup_logging.connect +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs) + + +celery_app.autodiscover_tasks( + [ + "danswer.background.celery.tasks.shared", + "danswer.background.celery.tasks.vespa", + "danswer.background.celery.tasks.connector_deletion", + "danswer.background.celery.tasks.doc_permission_syncing", + ] +) diff --git a/backend/danswer/background/celery/apps/primary.py b/backend/danswer/background/celery/apps/primary.py new file mode 100644 index 00000000000..5efe8300670 --- /dev/null +++ b/backend/danswer/background/celery/apps/primary.py @@ -0,0 +1,281 @@ +import multiprocessing +from typing import Any +from typing import cast + +from celery import bootsteps # type: ignore +from celery import Celery +from celery import signals +from celery import Task +from celery.exceptions import WorkerShutdown +from celery.signals import celeryd_init +from celery.signals import worker_init +from celery.signals import worker_ready +from celery.signals import worker_shutdown + +import danswer.background.celery.apps.app_base as app_base +from danswer.background.celery.apps.app_base import task_logger +from danswer.background.celery.celery_utils import celery_is_worker_primary +from danswer.background.celery.tasks.indexing.tasks import ( + get_unfenced_index_attempt_ids, +) +from danswer.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT +from danswer.configs.constants import DanswerRedisLocks +from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME +from danswer.db.engine import get_session_with_default_tenant +from danswer.db.engine import SqlEngine +from danswer.db.index_attempt import get_index_attempt +from danswer.db.index_attempt import mark_attempt_canceled +from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair +from danswer.redis.redis_connector_delete import RedisConnectorDelete +from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync +from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync +from danswer.redis.redis_connector_index import RedisConnectorIndex +from danswer.redis.redis_connector_prune import RedisConnectorPrune +from danswer.redis.redis_connector_stop import RedisConnectorStop +from danswer.redis.redis_document_set import RedisDocumentSet +from danswer.redis.redis_pool import get_redis_client +from danswer.redis.redis_usergroup import RedisUserGroup +from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT + + +logger = setup_logger() + +celery_app = Celery(__name__) +celery_app.config_from_object("danswer.background.celery.configs.primary") + + +@signals.task_prerun.connect +def on_task_prerun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | 
None = None, + args: tuple | None = None, + kwargs: dict | None = None, + **kwds: Any, +) -> None: + app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds) + + +@signals.task_postrun.connect +def on_task_postrun( + sender: Any | None = None, + task_id: str | None = None, + task: Task | None = None, + args: tuple | None = None, + kwargs: dict | None = None, + retval: Any | None = None, + state: str | None = None, + **kwds: Any, +) -> None: + app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds) + + +@celeryd_init.connect +def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None: + app_base.on_celeryd_init(sender, conf, **kwargs) + + +@worker_init.connect +def on_worker_init(sender: Any, **kwargs: Any) -> None: + logger.info("worker_init signal received.") + logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}") + + SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME) + SqlEngine.init_engine(pool_size=8, max_overflow=0) + + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + + app_base.wait_for_redis(sender, **kwargs) + app_base.wait_for_db(sender, **kwargs) + app_base.wait_for_vespa(sender, **kwargs) + + logger.info("Running as the primary celery worker.") + + # This is singleton work that should be done on startup exactly once + # by the primary worker. This is unnecessary in the multi tenant scenario + r = get_redis_client(tenant_id=None) + + # Log the role and slave count - being connected to a slave or slave count > 0 could be problematic + info: dict[str, Any] = cast(dict, r.info("replication")) + role: str = cast(str, info.get("role")) + connected_slaves: int = info.get("connected_slaves", 0) + + logger.info( + f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}" + ) + + # For the moment, we're assuming that we are the only primary worker + # that should be running. + # TODO: maybe check for or clean up another zombie primary worker if we detect it + r.delete(DanswerRedisLocks.PRIMARY_WORKER) + + # this process wide lock is taken to help other workers start up in order. + # it is planned to use this lock to enforce singleton behavior on the primary + # worker, since the primary worker does redis cleanup on startup, but this isn't + # implemented yet. 
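+    # The lock below is acquired with CELERY_PRIMARY_WORKER_LOCK_TIMEOUT and is kept
+    # alive by HubPeriodicTask (defined later in this module), which reacquires it on
+    # the worker's event loop roughly every timeout/8 seconds.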
+ lock = r.lock( + DanswerRedisLocks.PRIMARY_WORKER, + timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT, + ) + + logger.info("Primary worker lock: Acquire starting.") + acquired = lock.acquire(blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2) + if acquired: + logger.info("Primary worker lock: Acquire succeeded.") + else: + logger.error("Primary worker lock: Acquire failed!") + raise WorkerShutdown("Primary worker lock could not be acquired!") + + # tacking on our own user data to the sender + sender.primary_worker_lock = lock + + # As currently designed, when this worker starts as "primary", we reinitialize redis + # to a clean state (for our purposes, anyway) + r.delete(DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK) + r.delete(DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK) + + r.delete(RedisConnectorCredentialPair.get_taskset_key()) + r.delete(RedisConnectorCredentialPair.get_fence_key()) + + RedisDocumentSet.reset_all(r) + + RedisUserGroup.reset_all(r) + + RedisConnectorDelete.reset_all(r) + + RedisConnectorPrune.reset_all(r) + + RedisConnectorIndex.reset_all(r) + + RedisConnectorStop.reset_all(r) + + RedisConnectorPermissionSync.reset_all(r) + + RedisConnectorExternalGroupSync.reset_all(r) + + # mark orphaned index attempts as failed + with get_session_with_default_tenant() as db_session: + unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r) + for attempt_id in unfenced_attempt_ids: + attempt = get_index_attempt(db_session, attempt_id) + if not attempt: + continue + + failure_reason = ( + f"Canceling leftover index attempt found on startup: " + f"index_attempt={attempt.id} " + f"cc_pair={attempt.connector_credential_pair_id} " + f"search_settings={attempt.search_settings_id}" + ) + logger.warning(failure_reason) + mark_attempt_canceled(attempt.id, db_session, failure_reason) + + +@worker_ready.connect +def on_worker_ready(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_ready(sender, **kwargs) + + +@worker_shutdown.connect +def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: + app_base.on_worker_shutdown(sender, **kwargs) + + +@signals.setup_logging.connect +def on_setup_logging( + loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any +) -> None: + app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs) + + +class HubPeriodicTask(bootsteps.StartStopStep): + """Regularly reacquires the primary worker lock outside of the task queue. + Use the task_logger in this class to avoid double logging. + + This cannot be done inside a regular beat task because it must run on schedule and + a queue of existing work would starve the task from running. 
+ """ + + # it's unclear to me whether using the hub's timer or the bootstep timer is better + requires = {"celery.worker.components:Hub"} + + def __init__(self, worker: Any, **kwargs: Any) -> None: + self.interval = CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 8 # Interval in seconds + self.task_tref = None + + def start(self, worker: Any) -> None: + if not celery_is_worker_primary(worker): + return + + # Access the worker's event loop (hub) + hub = worker.consumer.controller.hub + + # Schedule the periodic task + self.task_tref = hub.call_repeatedly( + self.interval, self.run_periodic_task, worker + ) + task_logger.info("Scheduled periodic task with hub.") + + def run_periodic_task(self, worker: Any) -> None: + try: + if not celery_is_worker_primary(worker): + return + + if not hasattr(worker, "primary_worker_lock"): + return + + lock = worker.primary_worker_lock + + r = get_redis_client(tenant_id=None) + + if lock.owned(): + task_logger.debug("Reacquiring primary worker lock.") + lock.reacquire() + else: + task_logger.warning( + "Full acquisition of primary worker lock. " + "Reasons could be worker restart or lock expiration." + ) + lock = r.lock( + DanswerRedisLocks.PRIMARY_WORKER, + timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT, + ) + + task_logger.info("Primary worker lock: Acquire starting.") + acquired = lock.acquire( + blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2 + ) + if acquired: + task_logger.info("Primary worker lock: Acquire succeeded.") + worker.primary_worker_lock = lock + else: + task_logger.error("Primary worker lock: Acquire failed!") + raise TimeoutError("Primary worker lock could not be acquired!") + + except Exception: + task_logger.exception("Periodic task failed.") + + def stop(self, worker: Any) -> None: + # Cancel the scheduled task when the worker stops + if self.task_tref: + self.task_tref.cancel() + task_logger.info("Canceled periodic task with hub.") + + +celery_app.steps["worker"].add(HubPeriodicTask) + +celery_app.autodiscover_tasks( + [ + "danswer.background.celery.tasks.connector_deletion", + "danswer.background.celery.tasks.indexing", + "danswer.background.celery.tasks.periodic", + "danswer.background.celery.tasks.doc_permission_syncing", + "danswer.background.celery.tasks.external_group_syncing", + "danswer.background.celery.tasks.pruning", + "danswer.background.celery.tasks.shared", + "danswer.background.celery.tasks.vespa", + ] +) diff --git a/backend/danswer/background/celery/apps/task_formatters.py b/backend/danswer/background/celery/apps/task_formatters.py new file mode 100644 index 00000000000..e82b23a5431 --- /dev/null +++ b/backend/danswer/background/celery/apps/task_formatters.py @@ -0,0 +1,26 @@ +import logging + +from celery import current_task + +from danswer.utils.logger import ColoredFormatter +from danswer.utils.logger import PlainFormatter + + +class CeleryTaskPlainFormatter(PlainFormatter): + def format(self, record: logging.LogRecord) -> str: + task = current_task + if task and task.request: + record.__dict__.update(task_id=task.request.id, task_name=task.name) + record.msg = f"[{task.name}({task.request.id})] {record.msg}" + + return super().format(record) + + +class CeleryTaskColoredFormatter(ColoredFormatter): + def format(self, record: logging.LogRecord) -> str: + task = current_task + if task and task.request: + record.__dict__.update(task_id=task.request.id, task_name=task.name) + record.msg = f"[{task.name}({task.request.id})] {record.msg}" + + return super().format(record) diff --git 
a/backend/danswer/background/celery/celery_app.py b/backend/danswer/background/celery/celery_app.py deleted file mode 100644 index 8c43fb2eec3..00000000000 --- a/backend/danswer/background/celery/celery_app.py +++ /dev/null @@ -1,1290 +0,0 @@ -import json -import logging -import traceback -from datetime import timedelta -from typing import Any -from typing import cast - -import redis -from celery import Celery -from celery import current_task -from celery import signals -from celery import Task -from celery.contrib.abortable import AbortableTask # type: ignore -from celery.exceptions import SoftTimeLimitExceeded -from celery.exceptions import TaskRevokedError -from celery.signals import beat_init -from celery.signals import worker_init -from celery.states import READY_STATES -from celery.utils.log import get_task_logger -from redis import Redis -from sqlalchemy import inspect -from sqlalchemy import text -from sqlalchemy.orm import Session -from sqlalchemy.orm.exc import ObjectDeletedError - -from danswer.access.access import get_access_for_document -from danswer.background.celery.celery_redis import RedisConnectorCredentialPair -from danswer.background.celery.celery_redis import RedisConnectorDeletion -from danswer.background.celery.celery_redis import RedisDocumentSet -from danswer.background.celery.celery_redis import RedisUserGroup -from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector -from danswer.background.celery.celery_utils import should_prune_cc_pair -from danswer.background.connector_deletion import delete_connector_credential_pair_batch -from danswer.background.task_utils import build_celery_task_wrapper -from danswer.background.task_utils import name_cc_prune_task -from danswer.configs.app_configs import JOB_TIMEOUT -from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT -from danswer.configs.constants import DanswerCeleryPriority -from danswer.configs.constants import DanswerRedisLocks -from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME -from danswer.configs.constants import POSTGRES_CELERY_WORKER_APP_NAME -from danswer.configs.constants import PostgresAdvisoryLocks -from danswer.connectors.factory import instantiate_connector -from danswer.connectors.models import InputType -from danswer.db.connector import fetch_connector_by_id -from danswer.db.connector_credential_pair import add_deletion_failure_message -from danswer.db.connector_credential_pair import ( - delete_connector_credential_pair__no_commit, -) -from danswer.db.connector_credential_pair import get_connector_credential_pair -from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id -from danswer.db.connector_credential_pair import get_connector_credential_pairs -from danswer.db.document import count_documents_by_needs_sync -from danswer.db.document import delete_document_by_connector_credential_pair__no_commit -from danswer.db.document import delete_documents_complete__no_commit -from danswer.db.document import get_document -from danswer.db.document import get_document_connector_count -from danswer.db.document import get_documents_for_connector_credential_pair -from danswer.db.document import mark_document_as_synced -from danswer.db.document_set import delete_document_set -from danswer.db.document_set import delete_document_set_cc_pair_relationship__no_commit -from danswer.db.document_set import fetch_document_sets -from danswer.db.document_set import fetch_document_sets_for_document -from danswer.db.document_set 
import get_document_set_by_id -from danswer.db.document_set import mark_document_set_as_synced -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.engine import init_sqlalchemy_engine -from danswer.db.enums import ConnectorCredentialPairStatus -from danswer.db.enums import IndexingStatus -from danswer.db.index_attempt import delete_index_attempts -from danswer.db.index_attempt import get_last_attempt -from danswer.db.models import ConnectorCredentialPair -from danswer.db.models import DocumentSet -from danswer.db.models import UserGroup -from danswer.db.search_settings import get_current_search_settings -from danswer.document_index.document_index_utils import get_both_index_names -from danswer.document_index.factory import get_default_document_index -from danswer.document_index.interfaces import UpdateRequest -from danswer.redis.redis_pool import RedisPool -from danswer.server.documents.models import ConnectorCredentialPairIdentifier -from danswer.utils.logger import ColoredFormatter -from danswer.utils.logger import PlainFormatter -from danswer.utils.logger import setup_logger -from danswer.utils.variable_functionality import fetch_versioned_implementation -from danswer.utils.variable_functionality import ( - fetch_versioned_implementation_with_fallback, -) -from danswer.utils.variable_functionality import noop_fallback - -logger = setup_logger() - -# use this within celery tasks to get celery task specific logging -task_logger = get_task_logger(__name__) - -redis_pool = RedisPool() - -celery_app = Celery(__name__) -celery_app.config_from_object( - "danswer.background.celery.celeryconfig" -) # Load configuration from 'celeryconfig.py' - - -##### -# Tasks that need to be run in job queue, registered via APIs -# -# If imports from this module are needed, use local imports to avoid circular importing -##### - - -@build_celery_task_wrapper(name_cc_prune_task) -@celery_app.task(soft_time_limit=JOB_TIMEOUT) -def prune_documents_task(connector_id: int, credential_id: int) -> None: - """connector pruning task. 
For a cc pair, this task pulls all document IDs from the source - and compares those IDs to locally stored documents and deletes all locally stored IDs missing - from the most recently pulled document ID list""" - with Session(get_sqlalchemy_engine()) as db_session: - try: - cc_pair = get_connector_credential_pair( - db_session=db_session, - connector_id=connector_id, - credential_id=credential_id, - ) - - if not cc_pair: - task_logger.warning( - f"ccpair not found for {connector_id} {credential_id}" - ) - return - - runnable_connector = instantiate_connector( - cc_pair.connector.source, - InputType.PRUNE, - cc_pair.connector.connector_specific_config, - cc_pair.credential, - db_session, - ) - - all_connector_doc_ids: set[str] = extract_ids_from_runnable_connector( - runnable_connector - ) - - all_indexed_document_ids = { - doc.id - for doc in get_documents_for_connector_credential_pair( - db_session=db_session, - connector_id=connector_id, - credential_id=credential_id, - ) - } - - doc_ids_to_remove = list(all_indexed_document_ids - all_connector_doc_ids) - - curr_ind_name, sec_ind_name = get_both_index_names(db_session) - document_index = get_default_document_index( - primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name - ) - - if len(doc_ids_to_remove) == 0: - task_logger.info( - f"No docs to prune from {cc_pair.connector.source} connector" - ) - return - - task_logger.info( - f"pruning {len(doc_ids_to_remove)} doc(s) from {cc_pair.connector.source} connector" - ) - delete_connector_credential_pair_batch( - document_ids=doc_ids_to_remove, - connector_id=connector_id, - credential_id=credential_id, - document_index=document_index, - ) - except Exception as e: - task_logger.exception( - f"Failed to run pruning for connector id {connector_id}." - ) - raise e - - -def try_generate_stale_document_sync_tasks( - db_session: Session, r: Redis, lock_beat: redis.lock.Lock -) -> int | None: - """This picks up stale documents (typically from indexing) and queues them for sync to Vespa. - - Returns an int if syncing is needed. The int represents the number of sync tasks generated. - Returns None if no syncing is required. - """ - # the fence is up, do nothing - if r.exists(RedisConnectorCredentialPair.get_fence_key()): - return None - - r.delete(RedisConnectorCredentialPair.get_taskset_key()) # delete the taskset - - # add tasks to celery and build up the task set to monitor in redis - stale_doc_count = count_documents_by_needs_sync(db_session) - if stale_doc_count == 0: - return None - - task_logger.info( - f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair." - ) - - task_logger.info("RedisConnector.generate_tasks starting by cc_pair.") - - # rkuo: we could technically sync all stale docs in one big pass. - # but I feel it's more understandable to group the docs by cc_pair - total_tasks_generated = 0 - cc_pairs = get_connector_credential_pairs(db_session) - for cc_pair in cc_pairs: - rc = RedisConnectorCredentialPair(cc_pair.id) - tasks_generated = rc.generate_tasks(celery_app, db_session, r, lock_beat) - - if tasks_generated is None: - continue - - if tasks_generated == 0: - continue - - task_logger.info( - f"RedisConnector.generate_tasks finished. " - f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}" - ) - - total_tasks_generated += tasks_generated - - task_logger.info( - f"RedisConnector.generate_tasks finished for all cc_pairs. 
total_tasks_generated={total_tasks_generated}" - ) - - r.set(RedisConnectorCredentialPair.get_fence_key(), total_tasks_generated) - return total_tasks_generated - - -def try_generate_document_set_sync_tasks( - document_set: DocumentSet, db_session: Session, r: Redis, lock_beat: redis.lock.Lock -) -> int | None: - """Returns an int if syncing is needed. The int represents the number of sync tasks generated. - Note that syncing can still be required even if the number of sync tasks generated is zero. - Returns None if no syncing is required. - """ - lock_beat.reacquire() - - rds = RedisDocumentSet(document_set.id) - - # don't generate document set sync tasks if tasks are still pending - if r.exists(rds.fence_key): - return None - - # don't generate sync tasks if we're up to date - # race condition with the monitor/cleanup function if we use a cached result! - db_session.refresh(document_set) - if document_set.is_up_to_date: - return None - - # add tasks to celery and build up the task set to monitor in redis - r.delete(rds.taskset_key) - - task_logger.info( - f"RedisDocumentSet.generate_tasks starting. document_set_id={document_set.id}" - ) - - # Add all documents that need to be updated into the queue - tasks_generated = rds.generate_tasks(celery_app, db_session, r, lock_beat) - if tasks_generated is None: - return None - - # Currently we are allowing the sync to proceed with 0 tasks. - # It's possible for sets/groups to be generated initially with no entries - # and they still need to be marked as up to date. - # if tasks_generated == 0: - # return 0 - - task_logger.info( - f"RedisDocumentSet.generate_tasks finished. " - f"document_set_id={document_set.id} tasks_generated={tasks_generated}" - ) - - # set this only after all tasks have been added - r.set(rds.fence_key, tasks_generated) - return tasks_generated - - -def try_generate_user_group_sync_tasks( - usergroup: UserGroup, db_session: Session, r: Redis, lock_beat: redis.lock.Lock -) -> int | None: - """Returns an int if syncing is needed. The int represents the number of sync tasks generated. - Note that syncing can still be required even if the number of sync tasks generated is zero. - Returns None if no syncing is required. - """ - lock_beat.reacquire() - - rug = RedisUserGroup(usergroup.id) - - # don't generate sync tasks if tasks are still pending - if r.exists(rug.fence_key): - return None - - # race condition with the monitor/cleanup function if we use a cached result! - db_session.refresh(usergroup) - if usergroup.is_up_to_date: - return None - - # add tasks to celery and build up the task set to monitor in redis - r.delete(rug.taskset_key) - - # Add all documents that need to be updated into the queue - task_logger.info( - f"RedisUserGroup.generate_tasks starting. usergroup_id={usergroup.id}" - ) - tasks_generated = rug.generate_tasks(celery_app, db_session, r, lock_beat) - if tasks_generated is None: - return None - - # Currently we are allowing the sync to proceed with 0 tasks. - # It's possible for sets/groups to be generated initially with no entries - # and they still need to be marked as up to date. - # if tasks_generated == 0: - # return 0 - - task_logger.info( - f"RedisUserGroup.generate_tasks finished. 
" - f"usergroup_id={usergroup.id} tasks_generated={tasks_generated}" - ) - - # set this only after all tasks have been added - r.set(rug.fence_key, tasks_generated) - return tasks_generated - - -def try_generate_document_cc_pair_cleanup_tasks( - cc_pair: ConnectorCredentialPair, - db_session: Session, - r: Redis, - lock_beat: redis.lock.Lock, -) -> int | None: - """Returns an int if syncing is needed. The int represents the number of sync tasks generated. - Note that syncing can still be required even if the number of sync tasks generated is zero. - Returns None if no syncing is required. - """ - - lock_beat.reacquire() - - rcd = RedisConnectorDeletion(cc_pair.id) - - # don't generate sync tasks if tasks are still pending - if r.exists(rcd.fence_key): - return None - - # we need to refresh the state of the object inside the fence - # to avoid a race condition with db.commit/fence deletion - # at the end of this taskset - try: - db_session.refresh(cc_pair) - except ObjectDeletedError: - return None - - if cc_pair.status != ConnectorCredentialPairStatus.DELETING: - return None - - search_settings = get_current_search_settings(db_session) - - last_indexing = get_last_attempt( - connector_id=cc_pair.connector_id, - credential_id=cc_pair.credential_id, - search_settings_id=search_settings.id, - db_session=db_session, - ) - if last_indexing: - if ( - last_indexing.status == IndexingStatus.IN_PROGRESS - or last_indexing.status == IndexingStatus.NOT_STARTED - ): - return None - - # add tasks to celery and build up the task set to monitor in redis - r.delete(rcd.taskset_key) - - # Add all documents that need to be updated into the queue - task_logger.info( - f"RedisConnectorDeletion.generate_tasks starting. cc_pair_id={cc_pair.id}" - ) - tasks_generated = rcd.generate_tasks(celery_app, db_session, r, lock_beat) - if tasks_generated is None: - return None - - # Currently we are allowing the sync to proceed with 0 tasks. - # It's possible for sets/groups to be generated initially with no entries - # and they still need to be marked as up to date. - # if tasks_generated == 0: - # return 0 - - task_logger.info( - f"RedisConnectorDeletion.generate_tasks finished. " - f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}" - ) - - # set this only after all tasks have been added - r.set(rcd.fence_key, tasks_generated) - return tasks_generated - - -##### -# Periodic Tasks -##### -@celery_app.task( - name="check_for_vespa_sync_task", - soft_time_limit=JOB_TIMEOUT, -) -def check_for_vespa_sync_task() -> None: - """Runs periodically to check if any document needs syncing. 
- Generates sets of tasks for Celery if syncing is needed.""" - - r = redis_pool.get_client() - - lock_beat = r.lock( - DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK, - timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, - ) - - try: - # these tasks should never overlap - if not lock_beat.acquire(blocking=False): - return - - with Session(get_sqlalchemy_engine()) as db_session: - try_generate_stale_document_sync_tasks(db_session, r, lock_beat) - - # check if any document sets are not synced - document_set_info = fetch_document_sets( - user_id=None, db_session=db_session, include_outdated=True - ) - for document_set, _ in document_set_info: - try_generate_document_set_sync_tasks( - document_set, db_session, r, lock_beat - ) - - # check if any user groups are not synced - try: - fetch_user_groups = fetch_versioned_implementation( - "danswer.db.user_group", "fetch_user_groups" - ) - - user_groups = fetch_user_groups( - db_session=db_session, only_up_to_date=False - ) - for usergroup in user_groups: - try_generate_user_group_sync_tasks( - usergroup, db_session, r, lock_beat - ) - except ModuleNotFoundError: - # Always exceptions on the MIT version, which is expected - pass - except SoftTimeLimitExceeded: - task_logger.info( - "Soft time limit exceeded, task is being terminated gracefully." - ) - except Exception: - task_logger.exception("Unexpected exception") - finally: - if lock_beat.owned(): - lock_beat.release() - - -@celery_app.task( - name="check_for_connector_deletion_task", - soft_time_limit=JOB_TIMEOUT, -) -def check_for_connector_deletion_task() -> None: - r = redis_pool.get_client() - - lock_beat = r.lock( - DanswerRedisLocks.CHECK_CONNECTOR_DELETION_BEAT_LOCK, - timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, - ) - - try: - # these tasks should never overlap - if not lock_beat.acquire(blocking=False): - return - - with Session(get_sqlalchemy_engine()) as db_session: - cc_pairs = get_connector_credential_pairs(db_session) - for cc_pair in cc_pairs: - try_generate_document_cc_pair_cleanup_tasks( - cc_pair, db_session, r, lock_beat - ) - except SoftTimeLimitExceeded: - task_logger.info( - "Soft time limit exceeded, task is being terminated gracefully." - ) - except Exception: - task_logger.exception("Unexpected exception") - finally: - if lock_beat.owned(): - lock_beat.release() - - -@celery_app.task( - name="kombu_message_cleanup_task", - soft_time_limit=JOB_TIMEOUT, - bind=True, - base=AbortableTask, -) -def kombu_message_cleanup_task(self: Any) -> int: - """Runs periodically to clean up the kombu_message table""" - - # we will select messages older than this amount to clean up - KOMBU_MESSAGE_CLEANUP_AGE = 7 # days - KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT = 1000 - - ctx = {} - ctx["last_processed_id"] = 0 - ctx["deleted"] = 0 - ctx["cleanup_age"] = KOMBU_MESSAGE_CLEANUP_AGE - ctx["page_limit"] = KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT - with Session(get_sqlalchemy_engine()) as db_session: - # Exit the task if we can't take the advisory lock - result = db_session.execute( - text("SELECT pg_try_advisory_lock(:id)"), - {"id": PostgresAdvisoryLocks.KOMBU_MESSAGE_CLEANUP_LOCK_ID.value}, - ).scalar() - if not result: - return 0 - - while True: - if self.is_aborted(): - raise TaskRevokedError("kombu_message_cleanup_task was aborted.") - - b = kombu_message_cleanup_task_helper(ctx, db_session) - if not b: - break - - db_session.commit() - - if ctx["deleted"] > 0: - task_logger.info( - f"Deleted {ctx['deleted']} orphaned messages from kombu_message." 
- ) - - return ctx["deleted"] - - -def kombu_message_cleanup_task_helper(ctx: dict, db_session: Session) -> bool: - """ - Helper function to clean up old messages from the `kombu_message` table that are no longer relevant. - - This function retrieves messages from the `kombu_message` table that are no longer visible and - older than a specified interval. It checks if the corresponding task_id exists in the - `celery_taskmeta` table. If the task_id does not exist, the message is deleted. - - Args: - ctx (dict): A context dictionary containing configuration parameters such as: - - 'cleanup_age' (int): The age in days after which messages are considered old. - - 'page_limit' (int): The maximum number of messages to process in one batch. - - 'last_processed_id' (int): The ID of the last processed message to handle pagination. - - 'deleted' (int): A counter to track the number of deleted messages. - db_session (Session): The SQLAlchemy database session for executing queries. - - Returns: - bool: Returns True if there are more rows to process, False if not. - """ - - inspector = inspect(db_session.bind) - if not inspector: - return False - - # With the move to redis as celery's broker and backend, kombu tables may not even exist. - # We can fail silently. - if not inspector.has_table("kombu_message"): - return False - - query = text( - """ - SELECT id, timestamp, payload - FROM kombu_message WHERE visible = 'false' - AND timestamp < CURRENT_TIMESTAMP - INTERVAL :interval_days - AND id > :last_processed_id - ORDER BY id - LIMIT :page_limit -""" - ) - kombu_messages = db_session.execute( - query, - { - "interval_days": f"{ctx['cleanup_age']} days", - "page_limit": ctx["page_limit"], - "last_processed_id": ctx["last_processed_id"], - }, - ).fetchall() - - if len(kombu_messages) == 0: - return False - - for msg in kombu_messages: - payload = json.loads(msg[2]) - task_id = payload["headers"]["id"] - - # Check if task_id exists in celery_taskmeta - task_exists = db_session.execute( - text("SELECT 1 FROM celery_taskmeta WHERE task_id = :task_id"), - {"task_id": task_id}, - ).fetchone() - - # If task_id does not exist, delete the message - if not task_exists: - result = db_session.execute( - text("DELETE FROM kombu_message WHERE id = :message_id"), - {"message_id": msg[0]}, - ) - if result.rowcount > 0: # type: ignore - ctx["deleted"] += 1 - - ctx["last_processed_id"] = msg[0] - - return True - - -@celery_app.task( - name="check_for_prune_task", - soft_time_limit=JOB_TIMEOUT, -) -def check_for_prune_task() -> None: - """Runs periodically to check if any prune tasks should be run and adds them - to the queue""" - - with Session(get_sqlalchemy_engine()) as db_session: - all_cc_pairs = get_connector_credential_pairs(db_session) - - for cc_pair in all_cc_pairs: - if should_prune_cc_pair( - connector=cc_pair.connector, - credential=cc_pair.credential, - db_session=db_session, - ): - task_logger.info(f"Pruning the {cc_pair.connector.name} connector") - - prune_documents_task.apply_async( - kwargs=dict( - connector_id=cc_pair.connector.id, - credential_id=cc_pair.credential.id, - ) - ) - - -@celery_app.task( - name="vespa_metadata_sync_task", - bind=True, - soft_time_limit=45, - time_limit=60, - max_retries=3, -) -def vespa_metadata_sync_task(self: Task, document_id: str) -> bool: - task_logger.info(f"document_id={document_id}") - - try: - with Session(get_sqlalchemy_engine()) as db_session: - curr_ind_name, sec_ind_name = get_both_index_names(db_session) - document_index = get_default_document_index( - 
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name - ) - - doc = get_document(document_id, db_session) - if not doc: - return False - - # document set sync - doc_sets = fetch_document_sets_for_document(document_id, db_session) - update_doc_sets: set[str] = set(doc_sets) - - # User group sync - doc_access = get_access_for_document( - document_id=document_id, db_session=db_session - ) - update_request = UpdateRequest( - document_ids=[document_id], - document_sets=update_doc_sets, - access=doc_access, - boost=doc.boost, - hidden=doc.hidden, - ) - - # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - document_index.update_single(update_request=update_request) - - # update db last. Worst case = we crash right before this and - # the sync might repeat again later - mark_document_as_synced(document_id, db_session) - except SoftTimeLimitExceeded: - task_logger.info(f"SoftTimeLimitExceeded exception. doc_id={document_id}") - except Exception as e: - task_logger.exception("Unexpected exception") - - # Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 - countdown = 2 ** (self.request.retries + 4) - self.retry(exc=e, countdown=countdown) - - return True - - -@celery_app.task( - name="document_by_cc_pair_cleanup_task", - bind=True, - soft_time_limit=45, - time_limit=60, - max_retries=3, -) -def document_by_cc_pair_cleanup_task( - self: Task, document_id: str, connector_id: int, credential_id: int -) -> bool: - task_logger.info(f"document_id={document_id}") - - try: - with Session(get_sqlalchemy_engine()) as db_session: - curr_ind_name, sec_ind_name = get_both_index_names(db_session) - document_index = get_default_document_index( - primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name - ) - - count = get_document_connector_count(db_session, document_id) - if count == 1: - # count == 1 means this is the only remaining cc_pair reference to the doc - # delete it from vespa and the db - document_index.delete(doc_ids=[document_id]) - delete_documents_complete__no_commit( - db_session=db_session, - document_ids=[document_id], - ) - elif count > 1: - # count > 1 means the document still has cc_pair references - doc = get_document(document_id, db_session) - if not doc: - return False - - # the below functions do not include cc_pairs being deleted. - # i.e. they will correctly omit access for the current cc_pair - doc_access = get_access_for_document( - document_id=document_id, db_session=db_session - ) - - doc_sets = fetch_document_sets_for_document(document_id, db_session) - update_doc_sets: set[str] = set(doc_sets) - - update_request = UpdateRequest( - document_ids=[document_id], - document_sets=update_doc_sets, - access=doc_access, - boost=doc.boost, - hidden=doc.hidden, - ) - - # update Vespa. OK if doc doesn't exist. Raises exception otherwise. - document_index.update_single(update_request=update_request) - - # there are still other cc_pair references to the doc, so just resync to Vespa - delete_document_by_connector_credential_pair__no_commit( - db_session=db_session, - document_id=document_id, - connector_credential_pair_identifier=ConnectorCredentialPairIdentifier( - connector_id=connector_id, - credential_id=credential_id, - ), - ) - - mark_document_as_synced(document_id, db_session) - else: - pass - - # update_docs_last_modified__no_commit( - # db_session=db_session, - # document_ids=[document_id], - # ) - - db_session.commit() - except SoftTimeLimitExceeded: - task_logger.info(f"SoftTimeLimitExceeded exception. 
doc_id={document_id}") - except Exception as e: - task_logger.exception("Unexpected exception") - - # Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 - countdown = 2 ** (self.request.retries + 4) - self.retry(exc=e, countdown=countdown) - - return True - - -@signals.task_postrun.connect -def celery_task_postrun( - sender: Any | None = None, - task_id: str | None = None, - task: Task | None = None, - args: tuple | None = None, - kwargs: dict | None = None, - retval: Any | None = None, - state: str | None = None, - **kwds: Any, -) -> None: - """We handle this signal in order to remove completed tasks - from their respective tasksets. This allows us to track the progress of document set - and user group syncs. - - This function runs after any task completes (both success and failure) - Note that this signal does not fire on a task that failed to complete and is going - to be retried. - """ - if not task: - return - - task_logger.debug(f"Task {task.name} (ID: {task_id}) completed with state: {state}") - # logger.debug(f"Result: {retval}") - - if state not in READY_STATES: - return - - if not task_id: - return - - if task_id.startswith(RedisConnectorCredentialPair.PREFIX): - r = redis_pool.get_client() - r.srem(RedisConnectorCredentialPair.get_taskset_key(), task_id) - return - - if task_id.startswith(RedisDocumentSet.PREFIX): - r = redis_pool.get_client() - document_set_id = RedisDocumentSet.get_id_from_task_id(task_id) - if document_set_id is not None: - rds = RedisDocumentSet(document_set_id) - r.srem(rds.taskset_key, task_id) - return - - if task_id.startswith(RedisUserGroup.PREFIX): - r = redis_pool.get_client() - usergroup_id = RedisUserGroup.get_id_from_task_id(task_id) - if usergroup_id is not None: - rug = RedisUserGroup(usergroup_id) - r.srem(rug.taskset_key, task_id) - return - - if task_id.startswith(RedisConnectorDeletion.PREFIX): - r = redis_pool.get_client() - cc_pair_id = RedisConnectorDeletion.get_id_from_task_id(task_id) - if cc_pair_id is not None: - rcd = RedisConnectorDeletion(cc_pair_id) - r.srem(rcd.taskset_key, task_id) - return - - -def monitor_connector_taskset(r: Redis) -> None: - fence_value = r.get(RedisConnectorCredentialPair.get_fence_key()) - if fence_value is None: - return - - try: - initial_count = int(cast(int, fence_value)) - except ValueError: - task_logger.error("The value is not an integer.") - return - - count = r.scard(RedisConnectorCredentialPair.get_taskset_key()) - task_logger.info(f"Stale documents: remaining={count} initial={initial_count}") - if count == 0: - r.delete(RedisConnectorCredentialPair.get_taskset_key()) - r.delete(RedisConnectorCredentialPair.get_fence_key()) - task_logger.info(f"Successfully synced stale documents. 
count={initial_count}") - - -def monitor_document_set_taskset(key_bytes: bytes, r: Redis) -> None: - fence_key = key_bytes.decode("utf-8") - document_set_id = RedisDocumentSet.get_id_from_fence_key(fence_key) - if document_set_id is None: - task_logger.warning("could not parse document set id from {key}") - return - - rds = RedisDocumentSet(document_set_id) - - fence_value = r.get(rds.fence_key) - if fence_value is None: - return - - try: - initial_count = int(cast(int, fence_value)) - except ValueError: - task_logger.error("The value is not an integer.") - return - - count = cast(int, r.scard(rds.taskset_key)) - task_logger.info( - f"Document set sync: document_set_id={document_set_id} remaining={count} initial={initial_count}" - ) - if count > 0: - return - - with Session(get_sqlalchemy_engine()) as db_session: - document_set = cast( - DocumentSet, - get_document_set_by_id( - db_session=db_session, document_set_id=document_set_id - ), - ) # casting since we "know" a document set with this ID exists - if document_set: - if not document_set.connector_credential_pairs: - # if there are no connectors, then delete the document set. - delete_document_set( - document_set_row=document_set, db_session=db_session - ) - task_logger.info( - f"Successfully deleted document set with ID: '{document_set_id}'!" - ) - else: - mark_document_set_as_synced(document_set_id, db_session) - task_logger.info( - f"Successfully synced document set with ID: '{document_set_id}'!" - ) - - r.delete(rds.taskset_key) - r.delete(rds.fence_key) - - -def monitor_connector_deletion_taskset(key_bytes: bytes, r: Redis) -> None: - fence_key = key_bytes.decode("utf-8") - cc_pair_id = RedisConnectorDeletion.get_id_from_fence_key(fence_key) - if cc_pair_id is None: - task_logger.warning("could not parse document set id from {key}") - return - - rcd = RedisConnectorDeletion(cc_pair_id) - - fence_value = r.get(rcd.fence_key) - if fence_value is None: - return - - try: - initial_count = int(cast(int, fence_value)) - except ValueError: - task_logger.error("The value is not an integer.") - return - - count = cast(int, r.scard(rcd.taskset_key)) - task_logger.info( - f"Connector deletion: cc_pair_id={cc_pair_id} remaining={count} initial={initial_count}" - ) - if count > 0: - return - - with Session(get_sqlalchemy_engine()) as db_session: - cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) - if not cc_pair: - return - - try: - # clean up the rest of the related Postgres entities - # index attempts - delete_index_attempts( - db_session=db_session, - cc_pair_id=cc_pair.id, - ) - - # document sets - delete_document_set_cc_pair_relationship__no_commit( - db_session=db_session, - connector_id=cc_pair.connector_id, - credential_id=cc_pair.credential_id, - ) - - # user groups - cleanup_user_groups = fetch_versioned_implementation_with_fallback( - "danswer.db.user_group", - "delete_user_group_cc_pair_relationship__no_commit", - noop_fallback, - ) - cleanup_user_groups( - cc_pair_id=cc_pair.id, - db_session=db_session, - ) - - # finally, delete the cc-pair - delete_connector_credential_pair__no_commit( - db_session=db_session, - connector_id=cc_pair.connector_id, - credential_id=cc_pair.credential_id, - ) - # if there are no credentials left, delete the connector - connector = fetch_connector_by_id( - db_session=db_session, - connector_id=cc_pair.connector_id, - ) - if not connector or not len(connector.credentials): - task_logger.info( - "Found no credentials left for connector, deleting connector" - ) - 
db_session.delete(connector) - db_session.commit() - except Exception as e: - stack_trace = traceback.format_exc() - error_message = f"Error: {str(e)}\n\nStack Trace:\n{stack_trace}" - add_deletion_failure_message(db_session, cc_pair.id, error_message) - task_logger.exception( - f"Failed to run connector_deletion. " - f"connector_id={cc_pair.connector_id} credential_id={cc_pair.credential_id}" - ) - raise e - - task_logger.info( - f"Successfully deleted connector_credential_pair with connector_id: '{cc_pair.connector_id}' " - f"and credential_id: '{cc_pair.credential_id}'. " - f"Deleted {initial_count} docs." - ) - - r.delete(rcd.taskset_key) - r.delete(rcd.fence_key) - - -@celery_app.task(name="monitor_vespa_sync", soft_time_limit=300) -def monitor_vespa_sync() -> None: - """This is a celery beat task that monitors and finalizes metadata sync tasksets. - It scans for fence values and then gets the counts of any associated tasksets. - If the count is 0, that means all tasks finished and we should clean up. - - This task lock timeout is CELERY_METADATA_SYNC_BEAT_LOCK_TIMEOUT seconds, so don't - do anything too expensive in this function! - """ - r = redis_pool.get_client() - - lock_beat = r.lock( - DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK, - timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, - ) - - try: - # prevent overlapping tasks - if not lock_beat.acquire(blocking=False): - return - - if r.exists(RedisConnectorCredentialPair.get_fence_key()): - monitor_connector_taskset(r) - - for key_bytes in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"): - monitor_document_set_taskset(key_bytes, r) - - for key_bytes in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"): - monitor_usergroup_taskset = fetch_versioned_implementation_with_fallback( - "danswer.background.celery_utils", - "monitor_usergroup_taskset", - noop_fallback, - ) - - monitor_usergroup_taskset(key_bytes, r) - - for key_bytes in r.scan_iter(RedisConnectorDeletion.FENCE_PREFIX + "*"): - monitor_connector_deletion_taskset(key_bytes, r) - - # r_celery = celery_app.broker_connection().channel().client - # length = celery_get_queue_length(DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery) - # task_logger.warning(f"queue={DanswerCeleryQueues.VESPA_METADATA_SYNC} length={length}") - except SoftTimeLimitExceeded: - task_logger.info( - "Soft time limit exceeded, task is being terminated gracefully." 
- ) - finally: - if lock_beat.owned(): - lock_beat.release() - - -@beat_init.connect -def on_beat_init(sender: Any, **kwargs: Any) -> None: - init_sqlalchemy_engine(POSTGRES_CELERY_BEAT_APP_NAME) - - -@worker_init.connect -def on_worker_init(sender: Any, **kwargs: Any) -> None: - init_sqlalchemy_engine(POSTGRES_CELERY_WORKER_APP_NAME) - - # TODO(rkuo): this is singleton work that should be done on startup exactly once - # if we run multiple workers, we'll need to centralize where this cleanup happens - r = redis_pool.get_client() - - r.delete(DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK) - r.delete(DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK) - - r.delete(RedisConnectorCredentialPair.get_taskset_key()) - r.delete(RedisConnectorCredentialPair.get_fence_key()) - - for key in r.scan_iter(RedisDocumentSet.TASKSET_PREFIX + "*"): - r.delete(key) - - for key in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"): - r.delete(key) - - for key in r.scan_iter(RedisUserGroup.TASKSET_PREFIX + "*"): - r.delete(key) - - for key in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"): - r.delete(key) - - for key in r.scan_iter(RedisConnectorDeletion.TASKSET_PREFIX + "*"): - r.delete(key) - - for key in r.scan_iter(RedisConnectorDeletion.FENCE_PREFIX + "*"): - r.delete(key) - - -class CeleryTaskPlainFormatter(PlainFormatter): - def format(self, record: logging.LogRecord) -> str: - task = current_task - if task and task.request: - record.__dict__.update(task_id=task.request.id, task_name=task.name) - record.msg = f"[{task.name}({task.request.id})] {record.msg}" - - return super().format(record) - - -class CeleryTaskColoredFormatter(ColoredFormatter): - def format(self, record: logging.LogRecord) -> str: - task = current_task - if task and task.request: - record.__dict__.update(task_id=task.request.id, task_name=task.name) - record.msg = f"[{task.name}({task.request.id})] {record.msg}" - - return super().format(record) - - -@signals.setup_logging.connect -def on_setup_logging( - loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any -) -> None: - # TODO: could unhardcode format and colorize and accept these as options from - # celery's config - - # reformats celery's worker logger - root_logger = logging.getLogger() - - root_handler = logging.StreamHandler() # Set up a handler for the root logger - root_formatter = ColoredFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - root_handler.setFormatter(root_formatter) - root_logger.addHandler(root_handler) # Apply the handler to the root logger - - if logfile: - root_file_handler = logging.FileHandler(logfile) - root_file_formatter = PlainFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - root_file_handler.setFormatter(root_file_formatter) - root_logger.addHandler(root_file_handler) - - root_logger.setLevel(loglevel) - - # reformats celery's task logger - task_formatter = CeleryTaskColoredFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - task_handler = logging.StreamHandler() # Set up a handler for the task logger - task_handler.setFormatter(task_formatter) - task_logger.addHandler(task_handler) # Apply the handler to the task logger - - if logfile: - task_file_handler = logging.FileHandler(logfile) - task_file_formatter = CeleryTaskPlainFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - 
task_file_handler.setFormatter(task_file_formatter) - task_logger.addHandler(task_file_handler) - - task_logger.setLevel(loglevel) - task_logger.propagate = False - - -class CeleryTaskPlainFormatter(PlainFormatter): - def format(self, record: logging.LogRecord) -> str: - task = current_task - if task and task.request: - record.__dict__.update(task_id=task.request.id, task_name=task.name) - record.msg = f"[{task.name}({task.request.id})] {record.msg}" - - return super().format(record) - - -class CeleryTaskColoredFormatter(ColoredFormatter): - def format(self, record: logging.LogRecord) -> str: - task = current_task - if task and task.request: - record.__dict__.update(task_id=task.request.id, task_name=task.name) - record.msg = f"[{task.name}({task.request.id})] {record.msg}" - - return super().format(record) - - -@signals.setup_logging.connect -def on_setup_logging( - loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any -) -> None: - # TODO: could unhardcode format and colorize and accept these as options from - # celery's config - - # reformats celery's worker logger - root_logger = logging.getLogger() - - root_handler = logging.StreamHandler() # Set up a handler for the root logger - root_formatter = ColoredFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - root_handler.setFormatter(root_formatter) - root_logger.addHandler(root_handler) # Apply the handler to the root logger - - if logfile: - root_file_handler = logging.FileHandler(logfile) - root_file_formatter = PlainFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - root_file_handler.setFormatter(root_file_formatter) - root_logger.addHandler(root_file_handler) - - root_logger.setLevel(loglevel) - - # reformats celery's task logger - task_formatter = CeleryTaskColoredFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - task_handler = logging.StreamHandler() # Set up a handler for the task logger - task_handler.setFormatter(task_formatter) - task_logger.addHandler(task_handler) # Apply the handler to the task logger - - if logfile: - task_file_handler = logging.FileHandler(logfile) - task_file_formatter = CeleryTaskPlainFormatter( - "%(asctime)s %(filename)30s %(lineno)4s: %(message)s", - datefmt="%m/%d/%Y %I:%M:%S %p", - ) - task_file_handler.setFormatter(task_file_formatter) - task_logger.addHandler(task_file_handler) - - task_logger.setLevel(loglevel) - task_logger.propagate = False - - -##### -# Celery Beat (Periodic Tasks) Settings -##### -celery_app.conf.beat_schedule = { - "check-for-vespa-sync": { - "task": "check_for_vespa_sync_task", - "schedule": timedelta(seconds=5), - "options": {"priority": DanswerCeleryPriority.HIGH}, - }, -} -celery_app.conf.beat_schedule.update( - { - "check-for-connector-deletion-task": { - "task": "check_for_connector_deletion_task", - # don't need to check too often, since we kick off a deletion initially - # during the API call that actually marks the CC pair for deletion - "schedule": timedelta(minutes=1), - "options": {"priority": DanswerCeleryPriority.HIGH}, - }, - } -) -celery_app.conf.beat_schedule.update( - { - "check-for-prune": { - "task": "check_for_prune_task", - "schedule": timedelta(seconds=5), - "options": {"priority": DanswerCeleryPriority.HIGH}, - }, - } -) -celery_app.conf.beat_schedule.update( - { - "kombu-message-cleanup": { - "task": "kombu_message_cleanup_task", - "schedule": 
timedelta(seconds=3600), - "options": {"priority": DanswerCeleryPriority.LOWEST}, - }, - } -) -celery_app.conf.beat_schedule.update( - { - "monitor-vespa-sync": { - "task": "monitor_vespa_sync", - "schedule": timedelta(seconds=5), - "options": {"priority": DanswerCeleryPriority.HIGH}, - }, - } -) diff --git a/backend/danswer/background/celery/celery_redis.py b/backend/danswer/background/celery/celery_redis.py index 1d837bd51e0..3e205d71ded 100644 --- a/backend/danswer/background/celery/celery_redis.py +++ b/backend/danswer/background/celery/celery_redis.py @@ -1,346 +1,10 @@ # These are helper objects for tracking the keys we need to write in redis -import time -from abc import ABC -from abc import abstractmethod from typing import cast -from uuid import uuid4 -import redis -from celery import Celery from redis import Redis -from sqlalchemy.orm import Session -from danswer.background.celery.celeryconfig import CELERY_SEPARATOR -from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.background.celery.configs.base import CELERY_SEPARATOR from danswer.configs.constants import DanswerCeleryPriority -from danswer.configs.constants import DanswerCeleryQueues -from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id -from danswer.db.document import construct_document_select_for_connector_credential_pair -from danswer.db.document import ( - construct_document_select_for_connector_credential_pair_by_needs_sync, -) -from danswer.db.document_set import construct_document_select_by_docset -from danswer.utils.variable_functionality import fetch_versioned_implementation - - -class RedisObjectHelper(ABC): - PREFIX = "base" - FENCE_PREFIX = PREFIX + "_fence" - TASKSET_PREFIX = PREFIX + "_taskset" - - def __init__(self, id: int): - self._id: int = id - - @property - def task_id_prefix(self) -> str: - return f"{self.PREFIX}_{self._id}" - - @property - def fence_key(self) -> str: - # example: documentset_fence_1 - return f"{self.FENCE_PREFIX}_{self._id}" - - @property - def taskset_key(self) -> str: - # example: documentset_taskset_1 - return f"{self.TASKSET_PREFIX}_{self._id}" - - @staticmethod - def get_id_from_fence_key(key: str) -> int | None: - """ - Extracts the object ID from a fence key in the format `PREFIX_fence_X`. - - Args: - key (str): The fence key string. - - Returns: - Optional[int]: The extracted ID if the key is in the correct format, otherwise None. - """ - parts = key.split("_") - if len(parts) != 3: - return None - - try: - object_id = int(parts[2]) - except ValueError: - return None - - return object_id - - @staticmethod - def get_id_from_task_id(task_id: str) -> int | None: - """ - Extracts the object ID from a task ID string. - - This method assumes the task ID is formatted as `prefix_objectid_suffix`, where: - - `prefix` is an arbitrary string (e.g., the name of the task or entity), - - `objectid` is the ID you want to extract, - - `suffix` is another arbitrary string (e.g., a UUID). - - Example: - If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`, - this method will return the string `"1"`. - - Args: - task_id (str): The task ID string from which to extract the object ID. - - Returns: - str | None: The extracted object ID if the task ID is in the correct format, otherwise None. 
- """ - # example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc - parts = task_id.split("_") - if len(parts) != 3: - return None - - try: - object_id = int(parts[1]) - except ValueError: - return None - - return object_id - - @abstractmethod - def generate_tasks( - self, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: redis.lock.Lock, - ) -> int | None: - pass - - -class RedisDocumentSet(RedisObjectHelper): - PREFIX = "documentset" - FENCE_PREFIX = PREFIX + "_fence" - TASKSET_PREFIX = PREFIX + "_taskset" - - def generate_tasks( - self, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: redis.lock.Lock, - ) -> int | None: - last_lock_time = time.monotonic() - - async_results = [] - stmt = construct_document_select_by_docset(self._id, current_only=False) - for doc in db_session.scalars(stmt).yield_per(1): - current_time = time.monotonic() - if current_time - last_lock_time >= ( - CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 - ): - lock.reacquire() - last_lock_time = current_time - - # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" - # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" - # we prefix the task id so it's easier to keep track of who created the task - # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" - custom_task_id = f"{self.task_id_prefix}_{uuid4()}" - - # add to the set BEFORE creating the task. - redis_client.sadd(self.taskset_key, custom_task_id) - - result = celery_app.send_task( - "vespa_metadata_sync_task", - kwargs=dict(document_id=doc.id), - queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, - task_id=custom_task_id, - priority=DanswerCeleryPriority.LOW, - ) - - async_results.append(result) - - return len(async_results) - - -class RedisUserGroup(RedisObjectHelper): - PREFIX = "usergroup" - FENCE_PREFIX = PREFIX + "_fence" - TASKSET_PREFIX = PREFIX + "_taskset" - - def generate_tasks( - self, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: redis.lock.Lock, - ) -> int | None: - last_lock_time = time.monotonic() - - async_results = [] - - try: - construct_document_select_by_usergroup = fetch_versioned_implementation( - "danswer.db.user_group", - "construct_document_select_by_usergroup", - ) - except ModuleNotFoundError: - return 0 - - stmt = construct_document_select_by_usergroup(self._id) - for doc in db_session.scalars(stmt).yield_per(1): - current_time = time.monotonic() - if current_time - last_lock_time >= ( - CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 - ): - lock.reacquire() - last_lock_time = current_time - - # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" - # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" - # we prefix the task id so it's easier to keep track of who created the task - # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" - custom_task_id = f"{self.task_id_prefix}_{uuid4()}" - - # add to the set BEFORE creating the task. 
- redis_client.sadd(self.taskset_key, custom_task_id) - - result = celery_app.send_task( - "vespa_metadata_sync_task", - kwargs=dict(document_id=doc.id), - queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, - task_id=custom_task_id, - priority=DanswerCeleryPriority.LOW, - ) - - async_results.append(result) - - return len(async_results) - - -class RedisConnectorCredentialPair(RedisObjectHelper): - """This class differs from the default in that the taskset used spans - all connectors and is not per connector.""" - - PREFIX = "connectorsync" - FENCE_PREFIX = PREFIX + "_fence" - TASKSET_PREFIX = PREFIX + "_taskset" - - @classmethod - def get_fence_key(cls) -> str: - return RedisConnectorCredentialPair.FENCE_PREFIX - - @classmethod - def get_taskset_key(cls) -> str: - return RedisConnectorCredentialPair.TASKSET_PREFIX - - @property - def taskset_key(self) -> str: - """Notice that this is intentionally reusing the same taskset for all - connector syncs""" - # example: connector_taskset - return f"{self.TASKSET_PREFIX}" - - def generate_tasks( - self, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: redis.lock.Lock, - ) -> int | None: - last_lock_time = time.monotonic() - - async_results = [] - cc_pair = get_connector_credential_pair_from_id(self._id, db_session) - if not cc_pair: - return None - - stmt = construct_document_select_for_connector_credential_pair_by_needs_sync( - cc_pair.connector_id, cc_pair.credential_id - ) - for doc in db_session.scalars(stmt).yield_per(1): - current_time = time.monotonic() - if current_time - last_lock_time >= ( - CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 - ): - lock.reacquire() - last_lock_time = current_time - - # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" - # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" - # we prefix the task id so it's easier to keep track of who created the task - # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" - custom_task_id = f"{self.task_id_prefix}_{uuid4()}" - - # add to the tracking taskset in redis BEFORE creating the celery task. 
- # note that for the moment we are using a single taskset key, not differentiated by cc_pair id - redis_client.sadd( - RedisConnectorCredentialPair.get_taskset_key(), custom_task_id - ) - - # Priority on sync's triggered by new indexing should be medium - result = celery_app.send_task( - "vespa_metadata_sync_task", - kwargs=dict(document_id=doc.id), - queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, - task_id=custom_task_id, - priority=DanswerCeleryPriority.MEDIUM, - ) - - async_results.append(result) - - return len(async_results) - - -class RedisConnectorDeletion(RedisObjectHelper): - PREFIX = "connectordeletion" - FENCE_PREFIX = PREFIX + "_fence" - TASKSET_PREFIX = PREFIX + "_taskset" - - def generate_tasks( - self, - celery_app: Celery, - db_session: Session, - redis_client: Redis, - lock: redis.lock.Lock, - ) -> int | None: - last_lock_time = time.monotonic() - - async_results = [] - cc_pair = get_connector_credential_pair_from_id(self._id, db_session) - if not cc_pair: - return None - - stmt = construct_document_select_for_connector_credential_pair( - cc_pair.connector_id, cc_pair.credential_id - ) - for doc in db_session.scalars(stmt).yield_per(1): - current_time = time.monotonic() - if current_time - last_lock_time >= ( - CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 - ): - lock.reacquire() - last_lock_time = current_time - - # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" - # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" - # we prefix the task id so it's easier to keep track of who created the task - # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" - custom_task_id = f"{self.task_id_prefix}_{uuid4()}" - - # add to the tracking taskset in redis BEFORE creating the celery task. 
- # note that for the moment we are using a single taskset key, not differentiated by cc_pair id - redis_client.sadd(self.taskset_key, custom_task_id) - - # Priority on sync's triggered by new indexing should be medium - result = celery_app.send_task( - "document_by_cc_pair_cleanup_task", - kwargs=dict( - document_id=doc.id, - connector_id=cc_pair.connector_id, - credential_id=cc_pair.credential_id, - ), - queue=DanswerCeleryQueues.CONNECTOR_DELETION, - task_id=custom_task_id, - priority=DanswerCeleryPriority.MEDIUM, - ) - - async_results.append(result) - - return len(async_results) def celery_get_queue_length(queue: str, r: Redis) -> int: diff --git a/backend/danswer/background/celery/celery_utils.py b/backend/danswer/background/celery/celery_utils.py index 65377a4d754..22142fee202 100644 --- a/backend/danswer/background/celery/celery_utils.py +++ b/backend/danswer/background/celery/celery_utils.py @@ -1,39 +1,35 @@ from datetime import datetime from datetime import timezone +from typing import Any from sqlalchemy.orm import Session -from danswer.background.celery.celery_redis import RedisConnectorDeletion -from danswer.background.task_utils import name_cc_prune_task -from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) from danswer.connectors.interfaces import BaseConnector -from danswer.connectors.interfaces import IdConnector from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import Document from danswer.db.connector_credential_pair import get_connector_credential_pair -from danswer.db.engine import get_db_current_time from danswer.db.enums import TaskStatus -from danswer.db.models import Connector -from danswer.db.models import Credential from danswer.db.models import TaskQueueState -from danswer.db.tasks import check_task_is_live_and_not_timed_out -from danswer.db.tasks import get_latest_task -from danswer.db.tasks import get_latest_task_by_type -from danswer.redis.redis_pool import RedisPool +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface +from danswer.redis.redis_connector import RedisConnector from danswer.server.documents.models import DeletionAttemptSnapshot from danswer.utils.logger import setup_logger + logger = setup_logger() -redis_pool = RedisPool() def _get_deletion_status( - connector_id: int, credential_id: int, db_session: Session + connector_id: int, + credential_id: int, + db_session: Session, + tenant_id: str | None = None, ) -> TaskQueueState | None: """We no longer store TaskQueueState in the DB for a deletion attempt. This function populates TaskQueueState by just checking redis. 
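Aside: a minimal sketch of what that redis check amounts to, assuming the RedisConnector.delete.fenced property used in the next hunk still reduces to an EXISTS on the connectordeletion_fence_<cc_pair_id> key that the removed RedisConnectorDeletion helper used (illustrative sketch, not code from this patch):

from redis import Redis

def deletion_is_fenced(r: Redis, cc_pair_id: int) -> bool:
    # Assumed key naming, copied from the removed RedisConnectorDeletion helper:
    # PREFIX "connectordeletion" + "_fence" + "_<id>".
    fence_key = f"connectordeletion_fence_{cc_pair_id}"
    # The fence is set to the number of generated tasks and deleted once the
    # taskset drains, so its mere existence means deletion work is outstanding.
    return bool(r.exists(fence_key))

# Hypothetical usage:
# r = Redis(host="localhost", port=6379, db=0)
# if deletion_is_fenced(r, cc_pair_id=1):
#     ...  # report the deletion attempt as TaskStatus.STARTED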
@@ -44,21 +40,26 @@ def _get_deletion_status( if not cc_pair: return None - rcd = RedisConnectorDeletion(cc_pair.id) - - r = redis_pool.get_client() - if not r.exists(rcd.fence_key): + redis_connector = RedisConnector(tenant_id, cc_pair.id) + if not redis_connector.delete.fenced: return None return TaskQueueState( - task_id="", task_name=rcd.fence_key, status=TaskStatus.STARTED + task_id="", + task_name=redis_connector.delete.fence_key, + status=TaskStatus.STARTED, ) def get_deletion_attempt_snapshot( - connector_id: int, credential_id: int, db_session: Session + connector_id: int, + credential_id: int, + db_session: Session, + tenant_id: str | None = None, ) -> DeletionAttemptSnapshot | None: - deletion_task = _get_deletion_status(connector_id, credential_id, db_session) + deletion_task = _get_deletion_status( + connector_id, credential_id, db_session, tenant_id + ) if not deletion_task: return None @@ -69,60 +70,31 @@ def get_deletion_attempt_snapshot( ) -def should_prune_cc_pair( - connector: Connector, credential: Credential, db_session: Session -) -> bool: - if not connector.prune_freq: - return False - - pruning_task_name = name_cc_prune_task( - connector_id=connector.id, credential_id=credential.id - ) - last_pruning_task = get_latest_task(pruning_task_name, db_session) - current_db_time = get_db_current_time(db_session) - - if not last_pruning_task: - time_since_initialization = current_db_time - connector.time_created - if time_since_initialization.total_seconds() >= connector.prune_freq: - return True - return False - - if not ALLOW_SIMULTANEOUS_PRUNING: - pruning_type_task_name = name_cc_prune_task() - last_pruning_type_task = get_latest_task_by_type( - pruning_type_task_name, db_session - ) - - if last_pruning_type_task and check_task_is_live_and_not_timed_out( - last_pruning_type_task, db_session - ): - return False - - if check_task_is_live_and_not_timed_out(last_pruning_task, db_session): - return False - - if not last_pruning_task.start_time: - return False - - time_since_last_pruning = current_db_time - last_pruning_task.start_time - return time_since_last_pruning.total_seconds() >= connector.prune_freq - - -def document_batch_to_ids(doc_batch: list[Document]) -> set[str]: +def document_batch_to_ids( + doc_batch: list[Document], +) -> set[str]: return {doc.id for doc in doc_batch} -def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> set[str]: +def extract_ids_from_runnable_connector( + runnable_connector: BaseConnector, + callback: IndexingHeartbeatInterface | None = None, +) -> set[str]: """ - If the PruneConnector hasnt been implemented for the given connector, just pull - all docs using the load_from_state and grab out the IDs + If the SlimConnector hasnt been implemented for the given connector, just pull + all docs using the load_from_state and grab out the IDs. + + Optionally, a callback can be passed to handle the length of each document batch. 
""" all_connector_doc_ids: set[str] = set() + if isinstance(runnable_connector, SlimConnector): + for metadata_batch in runnable_connector.retrieve_all_slim_documents(): + all_connector_doc_ids.update({doc.id for doc in metadata_batch}) + doc_batch_generator = None - if isinstance(runnable_connector, IdConnector): - all_connector_doc_ids = runnable_connector.retrieve_all_source_ids() - elif isinstance(runnable_connector, LoadConnector): + + if isinstance(runnable_connector, LoadConnector): doc_batch_generator = runnable_connector.load_from_state() elif isinstance(runnable_connector, PollConnector): start = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp() @@ -131,13 +103,46 @@ def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> se else: raise RuntimeError("Pruning job could not find a valid runnable_connector.") - if doc_batch_generator: - doc_batch_processing_func = document_batch_to_ids - if MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE: - doc_batch_processing_func = rate_limit_builder( - max_calls=MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE, period=60 - )(document_batch_to_ids) - for doc_batch in doc_batch_generator: - all_connector_doc_ids.update(doc_batch_processing_func(doc_batch)) + doc_batch_processing_func = document_batch_to_ids + if MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE: + doc_batch_processing_func = rate_limit_builder( + max_calls=MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE, period=60 + )(document_batch_to_ids) + for doc_batch in doc_batch_generator: + if callback: + if callback.should_stop(): + raise RuntimeError( + "extract_ids_from_runnable_connector: Stop signal detected" + ) + + all_connector_doc_ids.update(doc_batch_processing_func(doc_batch)) + + if callback: + callback.progress("extract_ids_from_runnable_connector", len(doc_batch)) return all_connector_doc_ids + + +def celery_is_listening_to_queue(worker: Any, name: str) -> bool: + """Checks to see if we're listening to the named queue""" + + # how to get a list of queues this worker is listening to + # https://stackoverflow.com/questions/29790523/how-to-determine-which-queues-a-celery-worker-is-consuming-at-runtime + queue_names = list(worker.app.amqp.queues.consume_from.keys()) + for queue_name in queue_names: + if queue_name == name: + return True + + return False + + +def celery_is_worker_primary(worker: Any) -> bool: + """There are multiple approaches that could be taken to determine if a celery worker + is 'primary', as defined by us. 
But the way we do it is to check the hostname set + for the celery worker, which can be done on the + command line with '--hostname'.""" + hostname = worker.hostname + if hostname.startswith("primary"): + return True + + return False diff --git a/backend/danswer/background/celery/celeryconfig.py b/backend/danswer/background/celery/configs/base.py similarity index 66% rename from backend/danswer/background/celery/celeryconfig.py rename to backend/danswer/background/celery/configs/base.py index d0314adf865..886fcf545c9 100644 --- a/backend/danswer/background/celery/celeryconfig.py +++ b/backend/danswer/background/celery/configs/base.py @@ -1,7 +1,11 @@ # docs: https://docs.celeryq.dev/en/stable/userguide/configuration.html +import urllib.parse + +from danswer.configs.app_configs import CELERY_BROKER_POOL_LIMIT from danswer.configs.app_configs import CELERY_RESULT_EXPIRES from danswer.configs.app_configs import REDIS_DB_NUMBER_CELERY from danswer.configs.app_configs import REDIS_DB_NUMBER_CELERY_RESULT_BACKEND +from danswer.configs.app_configs import REDIS_HEALTH_CHECK_INTERVAL from danswer.configs.app_configs import REDIS_HOST from danswer.configs.app_configs import REDIS_PASSWORD from danswer.configs.app_configs import REDIS_PORT @@ -9,12 +13,13 @@ from danswer.configs.app_configs import REDIS_SSL_CA_CERTS from danswer.configs.app_configs import REDIS_SSL_CERT_REQS from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import REDIS_SOCKET_KEEPALIVE_OPTIONS CELERY_SEPARATOR = ":" CELERY_PASSWORD_PART = "" if REDIS_PASSWORD: - CELERY_PASSWORD_PART = f":{REDIS_PASSWORD}@" + CELERY_PASSWORD_PART = ":" + urllib.parse.quote(REDIS_PASSWORD, safe="") + "@" REDIS_SCHEME = "redis" @@ -26,29 +31,51 @@ if REDIS_SSL_CA_CERTS: SSL_QUERY_PARAMS += f"&ssl_ca_certs={REDIS_SSL_CA_CERTS}" +# region Broker settings # example celery_broker_url: "redis://:password@localhost:6379/15" broker_url = f"{REDIS_SCHEME}://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY}{SSL_QUERY_PARAMS}" -result_backend = f"{REDIS_SCHEME}://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY_RESULT_BACKEND}{SSL_QUERY_PARAMS}" - -# NOTE: prefetch 4 is significantly faster than prefetch 1 for small tasks -# however, prefetching is bad when tasks are lengthy as those tasks -# can stall other tasks. 
-worker_prefetch_multiplier = 4 +broker_connection_retry_on_startup = True +broker_pool_limit = CELERY_BROKER_POOL_LIMIT +# redis broker settings +# https://docs.celeryq.dev/projects/kombu/en/stable/reference/kombu.transport.redis.html broker_transport_options = { "priority_steps": list(range(len(DanswerCeleryPriority))), "sep": CELERY_SEPARATOR, "queue_order_strategy": "priority", + "retry_on_timeout": True, + "health_check_interval": REDIS_HEALTH_CHECK_INTERVAL, + "socket_keepalive": True, + "socket_keepalive_options": REDIS_SOCKET_KEEPALIVE_OPTIONS, } +# endregion + +# redis backend settings +# https://docs.celeryq.dev/en/stable/userguide/configuration.html#redis-backend-settings + +# there doesn't appear to be a way to set socket_keepalive_options on the redis result backend +redis_socket_keepalive = True +redis_retry_on_timeout = True +redis_backend_health_check_interval = REDIS_HEALTH_CHECK_INTERVAL + task_default_priority = DanswerCeleryPriority.MEDIUM task_acks_late = True +# region Task result backend settings # It's possible we don't even need celery's result backend, in which case all of the optimization below # might be irrelevant +result_backend = f"{REDIS_SCHEME}://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY_RESULT_BACKEND}{SSL_QUERY_PARAMS}" result_expires = CELERY_RESULT_EXPIRES # 86400 seconds is the default +# endregion + +# Leaving this to the default of True may cause double logging since both our own app +# and celery think they are controlling the logger. +# TODO: Configure celery's logger entirely manually and set this to False +# worker_hijack_root_logger = False +# region Notes on serialization performance # Option 0: Defaults (json serializer, no compression) # about 1.5 KB per queued task. 1KB in queue, 400B for result, 100 as a child entry in generator result @@ -74,3 +101,4 @@ # task_serializer = "pickle-bzip2" # result_serializer = "pickle-bzip2" # accept_content=["pickle", "pickle-bzip2"] +# endregion diff --git a/backend/danswer/background/celery/configs/beat.py b/backend/danswer/background/celery/configs/beat.py new file mode 100644 index 00000000000..ef8b21c386f --- /dev/null +++ b/backend/danswer/background/celery/configs/beat.py @@ -0,0 +1,14 @@ +# docs: https://docs.celeryq.dev/en/stable/userguide/configuration.html +import danswer.background.celery.configs.base as shared_config + +broker_url = shared_config.broker_url +broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup +broker_pool_limit = shared_config.broker_pool_limit +broker_transport_options = shared_config.broker_transport_options + +redis_socket_keepalive = shared_config.redis_socket_keepalive +redis_retry_on_timeout = shared_config.redis_retry_on_timeout +redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval + +result_backend = shared_config.result_backend +result_expires = shared_config.result_expires # 86400 seconds is the default diff --git a/backend/danswer/background/celery/configs/heavy.py b/backend/danswer/background/celery/configs/heavy.py new file mode 100644 index 00000000000..2d1c65aa86e --- /dev/null +++ b/backend/danswer/background/celery/configs/heavy.py @@ -0,0 +1,20 @@ +import danswer.background.celery.configs.base as shared_config + +broker_url = shared_config.broker_url +broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup +broker_pool_limit = shared_config.broker_pool_limit +broker_transport_options = shared_config.broker_transport_options + 
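For context, a sketch of how a per-worker config module like this one is typically consumed; config_from_object is standard Celery and the module path matches the file added in this PR, but the app name and launch command are assumptions, not taken from this diff:

from celery import Celery

app = Celery("danswer")  # app name assumed for illustration
# Point the app at one of the worker config modules added in this PR.
app.config_from_object("danswer.background.celery.configs.heavy")

# A dedicated worker would then be started roughly like:
#   celery -A <danswer celery app module> worker --pool=threads --concurrency=4 --hostname=heavy@%h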
+redis_socket_keepalive = shared_config.redis_socket_keepalive +redis_retry_on_timeout = shared_config.redis_retry_on_timeout +redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval + +result_backend = shared_config.result_backend +result_expires = shared_config.result_expires # 86400 seconds is the default + +task_default_priority = shared_config.task_default_priority +task_acks_late = shared_config.task_acks_late + +worker_concurrency = 4 +worker_pool = "threads" +worker_prefetch_multiplier = 1 diff --git a/backend/danswer/background/celery/configs/indexing.py b/backend/danswer/background/celery/configs/indexing.py new file mode 100644 index 00000000000..d2b1b99baa9 --- /dev/null +++ b/backend/danswer/background/celery/configs/indexing.py @@ -0,0 +1,21 @@ +import danswer.background.celery.configs.base as shared_config +from danswer.configs.app_configs import CELERY_WORKER_INDEXING_CONCURRENCY + +broker_url = shared_config.broker_url +broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup +broker_pool_limit = shared_config.broker_pool_limit +broker_transport_options = shared_config.broker_transport_options + +redis_socket_keepalive = shared_config.redis_socket_keepalive +redis_retry_on_timeout = shared_config.redis_retry_on_timeout +redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval + +result_backend = shared_config.result_backend +result_expires = shared_config.result_expires # 86400 seconds is the default + +task_default_priority = shared_config.task_default_priority +task_acks_late = shared_config.task_acks_late + +worker_concurrency = CELERY_WORKER_INDEXING_CONCURRENCY +worker_pool = "threads" +worker_prefetch_multiplier = 1 diff --git a/backend/danswer/background/celery/configs/light.py b/backend/danswer/background/celery/configs/light.py new file mode 100644 index 00000000000..f75ddfd0fb5 --- /dev/null +++ b/backend/danswer/background/celery/configs/light.py @@ -0,0 +1,22 @@ +import danswer.background.celery.configs.base as shared_config +from danswer.configs.app_configs import CELERY_WORKER_LIGHT_CONCURRENCY +from danswer.configs.app_configs import CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER + +broker_url = shared_config.broker_url +broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup +broker_pool_limit = shared_config.broker_pool_limit +broker_transport_options = shared_config.broker_transport_options + +redis_socket_keepalive = shared_config.redis_socket_keepalive +redis_retry_on_timeout = shared_config.redis_retry_on_timeout +redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval + +result_backend = shared_config.result_backend +result_expires = shared_config.result_expires # 86400 seconds is the default + +task_default_priority = shared_config.task_default_priority +task_acks_late = shared_config.task_acks_late + +worker_concurrency = CELERY_WORKER_LIGHT_CONCURRENCY +worker_pool = "threads" +worker_prefetch_multiplier = CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER diff --git a/backend/danswer/background/celery/configs/primary.py b/backend/danswer/background/celery/configs/primary.py new file mode 100644 index 00000000000..2d1c65aa86e --- /dev/null +++ b/backend/danswer/background/celery/configs/primary.py @@ -0,0 +1,20 @@ +import danswer.background.celery.configs.base as shared_config + +broker_url = shared_config.broker_url +broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup 
+broker_pool_limit = shared_config.broker_pool_limit +broker_transport_options = shared_config.broker_transport_options + +redis_socket_keepalive = shared_config.redis_socket_keepalive +redis_retry_on_timeout = shared_config.redis_retry_on_timeout +redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval + +result_backend = shared_config.result_backend +result_expires = shared_config.result_expires # 86400 seconds is the default + +task_default_priority = shared_config.task_default_priority +task_acks_late = shared_config.task_acks_late + +worker_concurrency = 4 +worker_pool = "threads" +worker_prefetch_multiplier = 1 diff --git a/backend/danswer/background/celery/tasks/beat_schedule.py b/backend/danswer/background/celery/tasks/beat_schedule.py new file mode 100644 index 00000000000..3b18f8931e4 --- /dev/null +++ b/backend/danswer/background/celery/tasks/beat_schedule.py @@ -0,0 +1,60 @@ +from datetime import timedelta +from typing import Any + +from danswer.configs.constants import DanswerCeleryPriority + + +tasks_to_schedule = [ + { + "name": "check-for-vespa-sync", + "task": "check_for_vespa_sync_task", + "schedule": timedelta(seconds=20), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "check-for-connector-deletion", + "task": "check_for_connector_deletion_task", + "schedule": timedelta(seconds=20), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "check-for-indexing", + "task": "check_for_indexing", + "schedule": timedelta(seconds=15), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "check-for-prune", + "task": "check_for_pruning", + "schedule": timedelta(seconds=15), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "kombu-message-cleanup", + "task": "kombu_message_cleanup_task", + "schedule": timedelta(seconds=3600), + "options": {"priority": DanswerCeleryPriority.LOWEST}, + }, + { + "name": "monitor-vespa-sync", + "task": "monitor_vespa_sync", + "schedule": timedelta(seconds=5), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "check-for-doc-permissions-sync", + "task": "check_for_doc_permissions_sync", + "schedule": timedelta(seconds=30), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, + { + "name": "check-for-external-group-sync", + "task": "check_for_external_group_sync", + "schedule": timedelta(seconds=20), + "options": {"priority": DanswerCeleryPriority.HIGH}, + }, +] + + +def get_tasks_to_schedule() -> list[dict[str, Any]]: + return tasks_to_schedule diff --git a/backend/danswer/background/celery/tasks/connector_deletion/tasks.py b/backend/danswer/background/celery/tasks/connector_deletion/tasks.py new file mode 100644 index 00000000000..9413dd97854 --- /dev/null +++ b/backend/danswer/background/celery/tasks/connector_deletion/tasks.py @@ -0,0 +1,187 @@ +from datetime import datetime +from datetime import timezone + +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.app_base import task_logger +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerRedisLocks +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from 
danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.engine import get_session_with_tenant +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.search_settings import get_all_search_settings +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_connector_delete import RedisConnectorDeletePayload +from danswer.redis.redis_pool import get_redis_client + + +class TaskDependencyError(RuntimeError): + """Raised to the caller to indicate dependent tasks are running that would interfere + with connector deletion.""" + + +@shared_task( + name="check_for_connector_deletion_task", + soft_time_limit=JOB_TIMEOUT, + trail=False, + bind=True, +) +def check_for_connector_deletion_task(self: Task, *, tenant_id: str | None) -> None: + r = get_redis_client(tenant_id=tenant_id) + + lock_beat = r.lock( + DanswerRedisLocks.CHECK_CONNECTOR_DELETION_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return + + # collect cc_pair_ids + cc_pair_ids: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + cc_pairs = get_connector_credential_pairs(db_session) + for cc_pair in cc_pairs: + cc_pair_ids.append(cc_pair.id) + + # try running cleanup on the cc_pair_ids + for cc_pair_id in cc_pair_ids: + with get_session_with_tenant(tenant_id) as db_session: + redis_connector = RedisConnector(tenant_id, cc_pair_id) + try: + try_generate_document_cc_pair_cleanup_tasks( + self.app, cc_pair_id, db_session, r, lock_beat, tenant_id + ) + except TaskDependencyError as e: + # this means we wanted to start deleting but dependent tasks were running + # Leave a stop signal to clear indexing and pruning tasks more quickly + task_logger.info(str(e)) + redis_connector.stop.set_fence(True) + else: + # clear the stop signal if it exists ... no longer needed + redis_connector.stop.set_fence(False) + + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + + +def try_generate_document_cc_pair_cleanup_tasks( + app: Celery, + cc_pair_id: int, + db_session: Session, + r: Redis, + lock_beat: RedisLock, + tenant_id: str | None, +) -> int | None: + """Returns an int if syncing is needed. The int represents the number of sync tasks generated. + Note that syncing can still be required even if the number of sync tasks generated is zero. + Returns None if no syncing is required. + + Will raise TaskDependencyError if dependent tasks such as indexing and pruning are + still running. In our case, the caller reacts by setting a stop signal in Redis to + exit those tasks as quickly as possible. 
+ """ + + lock_beat.reacquire() + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + # don't generate sync tasks if tasks are still pending + if redis_connector.delete.fenced: + return None + + # we need to load the state of the object inside the fence + # to avoid a race condition with db.commit/fence deletion + # at the end of this taskset + cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) + if not cc_pair: + return None + + if cc_pair.status != ConnectorCredentialPairStatus.DELETING: + return None + + # set a basic fence to start + fence_payload = RedisConnectorDeletePayload( + num_tasks=None, + submitted=datetime.now(timezone.utc), + ) + + redis_connector.delete.set_fence(fence_payload) + + try: + # do not proceed if connector indexing or connector pruning are running + search_settings_list = get_all_search_settings(db_session) + for search_settings in search_settings_list: + redis_connector_index = redis_connector.new_index(search_settings.id) + if redis_connector_index.fenced: + raise TaskDependencyError( + f"Connector deletion - Delayed (indexing in progress): " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings.id}" + ) + + if redis_connector.prune.fenced: + raise TaskDependencyError( + f"Connector deletion - Delayed (pruning in progress): " + f"cc_pair={cc_pair_id}" + ) + + if redis_connector.permissions.fenced: + raise TaskDependencyError( + f"Connector deletion - Delayed (permissions in progress): " + f"cc_pair={cc_pair_id}" + ) + + # add tasks to celery and build up the task set to monitor in redis + redis_connector.delete.taskset_clear() + + # Add all documents that need to be updated into the queue + task_logger.info( + f"RedisConnectorDeletion.generate_tasks starting. cc_pair={cc_pair_id}" + ) + tasks_generated = redis_connector.delete.generate_tasks( + app, db_session, lock_beat + ) + if tasks_generated is None: + raise ValueError("RedisConnectorDeletion.generate_tasks returned None") + except TaskDependencyError: + redis_connector.delete.set_fence(None) + raise + except Exception: + task_logger.exception("Unexpected exception") + redis_connector.delete.set_fence(None) + return None + else: + # Currently we are allowing the sync to proceed with 0 tasks. + # It's possible for sets/groups to be generated initially with no entries + # and they still need to be marked as up to date. + # if tasks_generated == 0: + # return 0 + + task_logger.info( + f"RedisConnectorDeletion.generate_tasks finished. 
" + f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}" + ) + + # set this only after all tasks have been added + fence_payload.num_tasks = tasks_generated + redis_connector.delete.set_fence(fence_payload) + + return tasks_generated diff --git a/backend/danswer/background/celery/tasks/doc_permission_syncing/tasks.py b/backend/danswer/background/celery/tasks/doc_permission_syncing/tasks.py new file mode 100644 index 00000000000..eef14e980ca --- /dev/null +++ b/backend/danswer/background/celery/tasks/doc_permission_syncing/tasks.py @@ -0,0 +1,323 @@ +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from uuid import uuid4 + +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from redis import Redis + +from danswer.access.models import DocExternalAccess +from danswer.background.celery.apps.app_base import task_logger +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.configs.constants import DanswerRedisLocks +from danswer.configs.constants import DocumentSource +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.engine import get_session_with_tenant +from danswer.db.enums import AccessType +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.models import ConnectorCredentialPair +from danswer.db.users import batch_add_ext_perm_user_if_not_exists +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_connector_doc_perm_sync import ( + RedisConnectorPermissionSyncData, +) +from danswer.redis.redis_pool import get_redis_client +from danswer.utils.logger import doc_permission_sync_ctx +from danswer.utils.logger import setup_logger +from ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs +from ee.danswer.db.document import upsert_document_external_perms +from ee.danswer.external_permissions.sync_params import DOC_PERMISSION_SYNC_PERIODS +from ee.danswer.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP + +logger = setup_logger() + + +DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES = 3 + + +# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT +LIGHT_SOFT_TIME_LIMIT = 105 +LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15 + + +def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> bool: + """Returns boolean indicating if external doc permissions sync is due.""" + + if cc_pair.access_type != AccessType.SYNC: + return False + + # skip doc permissions sync if not active + if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE: + return False + + if cc_pair.status == ConnectorCredentialPairStatus.DELETING: + return False + + # If the last sync is None, it has never been run so we run the sync + last_perm_sync = cc_pair.last_time_perm_sync + if last_perm_sync is None: + return True + + source_sync_period = DOC_PERMISSION_SYNC_PERIODS.get(cc_pair.connector.source) + + # If RESTRICTED_FETCH_PERIOD[source] is None, we always run the sync. 
+ if not source_sync_period: + return True + + # If the last sync is greater than the full fetch period, we run the sync + next_sync = last_perm_sync + timedelta(seconds=source_sync_period) + if datetime.now(timezone.utc) >= next_sync: + return True + + return False + + +@shared_task( + name="check_for_doc_permissions_sync", + soft_time_limit=JOB_TIMEOUT, + bind=True, +) +def check_for_doc_permissions_sync(self: Task, *, tenant_id: str | None) -> None: + r = get_redis_client(tenant_id=tenant_id) + + lock_beat = r.lock( + DanswerRedisLocks.CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return + + # get all cc pairs that need to be synced + cc_pair_ids_to_sync: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + cc_pairs = get_all_auto_sync_cc_pairs(db_session) + + for cc_pair in cc_pairs: + if _is_external_doc_permissions_sync_due(cc_pair): + cc_pair_ids_to_sync.append(cc_pair.id) + + for cc_pair_id in cc_pair_ids_to_sync: + tasks_created = try_creating_permissions_sync_task( + self.app, cc_pair_id, r, tenant_id + ) + if not tasks_created: + continue + + task_logger.info(f"Doc permissions sync queued: cc_pair={cc_pair_id}") + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + + +def try_creating_permissions_sync_task( + app: Celery, + cc_pair_id: int, + r: Redis, + tenant_id: str | None, +) -> int | None: + """Returns an int if syncing is needed. The int represents the number of sync tasks generated. 
+ Returns None if no syncing is required.""" + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + LOCK_TIMEOUT = 30 + + lock = r.lock( + DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_permissions_sync_tasks", + timeout=LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2) + if not acquired: + return None + + try: + if redis_connector.permissions.fenced: + return None + + if redis_connector.delete.fenced: + return None + + if redis_connector.prune.fenced: + return None + + redis_connector.permissions.generator_clear() + redis_connector.permissions.taskset_clear() + + custom_task_id = f"{redis_connector.permissions.generator_task_key}_{uuid4()}" + + app.send_task( + "connector_permission_sync_generator_task", + kwargs=dict( + cc_pair_id=cc_pair_id, + tenant_id=tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC, + task_id=custom_task_id, + priority=DanswerCeleryPriority.HIGH, + ) + + # set a basic fence to start + payload = RedisConnectorPermissionSyncData( + started=None, + ) + + redis_connector.permissions.set_fence(payload) + except Exception: + task_logger.exception(f"Unexpected exception: cc_pair={cc_pair_id}") + return None + finally: + if lock.owned(): + lock.release() + + return 1 + + +@shared_task( + name="connector_permission_sync_generator_task", + acks_late=False, + soft_time_limit=JOB_TIMEOUT, + track_started=True, + trail=False, + bind=True, +) +def connector_permission_sync_generator_task( + self: Task, + cc_pair_id: int, + tenant_id: str | None, +) -> None: + """ + Permission sync task that handles document permission syncing for a given connector credential pair + This task assumes that the task has already been properly fenced + """ + + doc_permission_sync_ctx_dict = doc_permission_sync_ctx.get() + doc_permission_sync_ctx_dict["cc_pair_id"] = cc_pair_id + doc_permission_sync_ctx_dict["request_id"] = self.request.id + doc_permission_sync_ctx.set(doc_permission_sync_ctx_dict) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + r = get_redis_client(tenant_id=tenant_id) + + lock = r.lock( + DanswerRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX + + f"_{redis_connector.id}", + timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking=False) + if not acquired: + task_logger.warning( + f"Permission sync task already running, exiting...: cc_pair={cc_pair_id}" + ) + return None + + try: + with get_session_with_tenant(tenant_id) as db_session: + cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) + if cc_pair is None: + raise ValueError( + f"No connector credential pair found for id: {cc_pair_id}" + ) + + source_type = cc_pair.connector.source + + doc_sync_func = DOC_PERMISSIONS_FUNC_MAP.get(source_type) + if doc_sync_func is None: + raise ValueError( + f"No doc sync func found for {source_type} with cc_pair={cc_pair_id}" + ) + + logger.info(f"Syncing docs for {source_type} with cc_pair={cc_pair_id}") + + payload = RedisConnectorPermissionSyncData( + started=datetime.now(timezone.utc), + ) + redis_connector.permissions.set_fence(payload) + + document_external_accesses: list[DocExternalAccess] = doc_sync_func(cc_pair) + + task_logger.info( + f"RedisConnector.permissions.generate_tasks starting. 
cc_pair={cc_pair_id}" + ) + tasks_generated = redis_connector.permissions.generate_tasks( + self.app, lock, document_external_accesses, source_type + ) + if tasks_generated is None: + return None + + task_logger.info( + f"RedisConnector.permissions.generate_tasks finished. " + f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}" + ) + + redis_connector.permissions.generator_complete = tasks_generated + + except Exception as e: + task_logger.exception(f"Failed to run permission sync: cc_pair={cc_pair_id}") + + redis_connector.permissions.generator_clear() + redis_connector.permissions.taskset_clear() + redis_connector.permissions.set_fence(None) + raise e + finally: + if lock.owned(): + lock.release() + + +@shared_task( + name="update_external_document_permissions_task", + soft_time_limit=LIGHT_SOFT_TIME_LIMIT, + time_limit=LIGHT_TIME_LIMIT, + max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES, + bind=True, +) +def update_external_document_permissions_task( + self: Task, + tenant_id: str | None, + serialized_doc_external_access: dict, + source_string: str, +) -> bool: + document_external_access = DocExternalAccess.from_dict( + serialized_doc_external_access + ) + doc_id = document_external_access.doc_id + external_access = document_external_access.external_access + try: + with get_session_with_tenant(tenant_id) as db_session: + # Then we build the update requests to update vespa + batch_add_ext_perm_user_if_not_exists( + db_session=db_session, + emails=list(external_access.external_user_emails), + ) + upsert_document_external_perms( + db_session=db_session, + doc_id=doc_id, + external_access=external_access, + source_type=DocumentSource(source_string), + ) + + logger.debug( + f"Successfully synced postgres document permissions for {doc_id}" + ) + return True + except Exception: + logger.exception("Error Syncing Document Permissions") + return False diff --git a/backend/danswer/background/celery/tasks/external_group_syncing/tasks.py b/backend/danswer/background/celery/tasks/external_group_syncing/tasks.py new file mode 100644 index 00000000000..c3f0f6c6f15 --- /dev/null +++ b/backend/danswer/background/celery/tasks/external_group_syncing/tasks.py @@ -0,0 +1,269 @@ +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from uuid import uuid4 + +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from redis import Redis + +from danswer.background.celery.apps.app_base import task_logger +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.configs.constants import DanswerRedisLocks +from danswer.db.connector import mark_cc_pair_as_external_group_synced +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.engine import get_session_with_tenant +from danswer.db.enums import AccessType +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.models import ConnectorCredentialPair +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_pool import get_redis_client +from danswer.utils.logger import setup_logger +from 
ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs +from ee.danswer.db.external_perm import ExternalUserGroup +from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair +from ee.danswer.external_permissions.sync_params import EXTERNAL_GROUP_SYNC_PERIODS +from ee.danswer.external_permissions.sync_params import GROUP_PERMISSIONS_FUNC_MAP + +logger = setup_logger() + + +EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3 + + +# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT +LIGHT_SOFT_TIME_LIMIT = 105 +LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15 + + +def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool: + """Returns boolean indicating if external group sync is due.""" + + if cc_pair.access_type != AccessType.SYNC: + return False + + # skip external group sync if not active + if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE: + return False + + if cc_pair.status == ConnectorCredentialPairStatus.DELETING: + return False + + # If there is not group sync function for the connector, we don't run the sync + # This is fine because all sources dont necessarily have a concept of groups + if not GROUP_PERMISSIONS_FUNC_MAP.get(cc_pair.connector.source): + return False + + # If the last sync is None, it has never been run so we run the sync + last_ext_group_sync = cc_pair.last_time_external_group_sync + if last_ext_group_sync is None: + return True + + source_sync_period = EXTERNAL_GROUP_SYNC_PERIODS.get(cc_pair.connector.source) + + # If EXTERNAL_GROUP_SYNC_PERIODS is None, we always run the sync. + if not source_sync_period: + return True + + # If the last sync is greater than the full fetch period, we run the sync + next_sync = last_ext_group_sync + timedelta(seconds=source_sync_period) + if datetime.now(timezone.utc) >= next_sync: + return True + + return False + + +@shared_task( + name="check_for_external_group_sync", + soft_time_limit=JOB_TIMEOUT, + bind=True, +) +def check_for_external_group_sync(self: Task, *, tenant_id: str | None) -> None: + r = get_redis_client(tenant_id=tenant_id) + + lock_beat = r.lock( + DanswerRedisLocks.CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return + + cc_pair_ids_to_sync: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + cc_pairs = get_all_auto_sync_cc_pairs(db_session) + + for cc_pair in cc_pairs: + if _is_external_group_sync_due(cc_pair): + cc_pair_ids_to_sync.append(cc_pair.id) + + for cc_pair_id in cc_pair_ids_to_sync: + tasks_created = try_creating_permissions_sync_task( + self.app, cc_pair_id, r, tenant_id + ) + if not tasks_created: + continue + + task_logger.info(f"External group sync queued: cc_pair={cc_pair_id}") + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + + +def try_creating_permissions_sync_task( + app: Celery, + cc_pair_id: int, + r: Redis, + tenant_id: str | None, +) -> int | None: + """Returns an int if syncing is needed. The int represents the number of sync tasks generated. 
+ Returns None if no syncing is required.""" + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + LOCK_TIMEOUT = 30 + + lock = r.lock( + DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_external_group_sync_tasks", + timeout=LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2) + if not acquired: + return None + + try: + # Dont kick off a new sync if the previous one is still running + if redis_connector.external_group_sync.fenced: + return None + + redis_connector.external_group_sync.generator_clear() + redis_connector.external_group_sync.taskset_clear() + + custom_task_id = f"{redis_connector.external_group_sync.taskset_key}_{uuid4()}" + + _ = app.send_task( + "connector_external_group_sync_generator_task", + kwargs=dict( + cc_pair_id=cc_pair_id, + tenant_id=tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_EXTERNAL_GROUP_SYNC, + task_id=custom_task_id, + priority=DanswerCeleryPriority.HIGH, + ) + # set a basic fence to start + redis_connector.external_group_sync.set_fence(True) + + except Exception: + task_logger.exception( + f"Unexpected exception while trying to create external group sync task: cc_pair={cc_pair_id}" + ) + return None + finally: + if lock.owned(): + lock.release() + + return 1 + + +@shared_task( + name="connector_external_group_sync_generator_task", + acks_late=False, + soft_time_limit=JOB_TIMEOUT, + track_started=True, + trail=False, + bind=True, +) +def connector_external_group_sync_generator_task( + self: Task, + cc_pair_id: int, + tenant_id: str | None, +) -> None: + """ + Permission sync task that handles external group syncing for a given connector credential pair + This task assumes that the task has already been properly fenced + """ + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + r = get_redis_client(tenant_id=tenant_id) + + lock = r.lock( + DanswerRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX + + f"_{redis_connector.id}", + timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT, + ) + + try: + acquired = lock.acquire(blocking=False) + if not acquired: + task_logger.warning( + f"External group sync task already running, exiting...: cc_pair={cc_pair_id}" + ) + return None + + with get_session_with_tenant(tenant_id) as db_session: + cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) + if cc_pair is None: + raise ValueError( + f"No connector credential pair found for id: {cc_pair_id}" + ) + + source_type = cc_pair.connector.source + + ext_group_sync_func = GROUP_PERMISSIONS_FUNC_MAP.get(source_type) + if ext_group_sync_func is None: + raise ValueError( + f"No external group sync func found for {source_type} for cc_pair: {cc_pair_id}" + ) + + logger.info( + f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}" + ) + + external_user_groups: list[ExternalUserGroup] = ext_group_sync_func(cc_pair) + + logger.info( + f"Syncing {len(external_user_groups)} external user groups for {source_type}" + ) + + replace_user__ext_group_for_cc_pair( + db_session=db_session, + cc_pair_id=cc_pair.id, + group_defs=external_user_groups, + source=cc_pair.connector.source, + ) + logger.info( + f"Synced {len(external_user_groups)} external user groups for {source_type}" + ) + + mark_cc_pair_as_external_group_synced(db_session, cc_pair.id) + + except Exception as e: + task_logger.exception( + f"Failed to run external group sync: cc_pair={cc_pair_id}" + ) + + redis_connector.external_group_sync.generator_clear() + redis_connector.external_group_sync.taskset_clear() + raise e + finally: + # we 
always want to clear the fence after the task is done or failed so it doesn't get stuck + redis_connector.external_group_sync.set_fence(False) + if lock.owned(): + lock.release() diff --git a/backend/danswer/background/celery/tasks/indexing/tasks.py b/backend/danswer/background/celery/tasks/indexing/tasks.py new file mode 100644 index 00000000000..73b2b20a4e0 --- /dev/null +++ b/backend/danswer/background/celery/tasks/indexing/tasks.py @@ -0,0 +1,797 @@ +from datetime import datetime +from datetime import timezone +from http import HTTPStatus +from time import sleep + +import redis +import sentry_sdk +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from redis import Redis +from redis.exceptions import LockError +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.app_base import task_logger +from danswer.background.indexing.job_client import SimpleJobClient +from danswer.background.indexing.run_indexing import run_indexing_entrypoint +from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP +from danswer.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.configs.constants import DanswerRedisLocks +from danswer.configs.constants import DocumentSource +from danswer.db.connector_credential_pair import fetch_connector_credential_pairs +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.engine import get_db_current_time +from danswer.db.engine import get_session_with_tenant +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.enums import IndexingStatus +from danswer.db.enums import IndexModelStatus +from danswer.db.index_attempt import create_index_attempt +from danswer.db.index_attempt import delete_index_attempt +from danswer.db.index_attempt import get_all_index_attempts_by_status +from danswer.db.index_attempt import get_index_attempt +from danswer.db.index_attempt import get_last_attempt_for_cc_pair +from danswer.db.index_attempt import mark_attempt_failed +from danswer.db.models import ConnectorCredentialPair +from danswer.db.models import IndexAttempt +from danswer.db.models import SearchSettings +from danswer.db.search_settings import get_current_search_settings +from danswer.db.search_settings import get_secondary_search_settings +from danswer.db.swap_index import check_index_swap +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface +from danswer.natural_language_processing.search_nlp_models import EmbeddingModel +from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_connector_index import RedisConnectorIndex +from danswer.redis.redis_connector_index import RedisConnectorIndexPayload +from danswer.redis.redis_pool import get_redis_client +from danswer.utils.logger import setup_logger +from danswer.utils.variable_functionality import global_version +from shared_configs.configs import INDEXING_MODEL_SERVER_HOST +from shared_configs.configs import INDEXING_MODEL_SERVER_PORT +from shared_configs.configs import MULTI_TENANT +from 
shared_configs.configs import SENTRY_DSN + +logger = setup_logger() + + +class IndexingCallback(IndexingHeartbeatInterface): + def __init__( + self, + stop_key: str, + generator_progress_key: str, + redis_lock: RedisLock, + redis_client: Redis, + ): + super().__init__() + self.redis_lock: RedisLock = redis_lock + self.stop_key: str = stop_key + self.generator_progress_key: str = generator_progress_key + self.redis_client = redis_client + self.started: datetime = datetime.now(timezone.utc) + self.redis_lock.reacquire() + + self.last_tag: str = "IndexingCallback.__init__" + self.last_lock_reacquire: datetime = datetime.now(timezone.utc) + + def should_stop(self) -> bool: + if self.redis_client.exists(self.stop_key): + return True + return False + + def progress(self, tag: str, amount: int) -> None: + try: + self.redis_lock.reacquire() + self.last_tag = tag + self.last_lock_reacquire = datetime.now(timezone.utc) + except LockError: + logger.exception( + f"IndexingCallback - lock.reacquire exceptioned. " + f"lock_timeout={self.redis_lock.timeout} " + f"start={self.started} " + f"last_tag={self.last_tag} " + f"last_reacquired={self.last_lock_reacquire} " + f"now={datetime.now(timezone.utc)}" + ) + raise + + self.redis_client.incrby(self.generator_progress_key, amount) + + +def get_unfenced_index_attempt_ids(db_session: Session, r: redis.Redis) -> list[int]: + """Gets a list of unfenced index attempts. Should not be possible, so we'd typically + want to clean them up. + + Unfenced = attempt not in terminal state and fence does not exist. + """ + unfenced_attempts: list[int] = [] + + # inner/outer/inner double check pattern to avoid race conditions when checking for + # bad state + # inner = index_attempt in non terminal state + # outer = r.fence_key down + + # check the db for index attempts in a non terminal state + attempts: list[IndexAttempt] = [] + attempts.extend( + get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session) + ) + attempts.extend( + get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session) + ) + + for attempt in attempts: + fence_key = RedisConnectorIndex.fence_key_with_ids( + attempt.connector_credential_pair_id, attempt.search_settings_id + ) + + # if the fence is down / doesn't exist, possible error but not confirmed + if r.exists(fence_key): + continue + + # Between the time the attempts are first looked up and the time we see the fence down, + # the attempt may have completed and taken down the fence normally. + + # We need to double check that the index attempt is still in a non terminal state + # and matches the original state, which confirms we are really in a bad state. 
+ attempt_2 = get_index_attempt(db_session, attempt.id) + if not attempt_2: + continue + + if attempt.status != attempt_2.status: + continue + + unfenced_attempts.append(attempt.id) + + return unfenced_attempts + + +@shared_task( + name="check_for_indexing", + soft_time_limit=300, + bind=True, +) +def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None: + tasks_created = 0 + + r = get_redis_client(tenant_id=tenant_id) + + lock_beat: RedisLock = r.lock( + DanswerRedisLocks.CHECK_INDEXING_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return None + + # check for search settings swap + with get_session_with_tenant(tenant_id=tenant_id) as db_session: + old_search_settings = check_index_swap(db_session=db_session) + current_search_settings = get_current_search_settings(db_session) + # So that the first time users aren't surprised by really slow speed of first + # batch of documents indexed + if current_search_settings.provider_type is None and not MULTI_TENANT: + if old_search_settings: + embedding_model = EmbeddingModel.from_db_model( + search_settings=current_search_settings, + server_host=INDEXING_MODEL_SERVER_HOST, + server_port=INDEXING_MODEL_SERVER_PORT, + ) + + # only warm up if search settings were changed + warm_up_bi_encoder( + embedding_model=embedding_model, + ) + + # gather cc_pair_ids + cc_pair_ids: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + lock_beat.reacquire() + cc_pairs = fetch_connector_credential_pairs(db_session) + for cc_pair_entry in cc_pairs: + cc_pair_ids.append(cc_pair_entry.id) + + # kick off index attempts + for cc_pair_id in cc_pair_ids: + lock_beat.reacquire() + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + with get_session_with_tenant(tenant_id) as db_session: + # Get the primary search settings + primary_search_settings = get_current_search_settings(db_session) + search_settings = [primary_search_settings] + + # Check for secondary search settings + secondary_search_settings = get_secondary_search_settings(db_session) + if secondary_search_settings is not None: + # If secondary settings exist, add them to the list + search_settings.append(secondary_search_settings) + + for search_settings_instance in search_settings: + redis_connector_index = redis_connector.new_index( + search_settings_instance.id + ) + if redis_connector_index.fenced: + continue + + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id, db_session + ) + if not cc_pair: + continue + + last_attempt = get_last_attempt_for_cc_pair( + cc_pair.id, search_settings_instance.id, db_session + ) + if not _should_index( + cc_pair=cc_pair, + last_index=last_attempt, + search_settings_instance=search_settings_instance, + secondary_index_building=len(search_settings) > 1, + db_session=db_session, + ): + continue + + # using a task queue and only allowing one task per cc_pair/search_setting + # prevents us from starving out certain attempts + attempt_id = try_creating_indexing_task( + self.app, + cc_pair, + search_settings_instance, + False, + db_session, + r, + tenant_id, + ) + if attempt_id: + task_logger.info( + f"Connector indexing queued: " + f"index_attempt={attempt_id} " + f"cc_pair={cc_pair.id} " + f"search_settings={search_settings_instance.id} " + ) + tasks_created += 1 + + # Fail any index attempts in the DB that don't have fences + # This shouldn't ever happen! 
+ with get_session_with_tenant(tenant_id) as db_session: + unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r) + for attempt_id in unfenced_attempt_ids: + lock_beat.reacquire() + + attempt = get_index_attempt(db_session, attempt_id) + if not attempt: + continue + + failure_reason = ( + f"Unfenced index attempt found in DB: " + f"index_attempt={attempt.id} " + f"cc_pair={attempt.connector_credential_pair_id} " + f"search_settings={attempt.search_settings_id}" + ) + task_logger.error(failure_reason) + mark_attempt_failed( + attempt.id, db_session, failure_reason=failure_reason + ) + + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + else: + task_logger.error( + "check_for_indexing - Lock not owned on completion: " + f"tenant={tenant_id}" + ) + + return tasks_created + + +def _should_index( + cc_pair: ConnectorCredentialPair, + last_index: IndexAttempt | None, + search_settings_instance: SearchSettings, + secondary_index_building: bool, + db_session: Session, +) -> bool: + """Checks various global settings and past indexing attempts to determine if + we should try to start indexing the cc pair / search setting combination. + + Note that tactical checks such as preventing overlap with a currently running task + are not handled here. + + Return True if we should try to index, False if not. + """ + connector = cc_pair.connector + + # uncomment for debugging + # task_logger.info(f"_should_index: " + # f"cc_pair={cc_pair.id} " + # f"connector={cc_pair.connector_id} " + # f"refresh_freq={connector.refresh_freq}") + + # don't kick off indexing for `NOT_APPLICABLE` sources + if connector.source == DocumentSource.NOT_APPLICABLE: + return False + + # User can still manually create single indexing attempts via the UI for the + # currently in use index + if DISABLE_INDEX_UPDATE_ON_SWAP: + if ( + search_settings_instance.status == IndexModelStatus.PRESENT + and secondary_index_building + ): + return False + + # When switching over models, always index at least once + if search_settings_instance.status == IndexModelStatus.FUTURE: + if last_index: + # No new index if the last index attempt succeeded + # Once is enough. The model will never be able to swap otherwise. 
+ if last_index.status == IndexingStatus.SUCCESS: + return False + + # No new index if the last index attempt is waiting to start + if last_index.status == IndexingStatus.NOT_STARTED: + return False + + # No new index if the last index attempt is running + if last_index.status == IndexingStatus.IN_PROGRESS: + return False + else: + if ( + connector.id == 0 or connector.source == DocumentSource.INGESTION_API + ): # Ingestion API + return False + return True + + # If the connector is paused or is the ingestion API, don't index + # NOTE: during an embedding model switch over, the following logic + # is bypassed by the above check for a future model + if ( + not cc_pair.status.is_active() + or connector.id == 0 + or connector.source == DocumentSource.INGESTION_API + ): + return False + + # if no attempt has ever occurred, we should index regardless of refresh_freq + if not last_index: + return True + + if connector.refresh_freq is None: + return False + + current_db_time = get_db_current_time(db_session) + time_since_index = current_db_time - last_index.time_updated + if time_since_index.total_seconds() < connector.refresh_freq: + return False + + return True + + +def try_creating_indexing_task( + celery_app: Celery, + cc_pair: ConnectorCredentialPair, + search_settings: SearchSettings, + reindex: bool, + db_session: Session, + r: Redis, + tenant_id: str | None, +) -> int | None: + """Checks for any conditions that should block the indexing task from being + created, then creates the task. + + Does not check for scheduling related conditions as this function + is used to trigger indexing immediately. + """ + + LOCK_TIMEOUT = 30 + index_attempt_id: int | None = None + + # we need to serialize any attempt to trigger indexing since it can be triggered + # either via celery beat or manually (API call) + lock: RedisLock = r.lock( + DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_creating_indexing_task", + timeout=LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2) + if not acquired: + return None + + try: + redis_connector = RedisConnector(tenant_id, cc_pair.id) + redis_connector_index = redis_connector.new_index(search_settings.id) + + # skip if already indexing + if redis_connector_index.fenced: + return None + + # skip indexing if the cc_pair is deleting + if redis_connector.delete.fenced: + return None + + db_session.refresh(cc_pair) + if cc_pair.status == ConnectorCredentialPairStatus.DELETING: + return None + + # add a long running generator task to the queue + redis_connector_index.generator_clear() + + # set a basic fence to start + payload = RedisConnectorIndexPayload( + index_attempt_id=None, + started=None, + submitted=datetime.now(timezone.utc), + celery_task_id=None, + ) + + redis_connector_index.set_fence(payload) + + # create the index attempt for tracking purposes + # code elsewhere checks for index attempts without an associated redis key + # and cleans them up + # therefore we must create the attempt and the task after the fence goes up + index_attempt_id = create_index_attempt( + cc_pair.id, + search_settings.id, + from_beginning=reindex, + db_session=db_session, + ) + + custom_task_id = redis_connector_index.generate_generator_task_id() + + # when the task is sent, we have yet to finish setting up the fence + # therefore, the task must contain code that blocks until the fence is ready + result = celery_app.send_task( + "connector_indexing_proxy_task", + kwargs=dict( + index_attempt_id=index_attempt_id, + cc_pair_id=cc_pair.id, + 
search_settings_id=search_settings.id, + tenant_id=tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_INDEXING, + task_id=custom_task_id, + priority=DanswerCeleryPriority.MEDIUM, + ) + if not result: + raise RuntimeError("send_task for connector_indexing_proxy_task failed.") + + # now fill out the fence with the rest of the data + payload.index_attempt_id = index_attempt_id + payload.celery_task_id = result.id + redis_connector_index.set_fence(payload) + except Exception: + task_logger.exception( + f"try_creating_indexing_task - Unexpected exception: " + f"tenant={tenant_id} " + f"cc_pair={cc_pair.id} " + f"search_settings={search_settings.id}" + ) + + if index_attempt_id is not None: + delete_index_attempt(db_session, index_attempt_id) + redis_connector_index.set_fence(None) + return None + finally: + if lock.owned(): + lock.release() + + return index_attempt_id + + +@shared_task(name="connector_indexing_proxy_task", acks_late=False, track_started=True) +def connector_indexing_proxy_task( + index_attempt_id: int, + cc_pair_id: int, + search_settings_id: int, + tenant_id: str | None, +) -> None: + """celery tasks are forked, but forking is unstable. This proxies work to a spawned task.""" + task_logger.info( + f"Indexing watchdog - starting: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + client = SimpleJobClient() + + job = client.submit( + connector_indexing_task_wrapper, + index_attempt_id, + cc_pair_id, + search_settings_id, + tenant_id, + global_version.is_ee_version(), + pure=False, + ) + + if not job: + task_logger.info( + f"Indexing watchdog - spawn failed: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + return + + task_logger.info( + f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + + while True: + sleep(10) + + # do nothing for ongoing jobs that haven't been stopped + if not job.done(): + with get_session_with_tenant(tenant_id) as db_session: + index_attempt = get_index_attempt( + db_session=db_session, index_attempt_id=index_attempt_id + ) + + if not index_attempt: + continue + + if not index_attempt.is_finished(): + continue + + if job.status == "error": + task_logger.error( + f"Indexing watchdog - spawned task exceptioned: " + f"attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id} " + f"error={job.exception()}" + ) + + job.release() + break + + task_logger.info( + f"Indexing watchdog - finished: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + return + + +def connector_indexing_task_wrapper( + index_attempt_id: int, + cc_pair_id: int, + search_settings_id: int, + tenant_id: str | None, + is_ee: bool, +) -> int | None: + """Just wraps connector_indexing_task so we can log any exceptions before + re-raising it.""" + result: int | None = None + + try: + result = connector_indexing_task( + index_attempt_id, + cc_pair_id, + search_settings_id, + tenant_id, + is_ee, + ) + except: + logger.exception( + f"connector_indexing_task exceptioned: " + f"tenant={tenant_id} " + f"index_attempt={index_attempt_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + raise + + return result + + +def connector_indexing_task( + index_attempt_id: int, 
+    cc_pair_id: int,
+    search_settings_id: int,
+    tenant_id: str | None,
+    is_ee: bool,
+) -> int | None:
+    """Indexing task. For a cc pair and search settings, this task fetches documents
+    from the source and indexes them by calling run_indexing_entrypoint.
+
+    acks_late must be set to False. Otherwise, celery's visibility timeout will
+    cause any task that runs longer than the timeout to be redispatched by the broker.
+    There appears to be no good workaround for this, so we need to handle redispatching
+    manually.
+
+    Returns None if the task did not run (possibly due to a conflict).
+    Otherwise, returns an int >= 0 representing the number of indexed docs.
+
+    NOTE: if an exception is raised out of this task, the primary worker will detect
+    that the task transitioned to a "READY" state but the generator_complete_key doesn't exist.
+    This will cause the primary worker to abort the indexing attempt and clean up.
+    """
+
+    # Since connector_indexing_proxy_task spawns a new process using this function as
+    # the entrypoint, we init Sentry here.
+    if SENTRY_DSN:
+        sentry_sdk.init(
+            dsn=SENTRY_DSN,
+            traces_sample_rate=0.1,
+        )
+        logger.info("Sentry initialized")
+    else:
+        logger.debug("Sentry DSN not provided, skipping Sentry initialization")
+
+    logger.info(
+        f"Indexing spawned task starting: "
+        f"attempt={index_attempt_id} "
+        f"tenant={tenant_id} "
+        f"cc_pair={cc_pair_id} "
+        f"search_settings={search_settings_id}"
+    )
+
+    attempt_found = False
+    n_final_progress: int | None = None
+
+    redis_connector = RedisConnector(tenant_id, cc_pair_id)
+    redis_connector_index = redis_connector.new_index(search_settings_id)
+
+    r = get_redis_client(tenant_id=tenant_id)
+
+    if redis_connector.delete.fenced:
+        raise RuntimeError(
+            f"Indexing will not start because connector deletion is in progress: "
+            f"attempt={index_attempt_id} "
+            f"cc_pair={cc_pair_id} "
+            f"fence={redis_connector.delete.fence_key}"
+        )
+
+    if redis_connector.stop.fenced:
+        raise RuntimeError(
+            f"Indexing will not start because a connector stop signal was detected: "
+            f"attempt={index_attempt_id} "
+            f"cc_pair={cc_pair_id} "
+            f"fence={redis_connector.stop.fence_key}"
+        )
+
+    while True:
+        if not redis_connector_index.fenced:  # The fence must exist
+            raise ValueError(
+                f"connector_indexing_task - fence not found: fence={redis_connector_index.fence_key}"
+            )
+
+        payload = redis_connector_index.payload  # The payload must exist
+        if not payload:
+            raise ValueError("connector_indexing_task: payload invalid or not found")
+
+        if payload.index_attempt_id is None or payload.celery_task_id is None:
+            logger.info(
+                f"connector_indexing_task - Waiting for fence: fence={redis_connector_index.fence_key}"
+            )
+            sleep(1)
+            continue
+
+        if payload.index_attempt_id != index_attempt_id:
+            raise ValueError(
+                f"connector_indexing_task - id mismatch. 
Task may be left over from previous run.: " + f"task_index_attempt={index_attempt_id} " + f"payload_index_attempt={payload.index_attempt_id}" + ) + + logger.info( + f"connector_indexing_task - Fence found, continuing...: fence={redis_connector_index.fence_key}" + ) + break + + lock: RedisLock = r.lock( + redis_connector_index.generator_lock_key, + timeout=CELERY_INDEXING_LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking=False) + if not acquired: + logger.warning( + f"Indexing task already running, exiting...: " + f"index_attempt={index_attempt_id} cc_pair={cc_pair_id} search_settings={search_settings_id}" + ) + return None + + payload.started = datetime.now(timezone.utc) + redis_connector_index.set_fence(payload) + + try: + with get_session_with_tenant(tenant_id) as db_session: + attempt = get_index_attempt(db_session, index_attempt_id) + if not attempt: + raise ValueError( + f"Index attempt not found: index_attempt={index_attempt_id}" + ) + attempt_found = True + + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, + db_session=db_session, + ) + + if not cc_pair: + raise ValueError(f"cc_pair not found: cc_pair={cc_pair_id}") + + if not cc_pair.connector: + raise ValueError( + f"Connector not found: cc_pair={cc_pair_id} connector={cc_pair.connector_id}" + ) + + if not cc_pair.credential: + raise ValueError( + f"Credential not found: cc_pair={cc_pair_id} credential={cc_pair.credential_id}" + ) + + # define a callback class + callback = IndexingCallback( + redis_connector.stop.fence_key, + redis_connector_index.generator_progress_key, + lock, + r, + ) + + logger.info( + f"Indexing spawned task running entrypoint: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + + run_indexing_entrypoint( + index_attempt_id, + tenant_id, + cc_pair_id, + is_ee, + callback=callback, + ) + + # get back the total number of indexed docs and return it + n_final_progress = redis_connector_index.get_progress() + redis_connector_index.set_generator_complete(HTTPStatus.OK.value) + except Exception as e: + logger.exception( + f"Indexing spawned task failed: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + if attempt_found: + with get_session_with_tenant(tenant_id) as db_session: + mark_attempt_failed(index_attempt_id, db_session, failure_reason=str(e)) + + raise e + finally: + if lock.owned(): + lock.release() + + logger.info( + f"Indexing spawned task finished: attempt={index_attempt_id} " + f"tenant={tenant_id} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id}" + ) + return n_final_progress diff --git a/backend/danswer/background/celery/tasks/periodic/tasks.py b/backend/danswer/background/celery/tasks/periodic/tasks.py new file mode 100644 index 00000000000..20baa7c52fa --- /dev/null +++ b/backend/danswer/background/celery/tasks/periodic/tasks.py @@ -0,0 +1,137 @@ +##### +# Periodic Tasks +##### +import json +from typing import Any + +from celery import shared_task +from celery.contrib.abortable import AbortableTask # type: ignore +from celery.exceptions import TaskRevokedError +from sqlalchemy import inspect +from sqlalchemy import text +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.app_base import task_logger +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import PostgresAdvisoryLocks +from danswer.db.engine import get_session_with_tenant + + 
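+# Illustrative sketch only (not used below): kombu_message_cleanup_task guards itself
+# with a Postgres advisory lock so that at most one worker runs the sweep at a time.
+# The lock id here is a hypothetical example; the real task uses a value from
+# PostgresAdvisoryLocks.
+def _advisory_lock_guard_sketch(db_session: Session, lock_id: int = 123456) -> bool:
+    """Return True if this session acquired the advisory lock, False otherwise."""
+    acquired = db_session.execute(
+        text("SELECT pg_try_advisory_lock(:id)"), {"id": lock_id}
+    ).scalar()
+    return bool(acquired)
+
+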
+@shared_task( + name="kombu_message_cleanup_task", + soft_time_limit=JOB_TIMEOUT, + bind=True, + base=AbortableTask, +) +def kombu_message_cleanup_task(self: Any, tenant_id: str | None) -> int: + """Runs periodically to clean up the kombu_message table""" + + # we will select messages older than this amount to clean up + KOMBU_MESSAGE_CLEANUP_AGE = 7 # days + KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT = 1000 + + ctx = {} + ctx["last_processed_id"] = 0 + ctx["deleted"] = 0 + ctx["cleanup_age"] = KOMBU_MESSAGE_CLEANUP_AGE + ctx["page_limit"] = KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT + with get_session_with_tenant(tenant_id) as db_session: + # Exit the task if we can't take the advisory lock + result = db_session.execute( + text("SELECT pg_try_advisory_lock(:id)"), + {"id": PostgresAdvisoryLocks.KOMBU_MESSAGE_CLEANUP_LOCK_ID.value}, + ).scalar() + if not result: + return 0 + + while True: + if self.is_aborted(): + raise TaskRevokedError("kombu_message_cleanup_task was aborted.") + + b = kombu_message_cleanup_task_helper(ctx, db_session) + if not b: + break + + db_session.commit() + + if ctx["deleted"] > 0: + task_logger.info( + f"Deleted {ctx['deleted']} orphaned messages from kombu_message." + ) + + return ctx["deleted"] + + +def kombu_message_cleanup_task_helper(ctx: dict, db_session: Session) -> bool: + """ + Helper function to clean up old messages from the `kombu_message` table that are no longer relevant. + + This function retrieves messages from the `kombu_message` table that are no longer visible and + older than a specified interval. It checks if the corresponding task_id exists in the + `celery_taskmeta` table. If the task_id does not exist, the message is deleted. + + Args: + ctx (dict): A context dictionary containing configuration parameters such as: + - 'cleanup_age' (int): The age in days after which messages are considered old. + - 'page_limit' (int): The maximum number of messages to process in one batch. + - 'last_processed_id' (int): The ID of the last processed message to handle pagination. + - 'deleted' (int): A counter to track the number of deleted messages. + db_session (Session): The SQLAlchemy database session for executing queries. + + Returns: + bool: Returns True if there are more rows to process, False if not. + """ + + inspector = inspect(db_session.bind) + if not inspector: + return False + + # With the move to redis as celery's broker and backend, kombu tables may not even exist. + # We can fail silently. 
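+    # (The kombu_message table is created by kombu's SQLAlchemy transport, so it is
+    # typically present only on deployments that previously used the database as the
+    # celery broker.)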
+ if not inspector.has_table("kombu_message"): + return False + + query = text( + """ + SELECT id, timestamp, payload + FROM kombu_message WHERE visible = 'false' + AND timestamp < CURRENT_TIMESTAMP - INTERVAL :interval_days + AND id > :last_processed_id + ORDER BY id + LIMIT :page_limit +""" + ) + kombu_messages = db_session.execute( + query, + { + "interval_days": f"{ctx['cleanup_age']} days", + "page_limit": ctx["page_limit"], + "last_processed_id": ctx["last_processed_id"], + }, + ).fetchall() + + if len(kombu_messages) == 0: + return False + + for msg in kombu_messages: + payload = json.loads(msg[2]) + task_id = payload["headers"]["id"] + + # Check if task_id exists in celery_taskmeta + task_exists = db_session.execute( + text("SELECT 1 FROM celery_taskmeta WHERE task_id = :task_id"), + {"task_id": task_id}, + ).fetchone() + + # If task_id does not exist, delete the message + if not task_exists: + result = db_session.execute( + text("DELETE FROM kombu_message WHERE id = :message_id"), + {"message_id": msg[0]}, + ) + if result.rowcount > 0: # type: ignore + ctx["deleted"] += 1 + + ctx["last_processed_id"] = msg[0] + + return True diff --git a/backend/danswer/background/celery/tasks/pruning/tasks.py b/backend/danswer/background/celery/tasks/pruning/tasks.py new file mode 100644 index 00000000000..67b781f228f --- /dev/null +++ b/backend/danswer/background/celery/tasks/pruning/tasks.py @@ -0,0 +1,338 @@ +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from uuid import uuid4 + +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from redis import Redis +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.app_base import task_logger +from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector +from danswer.background.celery.tasks.indexing.tasks import IndexingCallback +from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import CELERY_PRUNING_LOCK_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.configs.constants import DanswerRedisLocks +from danswer.connectors.factory import instantiate_connector +from danswer.connectors.models import InputType +from danswer.db.connector_credential_pair import get_connector_credential_pair +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.document import get_documents_for_connector_credential_pair +from danswer.db.engine import get_session_with_tenant +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.models import ConnectorCredentialPair +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_pool import get_redis_client +from danswer.utils.logger import pruning_ctx +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def _is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool: + """Returns boolean indicating if pruning is due. 
+ + Next pruning time is calculated as a delta from the last successful prune, or the + last successful indexing if pruning has never succeeded. + + TODO(rkuo): consider whether we should allow pruning to be immediately rescheduled + if pruning fails (which is what it does now). A backoff could be reasonable. + """ + + # skip pruning if no prune frequency is set + # pruning can still be forced via the API which will run a pruning task directly + if not cc_pair.connector.prune_freq: + return False + + # skip pruning if not active + if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE: + return False + + # skip pruning if the next scheduled prune time hasn't been reached yet + last_pruned = cc_pair.last_pruned + if not last_pruned: + if not cc_pair.last_successful_index_time: + # if we've never indexed, we can't prune + return False + + # if never pruned, use the last time the connector indexed successfully + last_pruned = cc_pair.last_successful_index_time + + next_prune = last_pruned + timedelta(seconds=cc_pair.connector.prune_freq) + if datetime.now(timezone.utc) < next_prune: + return False + + return True + + +@shared_task( + name="check_for_pruning", + soft_time_limit=JOB_TIMEOUT, + bind=True, +) +def check_for_pruning(self: Task, *, tenant_id: str | None) -> None: + r = get_redis_client(tenant_id=tenant_id) + + lock_beat = r.lock( + DanswerRedisLocks.CHECK_PRUNE_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return + + cc_pair_ids: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + cc_pairs = get_connector_credential_pairs(db_session) + for cc_pair_entry in cc_pairs: + cc_pair_ids.append(cc_pair_entry.id) + + for cc_pair_id in cc_pair_ids: + lock_beat.reacquire() + with get_session_with_tenant(tenant_id) as db_session: + cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) + if not cc_pair: + continue + + if not _is_pruning_due(cc_pair): + continue + + tasks_created = try_creating_prune_generator_task( + self.app, cc_pair, db_session, r, tenant_id + ) + if not tasks_created: + continue + + task_logger.info(f"Pruning queued: cc_pair={cc_pair.id}") + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + + +def try_creating_prune_generator_task( + celery_app: Celery, + cc_pair: ConnectorCredentialPair, + db_session: Session, + r: Redis, + tenant_id: str | None, +) -> int | None: + """Checks for any conditions that should block the pruning generator task from being + created, then creates the task. + + Does not check for scheduling related conditions as this function + is used to trigger prunes immediately, e.g. via the web ui. 
+ """ + + redis_connector = RedisConnector(tenant_id, cc_pair.id) + + if not ALLOW_SIMULTANEOUS_PRUNING: + count = redis_connector.prune.get_active_task_count() + if count > 0: + return None + + LOCK_TIMEOUT = 30 + + # we need to serialize starting pruning since it can be triggered either via + # celery beat or manually (API call) + lock = r.lock( + DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_creating_prune_generator_task", + timeout=LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2) + if not acquired: + return None + + try: + # skip pruning if already pruning + if redis_connector.prune.fenced: + return None + + # skip pruning if the cc_pair is deleting + if redis_connector.delete.fenced: + return None + + # skip pruning if doc permissions sync is running + if redis_connector.permissions.fenced: + return None + + db_session.refresh(cc_pair) + if cc_pair.status == ConnectorCredentialPairStatus.DELETING: + return None + + # add a long running generator task to the queue + redis_connector.prune.generator_clear() + redis_connector.prune.taskset_clear() + + custom_task_id = f"{redis_connector.prune.generator_task_key}_{uuid4()}" + + celery_app.send_task( + "connector_pruning_generator_task", + kwargs=dict( + cc_pair_id=cc_pair.id, + connector_id=cc_pair.connector_id, + credential_id=cc_pair.credential_id, + tenant_id=tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_PRUNING, + task_id=custom_task_id, + priority=DanswerCeleryPriority.LOW, + ) + + # set this only after all tasks have been added + redis_connector.prune.set_fence(True) + except Exception: + task_logger.exception(f"Unexpected exception: cc_pair={cc_pair.id}") + return None + finally: + if lock.owned(): + lock.release() + + return 1 + + +@shared_task( + name="connector_pruning_generator_task", + acks_late=False, + soft_time_limit=JOB_TIMEOUT, + track_started=True, + trail=False, + bind=True, +) +def connector_pruning_generator_task( + self: Task, + cc_pair_id: int, + connector_id: int, + credential_id: int, + tenant_id: str | None, +) -> None: + """connector pruning task. 
For a cc pair, this task pulls all document IDs from the source + and compares those IDs to locally stored documents and deletes all locally stored IDs missing + from the most recently pulled document ID list""" + + pruning_ctx_dict = pruning_ctx.get() + pruning_ctx_dict["cc_pair_id"] = cc_pair_id + pruning_ctx_dict["request_id"] = self.request.id + pruning_ctx.set(pruning_ctx_dict) + + task_logger.info(f"Pruning generator starting: cc_pair={cc_pair_id}") + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + r = get_redis_client(tenant_id=tenant_id) + + lock = r.lock( + DanswerRedisLocks.PRUNING_LOCK_PREFIX + f"_{redis_connector.id}", + timeout=CELERY_PRUNING_LOCK_TIMEOUT, + ) + + acquired = lock.acquire(blocking=False) + if not acquired: + task_logger.warning( + f"Pruning task already running, exiting...: cc_pair={cc_pair_id}" + ) + return None + + try: + with get_session_with_tenant(tenant_id) as db_session: + cc_pair = get_connector_credential_pair( + db_session=db_session, + connector_id=connector_id, + credential_id=credential_id, + ) + + if not cc_pair: + task_logger.warning( + f"cc_pair not found for {connector_id} {credential_id}" + ) + return + + task_logger.info( + f"Pruning generator running connector: " + f"cc_pair={cc_pair_id} " + f"connector_source={cc_pair.connector.source}" + ) + runnable_connector = instantiate_connector( + db_session, + cc_pair.connector.source, + InputType.SLIM_RETRIEVAL, + cc_pair.connector.connector_specific_config, + cc_pair.credential, + ) + + callback = IndexingCallback( + redis_connector.stop.fence_key, + redis_connector.prune.generator_progress_key, + lock, + r, + ) + + # a list of docs in the source + all_connector_doc_ids: set[str] = extract_ids_from_runnable_connector( + runnable_connector, callback + ) + + # a list of docs in our local index + all_indexed_document_ids = { + doc.id + for doc in get_documents_for_connector_credential_pair( + db_session=db_session, + connector_id=connector_id, + credential_id=credential_id, + ) + } + + # generate list of docs to remove (no longer in the source) + doc_ids_to_remove = list(all_indexed_document_ids - all_connector_doc_ids) + + task_logger.info( + f"Pruning set collected: " + f"cc_pair={cc_pair_id} " + f"connector_source={cc_pair.connector.source} " + f"docs_to_remove={len(doc_ids_to_remove)}" + ) + + task_logger.info( + f"RedisConnector.prune.generate_tasks starting. cc_pair={cc_pair_id}" + ) + tasks_generated = redis_connector.prune.generate_tasks( + set(doc_ids_to_remove), self.app, db_session, None + ) + if tasks_generated is None: + return None + + task_logger.info( + f"RedisConnector.prune.generate_tasks finished. 
" + f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}" + ) + + redis_connector.prune.generator_complete = tasks_generated + except Exception as e: + task_logger.exception( + f"Failed to run pruning: cc_pair={cc_pair_id} connector={connector_id}" + ) + + redis_connector.prune.reset() + raise e + finally: + if lock.owned(): + lock.release() + + task_logger.info(f"Pruning generator finished: cc_pair={cc_pair_id}") diff --git a/backend/danswer/background/celery/tasks/shared/RetryDocumentIndex.py b/backend/danswer/background/celery/tasks/shared/RetryDocumentIndex.py new file mode 100644 index 00000000000..bdaca0d811e --- /dev/null +++ b/backend/danswer/background/celery/tasks/shared/RetryDocumentIndex.py @@ -0,0 +1,40 @@ +import httpx +from tenacity import retry +from tenacity import retry_if_exception_type +from tenacity import stop_after_delay +from tenacity import wait_random_exponential + +from danswer.document_index.interfaces import DocumentIndex +from danswer.document_index.interfaces import VespaDocumentFields + + +class RetryDocumentIndex: + """A wrapper class to help with specific retries against Vespa involving + read timeouts. + + wait_random_exponential implements full jitter as per this article: + https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/""" + + MAX_WAIT = 30 + + # STOP_AFTER + MAX_WAIT should be slightly less (5?) than the celery soft_time_limit + STOP_AFTER = 70 + + def __init__(self, index: DocumentIndex): + self.index: DocumentIndex = index + + @retry( + retry=retry_if_exception_type(httpx.ReadTimeout), + wait=wait_random_exponential(multiplier=1, max=MAX_WAIT), + stop=stop_after_delay(STOP_AFTER), + ) + def delete_single(self, doc_id: str) -> int: + return self.index.delete_single(doc_id) + + @retry( + retry=retry_if_exception_type(httpx.ReadTimeout), + wait=wait_random_exponential(multiplier=1, max=MAX_WAIT), + stop=stop_after_delay(STOP_AFTER), + ) + def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: + return self.index.update_single(doc_id, fields) diff --git a/backend/danswer/background/celery/tasks/shared/tasks.py b/backend/danswer/background/celery/tasks/shared/tasks.py new file mode 100644 index 00000000000..2719a4d0665 --- /dev/null +++ b/backend/danswer/background/celery/tasks/shared/tasks.py @@ -0,0 +1,194 @@ +from http import HTTPStatus + +import httpx +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from tenacity import RetryError + +from danswer.access.access import get_access_for_document +from danswer.background.celery.apps.app_base import task_logger +from danswer.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex +from danswer.db.document import delete_document_by_connector_credential_pair__no_commit +from danswer.db.document import delete_documents_complete__no_commit +from danswer.db.document import get_document +from danswer.db.document import get_document_connector_count +from danswer.db.document import mark_document_as_modified +from danswer.db.document import mark_document_as_synced +from danswer.db.document_set import fetch_document_sets_for_document +from danswer.db.engine import get_session_with_tenant +from danswer.document_index.document_index_utils import get_both_index_names +from danswer.document_index.factory import get_default_document_index +from danswer.document_index.interfaces import VespaDocumentFields +from danswer.server.documents.models import ConnectorCredentialPairIdentifier + 
+DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES = 3
+
+
+# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
+LIGHT_SOFT_TIME_LIMIT = 105
+LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
+
+
+@shared_task(
+    name="document_by_cc_pair_cleanup_task",
+    soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
+    time_limit=LIGHT_TIME_LIMIT,
+    max_retries=DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES,
+    bind=True,
+)
+def document_by_cc_pair_cleanup_task(
+    self: Task,
+    document_id: str,
+    connector_id: int,
+    credential_id: int,
+    tenant_id: str | None,
+) -> bool:
+    """A lightweight subtask used to clean up document-to-cc-pair relationships.
+    Created by connector deletion and connector pruning parent tasks.
+
+    To delete a connector / credential pair:
+    (1) find all documents associated with the connector / credential pair where this is
+    the only connector / credential pair that has indexed it
+    (2) delete all documents from document stores
+    (3) delete all entries from postgres
+    (4) find all documents associated with the connector / credential pair where there
+    are multiple connector / credential pairs that have indexed it
+    (5) update document store entries to remove access associated with the
+    connector / credential pair from the access list
+    (6) delete all relevant entries from postgres
+    """
+    task_logger.debug(f"Task start: tenant={tenant_id} doc={document_id}")
+
+    try:
+        with get_session_with_tenant(tenant_id) as db_session:
+            action = "skip"
+            chunks_affected = 0
+
+            curr_ind_name, sec_ind_name = get_both_index_names(db_session)
+            doc_index = get_default_document_index(
+                primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
+            )
+
+            retry_index = RetryDocumentIndex(doc_index)
+
+            count = get_document_connector_count(db_session, document_id)
+            if count == 1:
+                # count == 1 means this is the only remaining cc_pair reference to the doc
+                # delete it from vespa and the db
+                action = "delete"
+
+                chunks_affected = retry_index.delete_single(document_id)
+                delete_documents_complete__no_commit(
+                    db_session=db_session,
+                    document_ids=[document_id],
+                )
+            elif count > 1:
+                action = "update"
+
+                # count > 1 means the document still has cc_pair references
+                doc = get_document(document_id, db_session)
+                if not doc:
+                    return False
+
+                # the below functions do not include cc_pairs being deleted.
+                # i.e. they will correctly omit access for the current cc_pair
+                doc_access = get_access_for_document(
+                    document_id=document_id, db_session=db_session
+                )
+
+                doc_sets = fetch_document_sets_for_document(document_id, db_session)
+                update_doc_sets: set[str] = set(doc_sets)
+
+                fields = VespaDocumentFields(
+                    document_sets=update_doc_sets,
+                    access=doc_access,
+                    boost=doc.boost,
+                    hidden=doc.hidden,
+                )
+
+                # update Vespa. OK if doc doesn't exist. Raises exception otherwise. 
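+                # retry_index wraps the underlying index with RetryDocumentIndex, which
+                # retries Vespa read timeouts using full-jitter exponential backoff.
+                # update_single returns the number of chunks it touched; that count is
+                # what gets logged as chunks= further below.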
+ chunks_affected = retry_index.update_single(document_id, fields=fields) + + # there are still other cc_pair references to the doc, so just resync to Vespa + delete_document_by_connector_credential_pair__no_commit( + db_session=db_session, + document_id=document_id, + connector_credential_pair_identifier=ConnectorCredentialPairIdentifier( + connector_id=connector_id, + credential_id=credential_id, + ), + ) + + mark_document_as_synced(document_id, db_session) + else: + pass + + db_session.commit() + + task_logger.info( + f"tenant={tenant_id} " + f"doc={document_id} " + f"action={action} " + f"refcount={count} " + f"chunks={chunks_affected}" + ) + except SoftTimeLimitExceeded: + task_logger.info( + f"SoftTimeLimitExceeded exception. tenant={tenant_id} doc={document_id}" + ) + return False + except Exception as ex: + if isinstance(ex, RetryError): + task_logger.warning( + f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}" + ) + + # only set the inner exception if it is of type Exception + e_temp = ex.last_attempt.exception() + if isinstance(e_temp, Exception): + e = e_temp + else: + e = ex + + if isinstance(e, httpx.HTTPStatusError): + if e.response.status_code == HTTPStatus.BAD_REQUEST: + task_logger.exception( + f"Non-retryable HTTPStatusError: " + f"tenant={tenant_id} " + f"doc={document_id} " + f"status={e.response.status_code}" + ) + return False + + task_logger.exception( + f"Unexpected exception: tenant={tenant_id} doc={document_id}" + ) + + if self.request.retries < DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES: + # Still retrying. Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 + countdown = 2 ** (self.request.retries + 4) + self.retry(exc=e, countdown=countdown) + else: + # This is the last attempt! mark the document as dirty in the db so that it + # eventually gets fixed out of band via stale document reconciliation + task_logger.warning( + f"Max celery task retries reached. 
Marking doc as dirty for reconciliation: " + f"tenant={tenant_id} doc={document_id}" + ) + with get_session_with_tenant(tenant_id) as db_session: + # delete the cc pair relationship now and let reconciliation clean it up + # in vespa + delete_document_by_connector_credential_pair__no_commit( + db_session=db_session, + document_id=document_id, + connector_credential_pair_identifier=ConnectorCredentialPairIdentifier( + connector_id=connector_id, + credential_id=credential_id, + ), + ) + mark_document_as_modified(document_id, db_session) + return False + + return True diff --git a/backend/danswer/background/celery/tasks/vespa/tasks.py b/backend/danswer/background/celery/tasks/vespa/tasks.py new file mode 100644 index 00000000000..ec7f52bc03c --- /dev/null +++ b/backend/danswer/background/celery/tasks/vespa/tasks.py @@ -0,0 +1,896 @@ +import traceback +from datetime import datetime +from datetime import timezone +from http import HTTPStatus +from typing import cast + +import httpx +from celery import Celery +from celery import shared_task +from celery import Task +from celery.exceptions import SoftTimeLimitExceeded +from celery.result import AsyncResult +from celery.states import READY_STATES +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session +from tenacity import RetryError + +from danswer.access.access import get_access_for_document +from danswer.background.celery.apps.app_base import task_logger +from danswer.background.celery.celery_redis import celery_get_queue_length +from danswer.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex +from danswer.background.celery.tasks.shared.tasks import LIGHT_SOFT_TIME_LIMIT +from danswer.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT +from danswer.configs.app_configs import JOB_TIMEOUT +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryQueues +from danswer.configs.constants import DanswerRedisLocks +from danswer.db.connector import fetch_connector_by_id +from danswer.db.connector import mark_cc_pair_as_permissions_synced +from danswer.db.connector import mark_ccpair_as_pruned +from danswer.db.connector_credential_pair import add_deletion_failure_message +from danswer.db.connector_credential_pair import ( + delete_connector_credential_pair__no_commit, +) +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.document import count_documents_by_needs_sync +from danswer.db.document import get_document +from danswer.db.document import get_document_ids_for_connector_credential_pair +from danswer.db.document import mark_document_as_synced +from danswer.db.document_set import delete_document_set +from danswer.db.document_set import delete_document_set_cc_pair_relationship__no_commit +from danswer.db.document_set import fetch_document_sets +from danswer.db.document_set import fetch_document_sets_for_document +from danswer.db.document_set import get_document_set_by_id +from danswer.db.document_set import mark_document_set_as_synced +from danswer.db.engine import get_session_with_tenant +from danswer.db.index_attempt import delete_index_attempts +from danswer.db.index_attempt import get_index_attempt +from danswer.db.index_attempt import mark_attempt_failed +from danswer.db.models import DocumentSet +from danswer.document_index.document_index_utils import 
get_both_index_names +from danswer.document_index.factory import get_default_document_index +from danswer.document_index.interfaces import VespaDocumentFields +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair +from danswer.redis.redis_connector_delete import RedisConnectorDelete +from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync +from danswer.redis.redis_connector_doc_perm_sync import ( + RedisConnectorPermissionSyncData, +) +from danswer.redis.redis_connector_index import RedisConnectorIndex +from danswer.redis.redis_connector_prune import RedisConnectorPrune +from danswer.redis.redis_document_set import RedisDocumentSet +from danswer.redis.redis_pool import get_redis_client +from danswer.redis.redis_usergroup import RedisUserGroup +from danswer.utils.logger import setup_logger +from danswer.utils.variable_functionality import fetch_versioned_implementation +from danswer.utils.variable_functionality import ( + fetch_versioned_implementation_with_fallback, +) +from danswer.utils.variable_functionality import global_version +from danswer.utils.variable_functionality import noop_fallback + +logger = setup_logger() + + +# celery auto associates tasks created inside another task, +# which bloats the result metadata considerably. trail=False prevents this. +@shared_task( + name="check_for_vespa_sync_task", + soft_time_limit=JOB_TIMEOUT, + trail=False, + bind=True, +) +def check_for_vespa_sync_task(self: Task, *, tenant_id: str | None) -> None: + """Runs periodically to check if any document needs syncing. + Generates sets of tasks for Celery if syncing is needed.""" + + r = get_redis_client(tenant_id=tenant_id) + + lock_beat = r.lock( + DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # these tasks should never overlap + if not lock_beat.acquire(blocking=False): + return + + with get_session_with_tenant(tenant_id) as db_session: + try_generate_stale_document_sync_tasks( + self.app, db_session, r, lock_beat, tenant_id + ) + + # region document set scan + document_set_ids: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + # check if any document sets are not synced + document_set_info = fetch_document_sets( + user_id=None, db_session=db_session, include_outdated=True + ) + + for document_set, _ in document_set_info: + document_set_ids.append(document_set.id) + + for document_set_id in document_set_ids: + with get_session_with_tenant(tenant_id) as db_session: + try_generate_document_set_sync_tasks( + self.app, document_set_id, db_session, r, lock_beat, tenant_id + ) + # endregion + + # check if any user groups are not synced + if global_version.is_ee_version(): + try: + fetch_user_groups = fetch_versioned_implementation( + "danswer.db.user_group", "fetch_user_groups" + ) + except ModuleNotFoundError: + # Always exceptions on the MIT version, which is expected + # We shouldn't actually get here if the ee version check works + pass + else: + usergroup_ids: list[int] = [] + with get_session_with_tenant(tenant_id) as db_session: + user_groups = fetch_user_groups( + db_session=db_session, only_up_to_date=False + ) + + for usergroup in user_groups: + usergroup_ids.append(usergroup.id) + + for usergroup_id in usergroup_ids: + with get_session_with_tenant(tenant_id) as db_session: + try_generate_user_group_sync_tasks( + self.app, usergroup_id, db_session, r, lock_beat, tenant_id + ) + + except 
SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + except Exception: + task_logger.exception(f"Unexpected exception: tenant={tenant_id}") + finally: + if lock_beat.owned(): + lock_beat.release() + + +def try_generate_stale_document_sync_tasks( + celery_app: Celery, + db_session: Session, + r: Redis, + lock_beat: RedisLock, + tenant_id: str | None, +) -> int | None: + # the fence is up, do nothing + if r.exists(RedisConnectorCredentialPair.get_fence_key()): + return None + + r.delete(RedisConnectorCredentialPair.get_taskset_key()) # delete the taskset + + # add tasks to celery and build up the task set to monitor in redis + stale_doc_count = count_documents_by_needs_sync(db_session) + if stale_doc_count == 0: + return None + + task_logger.info( + f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair." + ) + + task_logger.info( + "RedisConnector.generate_tasks starting by cc_pair. " + "Documents spanning multiple cc_pairs will only be synced once." + ) + + docs_to_skip: set[str] = set() + + # rkuo: we could technically sync all stale docs in one big pass. + # but I feel it's more understandable to group the docs by cc_pair + total_tasks_generated = 0 + cc_pairs = get_connector_credential_pairs(db_session) + for cc_pair in cc_pairs: + rc = RedisConnectorCredentialPair(tenant_id, cc_pair.id) + rc.set_skip_docs(docs_to_skip) + result = rc.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id) + + if result is None: + continue + + if result[1] == 0: + continue + + task_logger.info( + f"RedisConnector.generate_tasks finished for single cc_pair. " + f"cc_pair={cc_pair.id} tasks_generated={result[0]} tasks_possible={result[1]}" + ) + + total_tasks_generated += result[0] + + task_logger.info( + f"RedisConnector.generate_tasks finished for all cc_pairs. total_tasks_generated={total_tasks_generated}" + ) + + r.set(RedisConnectorCredentialPair.get_fence_key(), total_tasks_generated) + return total_tasks_generated + + +def try_generate_document_set_sync_tasks( + celery_app: Celery, + document_set_id: int, + db_session: Session, + r: Redis, + lock_beat: RedisLock, + tenant_id: str | None, +) -> int | None: + lock_beat.reacquire() + + rds = RedisDocumentSet(tenant_id, document_set_id) + + # don't generate document set sync tasks if tasks are still pending + if rds.fenced: + return None + + # don't generate sync tasks if we're up to date + # race condition with the monitor/cleanup function if we use a cached result! + document_set = get_document_set_by_id(db_session, document_set_id) + if not document_set: + return None + + if document_set.is_up_to_date: + return None + + # add tasks to celery and build up the task set to monitor in redis + r.delete(rds.taskset_key) + + task_logger.info( + f"RedisDocumentSet.generate_tasks starting. document_set_id={document_set.id}" + ) + + # Add all documents that need to be updated into the queue + result = rds.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id) + if result is None: + return None + + tasks_generated = result[0] + # Currently we are allowing the sync to proceed with 0 tasks. + # It's possible for sets/groups to be generated initially with no entries + # and they still need to be marked as up to date. + # if tasks_generated == 0: + # return 0 + + task_logger.info( + f"RedisDocumentSet.generate_tasks finished. 
" + f"document_set={document_set.id} tasks_generated={tasks_generated}" + ) + + # set this only after all tasks have been added + rds.set_fence(tasks_generated) + return tasks_generated + + +def try_generate_user_group_sync_tasks( + celery_app: Celery, + usergroup_id: int, + db_session: Session, + r: Redis, + lock_beat: RedisLock, + tenant_id: str | None, +) -> int | None: + lock_beat.reacquire() + + rug = RedisUserGroup(tenant_id, usergroup_id) + if rug.fenced: + # don't generate sync tasks if tasks are still pending + return None + + # race condition with the monitor/cleanup function if we use a cached result! + fetch_user_group = fetch_versioned_implementation( + "danswer.db.user_group", "fetch_user_group" + ) + + usergroup = fetch_user_group(db_session, usergroup_id) + if not usergroup: + return None + + if usergroup.is_up_to_date: + return None + + # add tasks to celery and build up the task set to monitor in redis + r.delete(rug.taskset_key) + + # Add all documents that need to be updated into the queue + task_logger.info( + f"RedisUserGroup.generate_tasks starting. usergroup_id={usergroup.id}" + ) + result = rug.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id) + if result is None: + return None + + tasks_generated = result[0] + # Currently we are allowing the sync to proceed with 0 tasks. + # It's possible for sets/groups to be generated initially with no entries + # and they still need to be marked as up to date. + # if tasks_generated == 0: + # return 0 + + task_logger.info( + f"RedisUserGroup.generate_tasks finished. " + f"usergroup={usergroup.id} tasks_generated={tasks_generated}" + ) + + # set this only after all tasks have been added + rug.set_fence(tasks_generated) + return tasks_generated + + +def monitor_connector_taskset(r: Redis) -> None: + fence_value = r.get(RedisConnectorCredentialPair.get_fence_key()) + if fence_value is None: + return + + try: + initial_count = int(cast(int, fence_value)) + except ValueError: + task_logger.error("The value is not an integer.") + return + + count = r.scard(RedisConnectorCredentialPair.get_taskset_key()) + task_logger.info( + f"Stale document sync progress: remaining={count} initial={initial_count}" + ) + if count == 0: + r.delete(RedisConnectorCredentialPair.get_taskset_key()) + r.delete(RedisConnectorCredentialPair.get_fence_key()) + task_logger.info(f"Successfully synced stale documents. count={initial_count}") + + +def monitor_document_set_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session +) -> None: + fence_key = key_bytes.decode("utf-8") + document_set_id_str = RedisDocumentSet.get_id_from_fence_key(fence_key) + if document_set_id_str is None: + task_logger.warning(f"could not parse document set id from {fence_key}") + return + + document_set_id = int(document_set_id_str) + + rds = RedisDocumentSet(tenant_id, document_set_id) + if not rds.fenced: + return + + initial_count = rds.payload + if initial_count is None: + return + + count = cast(int, r.scard(rds.taskset_key)) + task_logger.info( + f"Document set sync progress: document_set={document_set_id} " + f"remaining={count} initial={initial_count}" + ) + if count > 0: + return + + document_set = cast( + DocumentSet, + get_document_set_by_id(db_session=db_session, document_set_id=document_set_id), + ) # casting since we "know" a document set with this ID exists + if document_set: + if not document_set.connector_credential_pairs: + # if there are no connectors, then delete the document set. 
+ delete_document_set(document_set_row=document_set, db_session=db_session) + task_logger.info( + f"Successfully deleted document set: document_set={document_set_id}" + ) + else: + mark_document_set_as_synced(document_set_id, db_session) + task_logger.info( + f"Successfully synced document set: document_set={document_set_id}" + ) + + rds.reset() + + +def monitor_connector_deletion_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis +) -> None: + fence_key = key_bytes.decode("utf-8") + cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key) + if cc_pair_id_str is None: + task_logger.warning(f"could not parse cc_pair_id from {fence_key}") + return + + cc_pair_id = int(cc_pair_id_str) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + + fence_data = redis_connector.delete.payload + if not fence_data: + task_logger.warning( + f"Connector deletion - fence payload invalid: cc_pair={cc_pair_id}" + ) + return + + if fence_data.num_tasks is None: + # the fence is setting up but isn't ready yet + return + + remaining = redis_connector.delete.get_remaining() + task_logger.info( + f"Connector deletion progress: cc_pair={cc_pair_id} remaining={remaining} initial={fence_data.num_tasks}" + ) + if remaining > 0: + return + + with get_session_with_tenant(tenant_id) as db_session: + cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session) + if not cc_pair: + task_logger.warning( + f"Connector deletion - cc_pair not found: cc_pair={cc_pair_id}" + ) + return + + try: + doc_ids = get_document_ids_for_connector_credential_pair( + db_session, cc_pair.connector_id, cc_pair.credential_id + ) + if len(doc_ids) > 0: + # NOTE(rkuo): if this happens, documents somehow got added while + # deletion was in progress. Likely a bug gating off pruning and indexing + # work before deletion starts. + task_logger.warning( + "Connector deletion - documents still found after taskset completion. 
" + "Clearing the current deletion attempt and allowing deletion to restart: " + f"cc_pair={cc_pair_id} " + f"docs_deleted={fence_data.num_tasks} " + f"docs_remaining={len(doc_ids)}" + ) + + # We don't want to waive off why we get into this state, but resetting + # our attempt and letting the deletion restart is a good way to recover + redis_connector.delete.reset() + raise RuntimeError( + "Connector deletion - documents still found after taskset completion" + ) + + # clean up the rest of the related Postgres entities + # index attempts + delete_index_attempts( + db_session=db_session, + cc_pair_id=cc_pair_id, + ) + + # document sets + delete_document_set_cc_pair_relationship__no_commit( + db_session=db_session, + connector_id=cc_pair.connector_id, + credential_id=cc_pair.credential_id, + ) + + # user groups + cleanup_user_groups = fetch_versioned_implementation_with_fallback( + "danswer.db.user_group", + "delete_user_group_cc_pair_relationship__no_commit", + noop_fallback, + ) + cleanup_user_groups( + cc_pair_id=cc_pair_id, + db_session=db_session, + ) + + # finally, delete the cc-pair + delete_connector_credential_pair__no_commit( + db_session=db_session, + connector_id=cc_pair.connector_id, + credential_id=cc_pair.credential_id, + ) + # if there are no credentials left, delete the connector + connector = fetch_connector_by_id( + db_session=db_session, + connector_id=cc_pair.connector_id, + ) + if not connector or not len(connector.credentials): + task_logger.info( + "Connector deletion - Found no credentials left for connector, deleting connector" + ) + db_session.delete(connector) + db_session.commit() + except Exception as e: + db_session.rollback() + stack_trace = traceback.format_exc() + error_message = f"Error: {str(e)}\n\nStack Trace:\n{stack_trace}" + add_deletion_failure_message(db_session, cc_pair_id, error_message) + task_logger.exception( + f"Connector deletion exceptioned: " + f"cc_pair={cc_pair_id} connector={cc_pair.connector_id} credential={cc_pair.credential_id}" + ) + raise e + + task_logger.info( + f"Connector deletion succeeded: " + f"cc_pair={cc_pair_id} " + f"connector={cc_pair.connector_id} " + f"credential={cc_pair.credential_id} " + f"docs_deleted={fence_data.num_tasks}" + ) + + redis_connector.delete.reset() + + +def monitor_ccpair_pruning_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session +) -> None: + fence_key = key_bytes.decode("utf-8") + cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key) + if cc_pair_id_str is None: + task_logger.warning( + f"monitor_ccpair_pruning_taskset: could not parse cc_pair_id from {fence_key}" + ) + return + + cc_pair_id = int(cc_pair_id_str) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + if not redis_connector.prune.fenced: + return + + initial = redis_connector.prune.generator_complete + if initial is None: + return + + remaining = redis_connector.prune.get_remaining() + task_logger.info( + f"Connector pruning progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}" + ) + if remaining > 0: + return + + mark_ccpair_as_pruned(int(cc_pair_id), db_session) + task_logger.info( + f"Successfully pruned connector credential pair. 
cc_pair={cc_pair_id}" + ) + + redis_connector.prune.taskset_clear() + redis_connector.prune.generator_clear() + redis_connector.prune.set_fence(False) + + +def monitor_ccpair_permissions_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session +) -> None: + fence_key = key_bytes.decode("utf-8") + cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key) + if cc_pair_id_str is None: + task_logger.warning( + f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}" + ) + return + + cc_pair_id = int(cc_pair_id_str) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + if not redis_connector.permissions.fenced: + return + + initial = redis_connector.permissions.generator_complete + if initial is None: + return + + remaining = redis_connector.permissions.get_remaining() + task_logger.info( + f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}" + ) + if remaining > 0: + return + + payload: RedisConnectorPermissionSyncData | None = ( + redis_connector.permissions.payload + ) + start_time: datetime | None = payload.started if payload else None + + mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time) + task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}") + + redis_connector.permissions.taskset_clear() + redis_connector.permissions.generator_clear() + redis_connector.permissions.set_fence(None) + + +def monitor_ccpair_indexing_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session +) -> None: + # if the fence doesn't exist, there's nothing to do + fence_key = key_bytes.decode("utf-8") + composite_id = RedisConnector.get_id_from_fence_key(fence_key) + if composite_id is None: + task_logger.warning( + f"monitor_ccpair_indexing_taskset: could not parse composite_id from {fence_key}" + ) + return + + # parse out metadata and initialize the helper class with it + parts = composite_id.split("/") + if len(parts) != 2: + return + + cc_pair_id = int(parts[0]) + search_settings_id = int(parts[1]) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + redis_connector_index = redis_connector.new_index(search_settings_id) + if not redis_connector_index.fenced: + return + + payload = redis_connector_index.payload + if not payload: + return + + elapsed_submitted = datetime.now(timezone.utc) - payload.submitted + + progress = redis_connector_index.get_progress() + if progress is not None: + task_logger.info( + f"Connector indexing progress: cc_pair={cc_pair_id} " + f"search_settings={search_settings_id} " + f"progress={progress} " + f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}" + ) + + if payload.index_attempt_id is None or payload.celery_task_id is None: + # the task is still setting up + return + + # never use any blocking methods on the result from inside a task! + result: AsyncResult = AsyncResult(payload.celery_task_id) + + # inner/outer/inner double check pattern to avoid race conditions when checking for + # bad state + + # inner = get_completion / generator_complete not signaled + # outer = result.state in READY state + status_int = redis_connector_index.get_completion() + if status_int is None: # inner signal not set ... possible error + result_state = result.state + if ( + result_state in READY_STATES + ): # outer signal in terminal state ... possible error + # Now double check! 
+ if redis_connector_index.get_completion() is None: + # inner signal still not set (and cannot change when outer result_state is READY) + # Task is finished but generator complete isn't set. + # We have a problem! Worker may have crashed. + + msg = ( + f"Connector indexing aborted or exceptioned: " + f"attempt={payload.index_attempt_id} " + f"celery_task={payload.celery_task_id} " + f"result_state={result_state} " + f"cc_pair={cc_pair_id} " + f"search_settings={search_settings_id} " + f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}" + ) + task_logger.warning(msg) + + index_attempt = get_index_attempt(db_session, payload.index_attempt_id) + if index_attempt: + mark_attempt_failed( + index_attempt_id=payload.index_attempt_id, + db_session=db_session, + failure_reason=msg, + ) + + redis_connector_index.reset() + return + + status_enum = HTTPStatus(status_int) + + task_logger.info( + f"Connector indexing finished: cc_pair={cc_pair_id} " + f"search_settings={search_settings_id} " + f"status={status_enum.name} " + f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}" + ) + + redis_connector_index.reset() + + +@shared_task(name="monitor_vespa_sync", soft_time_limit=300, bind=True) +def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool: + """This is a celery beat task that monitors and finalizes metadata sync tasksets. + It scans for fence values and then gets the counts of any associated tasksets. + If the count is 0, that means all tasks finished and we should clean up. + + This task lock timeout is CELERY_METADATA_SYNC_BEAT_LOCK_TIMEOUT seconds, so don't + do anything too expensive in this function! + + Returns True if the task actually did work, False if it exited early to prevent overlap + """ + r = get_redis_client(tenant_id=tenant_id) + + lock_beat: RedisLock = r.lock( + DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK, + timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT, + ) + + try: + # prevent overlapping tasks + if not lock_beat.acquire(blocking=False): + return False + + # print current queue lengths + r_celery = self.app.broker_connection().channel().client # type: ignore + n_celery = celery_get_queue_length("celery", r) + n_indexing = celery_get_queue_length( + DanswerCeleryQueues.CONNECTOR_INDEXING, r_celery + ) + n_sync = celery_get_queue_length( + DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery + ) + n_deletion = celery_get_queue_length( + DanswerCeleryQueues.CONNECTOR_DELETION, r_celery + ) + n_pruning = celery_get_queue_length( + DanswerCeleryQueues.CONNECTOR_PRUNING, r_celery + ) + n_permissions_sync = celery_get_queue_length( + DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC, r_celery + ) + + task_logger.info( + f"Queue lengths: celery={n_celery} " + f"indexing={n_indexing} " + f"sync={n_sync} " + f"deletion={n_deletion} " + f"pruning={n_pruning} " + f"permissions_sync={n_permissions_sync} " + ) + + lock_beat.reacquire() + if r.exists(RedisConnectorCredentialPair.get_fence_key()): + monitor_connector_taskset(r) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisConnectorDelete.FENCE_PREFIX + "*"): + lock_beat.reacquire() + monitor_connector_deletion_taskset(tenant_id, key_bytes, r) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"): + lock_beat.reacquire() + with get_session_with_tenant(tenant_id) as db_session: + monitor_document_set_taskset(tenant_id, key_bytes, r, db_session) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"): + lock_beat.reacquire() + 
monitor_usergroup_taskset = fetch_versioned_implementation_with_fallback( + "danswer.background.celery.tasks.vespa.tasks", + "monitor_usergroup_taskset", + noop_fallback, + ) + with get_session_with_tenant(tenant_id) as db_session: + monitor_usergroup_taskset(tenant_id, key_bytes, r, db_session) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisConnectorPrune.FENCE_PREFIX + "*"): + lock_beat.reacquire() + with get_session_with_tenant(tenant_id) as db_session: + monitor_ccpair_pruning_taskset(tenant_id, key_bytes, r, db_session) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisConnectorIndex.FENCE_PREFIX + "*"): + lock_beat.reacquire() + with get_session_with_tenant(tenant_id) as db_session: + monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session) + + lock_beat.reacquire() + for key_bytes in r.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"): + lock_beat.reacquire() + with get_session_with_tenant(tenant_id) as db_session: + monitor_ccpair_permissions_taskset(tenant_id, key_bytes, r, db_session) + + # uncomment for debugging if needed + # r_celery = celery_app.broker_connection().channel().client + # length = celery_get_queue_length(DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery) + # task_logger.warning(f"queue={DanswerCeleryQueues.VESPA_METADATA_SYNC} length={length}") + except SoftTimeLimitExceeded: + task_logger.info( + "Soft time limit exceeded, task is being terminated gracefully." + ) + finally: + if lock_beat.owned(): + lock_beat.release() + + return True + + +@shared_task( + name="vespa_metadata_sync_task", + bind=True, + soft_time_limit=LIGHT_SOFT_TIME_LIMIT, + time_limit=LIGHT_TIME_LIMIT, + max_retries=3, +) +def vespa_metadata_sync_task( + self: Task, document_id: str, tenant_id: str | None +) -> bool: + try: + with get_session_with_tenant(tenant_id) as db_session: + curr_ind_name, sec_ind_name = get_both_index_names(db_session) + doc_index = get_default_document_index( + primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name + ) + + retry_index = RetryDocumentIndex(doc_index) + + doc = get_document(document_id, db_session) + if not doc: + return False + + # document set sync + doc_sets = fetch_document_sets_for_document(document_id, db_session) + update_doc_sets: set[str] = set(doc_sets) + + # User group sync + doc_access = get_access_for_document( + document_id=document_id, db_session=db_session + ) + + fields = VespaDocumentFields( + document_sets=update_doc_sets, + access=doc_access, + boost=doc.boost, + hidden=doc.hidden, + ) + + # update Vespa. OK if doc doesn't exist. Raises exception otherwise. + chunks_affected = retry_index.update_single(document_id, fields) + + # update db last. Worst case = we crash right before this and + # the sync might repeat again later + mark_document_as_synced(document_id, db_session) + + task_logger.info( + f"tenant={tenant_id} doc={document_id} action=sync chunks={chunks_affected}" + ) + except SoftTimeLimitExceeded: + task_logger.info( + f"SoftTimeLimitExceeded exception. 
tenant={tenant_id} doc={document_id}" + ) + except Exception as ex: + if isinstance(ex, RetryError): + task_logger.warning( + f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}" + ) + + # only set the inner exception if it is of type Exception + e_temp = ex.last_attempt.exception() + if isinstance(e_temp, Exception): + e = e_temp + else: + e = ex + + if isinstance(e, httpx.HTTPStatusError): + if e.response.status_code == HTTPStatus.BAD_REQUEST: + task_logger.exception( + f"Non-retryable HTTPStatusError: " + f"tenant={tenant_id} " + f"doc={document_id} " + f"status={e.response.status_code}" + ) + return False + + task_logger.exception( + f"Unexpected exception: tenant={tenant_id} doc={document_id}" + ) + + # Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64 + countdown = 2 ** (self.request.retries + 4) + self.retry(exc=e, countdown=countdown) + + return True diff --git a/backend/danswer/background/celery/versioned_apps/beat.py b/backend/danswer/background/celery/versioned_apps/beat.py new file mode 100644 index 00000000000..af407f93c64 --- /dev/null +++ b/backend/danswer/background/celery/versioned_apps/beat.py @@ -0,0 +1,6 @@ +"""Factory stub for running celery worker / celery beat.""" +from danswer.background.celery.apps.beat import celery_app +from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable + +set_is_ee_based_on_env_variable() +app = celery_app diff --git a/backend/danswer/background/celery/versioned_apps/heavy.py b/backend/danswer/background/celery/versioned_apps/heavy.py new file mode 100644 index 00000000000..c2b58a53bfc --- /dev/null +++ b/backend/danswer/background/celery/versioned_apps/heavy.py @@ -0,0 +1,17 @@ +"""Factory stub for running celery worker / celery beat. +This code is different from the primary/beat stubs because there is no EE version to +fetch. Port over the code in those files if we add an EE version of this worker.""" +from celery import Celery + +from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable + +set_is_ee_based_on_env_variable() + + +def get_app() -> Celery: + from danswer.background.celery.apps.heavy import celery_app + + return celery_app + + +app = get_app() diff --git a/backend/danswer/background/celery/versioned_apps/indexing.py b/backend/danswer/background/celery/versioned_apps/indexing.py new file mode 100644 index 00000000000..ed26fc548bc --- /dev/null +++ b/backend/danswer/background/celery/versioned_apps/indexing.py @@ -0,0 +1,17 @@ +"""Factory stub for running celery worker / celery beat. +This code is different from the primary/beat stubs because there is no EE version to +fetch. Port over the code in those files if we add an EE version of this worker.""" +from celery import Celery + +from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable + +set_is_ee_based_on_env_variable() + + +def get_app() -> Celery: + from danswer.background.celery.apps.indexing import celery_app + + return celery_app + + +app = get_app() diff --git a/backend/danswer/background/celery/versioned_apps/light.py b/backend/danswer/background/celery/versioned_apps/light.py new file mode 100644 index 00000000000..3d229431ce5 --- /dev/null +++ b/backend/danswer/background/celery/versioned_apps/light.py @@ -0,0 +1,17 @@ +"""Factory stub for running celery worker / celery beat. +This code is different from the primary/beat stubs because there is no EE version to +fetch. 
Port over the code in those files if we add an EE version of this worker.""" +from celery import Celery + +from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable + +set_is_ee_based_on_env_variable() + + +def get_app() -> Celery: + from danswer.background.celery.apps.light import celery_app + + return celery_app + + +app = get_app() diff --git a/backend/danswer/background/celery/celery_run.py b/backend/danswer/background/celery/versioned_apps/primary.py similarity index 55% rename from backend/danswer/background/celery/celery_run.py rename to backend/danswer/background/celery/versioned_apps/primary.py index 0fdb2f044a8..2d97caa3da5 100644 --- a/backend/danswer/background/celery/celery_run.py +++ b/backend/danswer/background/celery/versioned_apps/primary.py @@ -1,9 +1,8 @@ -"""Entry point for running celery worker / celery beat.""" +"""Factory stub for running celery worker / celery beat.""" from danswer.utils.variable_functionality import fetch_versioned_implementation from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable - set_is_ee_based_on_env_variable() -celery_app = fetch_versioned_implementation( - "danswer.background.celery.celery_app", "celery_app" +app = fetch_versioned_implementation( + "danswer.background.celery.apps.primary", "celery_app" ) diff --git a/backend/danswer/background/connector_deletion.py b/backend/danswer/background/connector_deletion.py deleted file mode 100644 index 47a3477e6da..00000000000 --- a/backend/danswer/background/connector_deletion.py +++ /dev/null @@ -1,110 +0,0 @@ -""" -To delete a connector / credential pair: -(1) find all documents associated with connector / credential pair where there -this the is only connector / credential pair that has indexed it -(2) delete all documents from document stores -(3) delete all entries from postgres -(4) find all documents associated with connector / credential pair where there -are multiple connector / credential pairs that have indexed it -(5) update document store entries to remove access associated with the -connector / credential pair from the access list -(6) delete all relevant entries from postgres -""" -from sqlalchemy.orm import Session - -from danswer.access.access import get_access_for_documents -from danswer.db.document import delete_documents_by_connector_credential_pair__no_commit -from danswer.db.document import delete_documents_complete__no_commit -from danswer.db.document import get_document_connector_counts -from danswer.db.document import prepare_to_modify_documents -from danswer.db.document_set import fetch_document_sets_for_documents -from danswer.db.engine import get_sqlalchemy_engine -from danswer.document_index.interfaces import DocumentIndex -from danswer.document_index.interfaces import UpdateRequest -from danswer.server.documents.models import ConnectorCredentialPairIdentifier -from danswer.utils.logger import setup_logger - -logger = setup_logger() - -_DELETION_BATCH_SIZE = 1000 - - -def delete_connector_credential_pair_batch( - document_ids: list[str], - connector_id: int, - credential_id: int, - document_index: DocumentIndex, -) -> None: - """ - Removes a batch of documents ids from a cc-pair. If no other cc-pair uses a document anymore - it gets permanently deleted. 
- """ - with Session(get_sqlalchemy_engine()) as db_session: - # acquire lock for all documents in this batch so that indexing can't - # override the deletion - with prepare_to_modify_documents( - db_session=db_session, document_ids=document_ids - ): - document_connector_counts = get_document_connector_counts( - db_session=db_session, document_ids=document_ids - ) - - # figure out which docs need to be completely deleted - document_ids_to_delete = [ - document_id - for document_id, cnt in document_connector_counts - if cnt == 1 - ] - logger.debug(f"Deleting documents: {document_ids_to_delete}") - - document_index.delete(doc_ids=document_ids_to_delete) - - delete_documents_complete__no_commit( - db_session=db_session, - document_ids=document_ids_to_delete, - ) - - # figure out which docs need to be updated - document_ids_to_update = [ - document_id for document_id, cnt in document_connector_counts if cnt > 1 - ] - - # maps document id to list of document set names - new_doc_sets_for_documents: dict[str, set[str]] = { - document_id_and_document_set_names_tuple[0]: set( - document_id_and_document_set_names_tuple[1] - ) - for document_id_and_document_set_names_tuple in fetch_document_sets_for_documents( - db_session=db_session, - document_ids=document_ids_to_update, - ) - } - - # determine future ACLs for documents in batch - access_for_documents = get_access_for_documents( - document_ids=document_ids_to_update, - db_session=db_session, - ) - - # update Vespa - logger.debug(f"Updating documents: {document_ids_to_update}") - update_requests = [ - UpdateRequest( - document_ids=[document_id], - access=access, - document_sets=new_doc_sets_for_documents[document_id], - ) - for document_id, access in access_for_documents.items() - ] - document_index.update(update_requests=update_requests) - - # clean up Postgres - delete_documents_by_connector_credential_pair__no_commit( - db_session=db_session, - document_ids=document_ids_to_update, - connector_credential_pair_identifier=ConnectorCredentialPairIdentifier( - connector_id=connector_id, - credential_id=credential_id, - ), - ) - db_session.commit() diff --git a/backend/danswer/background/indexing/job_client.py b/backend/danswer/background/indexing/job_client.py index 68d706895fd..602ec4294c0 100644 --- a/backend/danswer/background/indexing/job_client.py +++ b/backend/danswer/background/indexing/job_client.py @@ -11,7 +11,8 @@ from typing import Literal from typing import Optional -from danswer.db.engine import get_sqlalchemy_engine +from danswer.configs.constants import POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME +from danswer.db.engine import SqlEngine from danswer.utils.logger import setup_logger logger = setup_logger() @@ -28,16 +29,26 @@ def _initializer( func: Callable, args: list | tuple, kwargs: dict[str, Any] | None = None ) -> Any: - """Ensure the parent proc's database connections are not touched - in the new connection pool + """Initialize the child process with a fresh SQLAlchemy Engine. 
- Based on the recommended approach in the SQLAlchemy docs found: + Based on SQLAlchemy's recommendations to handle multiprocessing: https://docs.sqlalchemy.org/en/20/core/pooling.html#using-connection-pools-with-multiprocessing-or-os-fork """ if kwargs is None: kwargs = {} - get_sqlalchemy_engine().dispose(close=False) + logger.info("Initializing spawned worker child process.") + + # Reset the engine in the child process + SqlEngine.reset_engine() + + # Optionally set a custom app name for database logging purposes + SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME) + + # Initialize a new engine with desired parameters + SqlEngine.init_engine(pool_size=4, max_overflow=12, pool_recycle=60) + + # Proceed with executing the target function return func(*args, **kwargs) diff --git a/backend/danswer/background/indexing/run_indexing.py b/backend/danswer/background/indexing/run_indexing.py index 86b4285361f..699e4682caa 100644 --- a/backend/danswer/background/indexing/run_indexing.py +++ b/backend/danswer/background/indexing/run_indexing.py @@ -14,24 +14,25 @@ from danswer.connectors.connector_runner import ConnectorRunner from danswer.connectors.factory import instantiate_connector from danswer.connectors.models import IndexAttemptMetadata +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id from danswer.db.connector_credential_pair import get_last_successful_attempt_time from danswer.db.connector_credential_pair import update_connector_credential_pair -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.db.enums import ConnectorCredentialPairStatus -from danswer.db.index_attempt import get_index_attempt from danswer.db.index_attempt import mark_attempt_failed -from danswer.db.index_attempt import mark_attempt_in_progress from danswer.db.index_attempt import mark_attempt_partially_succeeded from danswer.db.index_attempt import mark_attempt_succeeded +from danswer.db.index_attempt import transition_attempt_to_in_progress from danswer.db.index_attempt import update_docs_indexed from danswer.db.models import IndexAttempt from danswer.db.models import IndexingStatus from danswer.db.models import IndexModelStatus from danswer.document_index.factory import get_default_document_index from danswer.indexing.embedder import DefaultIndexingEmbedder +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface from danswer.indexing.indexing_pipeline import build_indexing_pipeline -from danswer.utils.logger import IndexAttemptSingleton from danswer.utils.logger import setup_logger +from danswer.utils.logger import TaskAttemptSingleton from danswer.utils.variable_functionality import global_version logger = setup_logger() @@ -44,11 +45,12 @@ def _get_connector_runner( attempt: IndexAttempt, start_time: datetime, end_time: datetime, + tenant_id: str | None, ) -> ConnectorRunner: """ NOTE: `start_time` and `end_time` are only used for poll connectors - Returns an interator of document batches and whether the returned documents + Returns an iterator of document batches and whether the returned documents are the complete list of existing documents of the connector. If the task of type LOAD_STATE, the list will be considered complete and otherwise incomplete. 
""" @@ -56,22 +58,28 @@ def _get_connector_runner( try: runnable_connector = instantiate_connector( - attempt.connector_credential_pair.connector.source, - task, - attempt.connector_credential_pair.connector.connector_specific_config, - attempt.connector_credential_pair.credential, - db_session, + db_session=db_session, + source=attempt.connector_credential_pair.connector.source, + input_type=task, + connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config, + credential=attempt.connector_credential_pair.credential, + tenant_id=tenant_id, ) except Exception as e: logger.exception(f"Unable to instantiate connector due to {e}") # since we failed to even instantiate the connector, we pause the CCPair since # it will never succeed - update_connector_credential_pair( - db_session=db_session, - connector_id=attempt.connector_credential_pair.connector.id, - credential_id=attempt.connector_credential_pair.credential.id, - status=ConnectorCredentialPairStatus.PAUSED, + + cc_pair = get_connector_credential_pair_from_id( + attempt.connector_credential_pair.id, db_session ) + if cc_pair and cc_pair.status == ConnectorCredentialPairStatus.ACTIVE: + update_connector_credential_pair( + db_session=db_session, + connector_id=attempt.connector_credential_pair.connector.id, + credential_id=attempt.connector_credential_pair.credential.id, + status=ConnectorCredentialPairStatus.PAUSED, + ) raise e return ConnectorRunner( @@ -82,15 +90,26 @@ def _get_connector_runner( def _run_indexing( db_session: Session, index_attempt: IndexAttempt, + tenant_id: str | None, + callback: IndexingHeartbeatInterface | None = None, ) -> None: """ 1. Get documents which are either new or updated from specified application 2. Embed and index these documents into the chosen datastore (vespa) 3. Updates Postgres to record the indexed documents + the outcome of this run + + TODO: do not change index attempt statuses here ... instead, set signals in redis + and allow the monitor function to clean them up """ start_time = time.time() + if index_attempt.search_settings is None: + raise ValueError( + "Search settings must be set for indexing. This should not be possible." + ) + search_settings = index_attempt.search_settings + index_name = search_settings.index_name # Only update cc-pair status for primary index jobs @@ -103,16 +122,21 @@ def _run_indexing( ) embedding_model = DefaultIndexingEmbedder.from_db_search_settings( - search_settings=search_settings + search_settings=search_settings, + callback=callback, ) indexing_pipeline = build_indexing_pipeline( attempt_id=index_attempt.id, embedder=embedding_model, document_index=document_index, - ignore_time_skip=index_attempt.from_beginning - or (search_settings.status == IndexModelStatus.FUTURE), + ignore_time_skip=( + index_attempt.from_beginning + or (search_settings.status == IndexModelStatus.FUTURE) + ), db_session=db_session, + tenant_id=tenant_id, + callback=callback, ) db_cc_pair = index_attempt.connector_credential_pair @@ -169,6 +193,7 @@ def _run_indexing( attempt=index_attempt, start_time=window_start, end_time=window_end, + tenant_id=tenant_id, ) all_connector_doc_ids: set[str] = set() @@ -181,7 +206,14 @@ def _run_indexing( # index being built. We want to populate it even for paused connectors # Often paused connectors are sources that aren't updated frequently but the # contents still need to be initially pulled. 
- db_session.refresh(db_connector) + if callback: + if callback.should_stop(): + raise RuntimeError( + "_run_indexing: Connector stop signal detected" + ) + + # TODO: should we move this into the above callback instead? + db_session.refresh(db_cc_pair) if ( ( db_cc_pair.status == ConnectorCredentialPairStatus.PAUSED @@ -196,7 +228,9 @@ def _run_indexing( db_session.refresh(index_attempt) if index_attempt.status != IndexingStatus.IN_PROGRESS: # Likely due to user manually disabling it or model swap - raise RuntimeError("Index Attempt was canceled") + raise RuntimeError( + f"Index Attempt was canceled, status is {index_attempt.status}" + ) batch_description = [] for doc in doc_batch: @@ -216,6 +250,8 @@ def _run_indexing( logger.debug(f"Indexing batch of documents: {batch_description}") index_attempt_md.batch_num = batch_num + 1 # use 1-index for this + + # real work happens here! new_docs, total_batch_chunks = indexing_pipeline( document_batch=doc_batch, index_attempt_metadata=index_attempt_md, @@ -234,6 +270,9 @@ def _run_indexing( # be inaccurate db_session.commit() + if callback: + callback.progress("_run_indexing", len(doc_batch)) + # This new value is updated every batch, so UI can refresh per batch update update_docs_indexed( db_session=db_session, @@ -280,7 +319,7 @@ def _run_indexing( or index_attempt.status != IndexingStatus.IN_PROGRESS ): mark_attempt_failed( - index_attempt, + index_attempt.id, db_session, failure_reason=str(e), full_exception_trace=traceback.format_exc(), @@ -315,7 +354,7 @@ def _run_indexing( and index_attempt_md.num_exceptions >= batch_num ): mark_attempt_failed( - index_attempt, + index_attempt.id, db_session, failure_reason="All batches exceptioned.", ) @@ -357,69 +396,45 @@ def _run_indexing( ) -def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexAttempt: - # make sure that the index attempt can't change in between checking the - # status and marking it as in_progress. This setting will be discarded - # after the next commit: - # https://docs.sqlalchemy.org/en/20/orm/session_transaction.html#setting-isolation-for-individual-transactions - db_session.connection(execution_options={"isolation_level": "SERIALIZABLE"}) # type: ignore - - attempt = get_index_attempt( - db_session=db_session, - index_attempt_id=index_attempt_id, - ) - - if attempt is None: - raise RuntimeError(f"Unable to find IndexAttempt for ID '{index_attempt_id}'") - - if attempt.status != IndexingStatus.NOT_STARTED: - raise RuntimeError( - f"Indexing attempt with ID '{index_attempt_id}' is not in NOT_STARTED status. " - f"Current status is '{attempt.status}'." - ) - - # only commit once, to make sure this all happens in a single transaction - mark_attempt_in_progress(attempt, db_session) - - return attempt - - def run_indexing_entrypoint( - index_attempt_id: int, connector_credential_pair_id: int, is_ee: bool = False + index_attempt_id: int, + tenant_id: str | None, + connector_credential_pair_id: int, + is_ee: bool = False, + callback: IndexingHeartbeatInterface | None = None, ) -> None: - """Entrypoint for indexing run when using dask distributed. 
- Wraps the actual logic in a `try` block so that we can catch any exceptions - and mark the attempt as failed.""" - try: if is_ee: global_version.set_ee() # set the indexing attempt ID so that all log messages from this process # will have it added as a prefix - IndexAttemptSingleton.set_cc_and_index_id( + TaskAttemptSingleton.set_cc_and_index_id( index_attempt_id, connector_credential_pair_id ) + with get_session_with_tenant(tenant_id) as db_session: + attempt = transition_attempt_to_in_progress(index_attempt_id, db_session) - with Session(get_sqlalchemy_engine()) as db_session: - # make sure that it is valid to run this indexing attempt + mark it - # as in progress - attempt = _prepare_index_attempt(db_session, index_attempt_id) + tenant_str = "" + if tenant_id is not None: + tenant_str = f" for tenant {tenant_id}" logger.info( - f"Indexing starting: " + f"Indexing starting{tenant_str}: " f"connector='{attempt.connector_credential_pair.connector.name}' " f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' " f"credentials='{attempt.connector_credential_pair.connector_id}'" ) - _run_indexing(db_session, attempt) + _run_indexing(db_session, attempt, tenant_id, callback) logger.info( - f"Indexing finished: " + f"Indexing finished{tenant_str}: " f"connector='{attempt.connector_credential_pair.connector.name}' " f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' " f"credentials='{attempt.connector_credential_pair.connector_id}'" ) except Exception as e: - logger.exception(f"Indexing job with ID '{index_attempt_id}' failed due to {e}") + logger.exception( + f"Indexing job with ID '{index_attempt_id}' for tenant {tenant_id} failed due to {e}" + ) diff --git a/backend/danswer/background/task_utils.py b/backend/danswer/background/task_utils.py index c1c24bf92a1..f4562892460 100644 --- a/backend/danswer/background/task_utils.py +++ b/backend/danswer/background/task_utils.py @@ -14,15 +14,6 @@ from danswer.db.tasks import register_task -def name_cc_prune_task( - connector_id: int | None = None, credential_id: int | None = None -) -> str: - task_name = f"prune_connector_credential_pair_{connector_id}_{credential_id}" - if not connector_id or not credential_id: - task_name = "prune_connector_credential_pair" - return task_name - - T = TypeVar("T", bound=Callable) diff --git a/backend/danswer/background/update.py b/backend/danswer/background/update.py deleted file mode 100755 index 5fde7cb3da0..00000000000 --- a/backend/danswer/background/update.py +++ /dev/null @@ -1,475 +0,0 @@ -import logging -import time -from datetime import datetime - -import dask -from dask.distributed import Client -from dask.distributed import Future -from distributed import LocalCluster -from sqlalchemy.orm import Session - -from danswer.background.indexing.dask_utils import ResourceLogger -from danswer.background.indexing.job_client import SimpleJob -from danswer.background.indexing.job_client import SimpleJobClient -from danswer.background.indexing.run_indexing import run_indexing_entrypoint -from danswer.configs.app_configs import CLEANUP_INDEXING_JOBS_TIMEOUT -from danswer.configs.app_configs import DASK_JOB_CLIENT_ENABLED -from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP -from danswer.configs.app_configs import NUM_INDEXING_WORKERS -from danswer.configs.app_configs import NUM_SECONDARY_INDEXING_WORKERS -from danswer.configs.constants import DocumentSource -from danswer.configs.constants import POSTGRES_INDEXER_APP_NAME -from 
danswer.db.connector import fetch_connectors -from danswer.db.connector_credential_pair import fetch_connector_credential_pairs -from danswer.db.engine import get_db_current_time -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.engine import init_sqlalchemy_engine -from danswer.db.index_attempt import create_index_attempt -from danswer.db.index_attempt import get_index_attempt -from danswer.db.index_attempt import get_inprogress_index_attempts -from danswer.db.index_attempt import get_last_attempt_for_cc_pair -from danswer.db.index_attempt import get_not_started_index_attempts -from danswer.db.index_attempt import mark_attempt_failed -from danswer.db.models import ConnectorCredentialPair -from danswer.db.models import IndexAttempt -from danswer.db.models import IndexingStatus -from danswer.db.models import IndexModelStatus -from danswer.db.models import SearchSettings -from danswer.db.search_settings import get_current_search_settings -from danswer.db.search_settings import get_secondary_search_settings -from danswer.db.swap_index import check_index_swap -from danswer.natural_language_processing.search_nlp_models import EmbeddingModel -from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder -from danswer.utils.logger import setup_logger -from danswer.utils.variable_functionality import global_version -from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable -from shared_configs.configs import INDEXING_MODEL_SERVER_HOST -from shared_configs.configs import LOG_LEVEL -from shared_configs.configs import MODEL_SERVER_PORT - -logger = setup_logger() - -# If the indexing dies, it's most likely due to resource constraints, -# restarting just delays the eventual failure, not useful to the user -dask.config.set({"distributed.scheduler.allowed-failures": 0}) - -_UNEXPECTED_STATE_FAILURE_REASON = ( - "Stopped mid run, likely due to the background process being killed" -) - - -def _should_create_new_indexing( - cc_pair: ConnectorCredentialPair, - last_index: IndexAttempt | None, - search_settings_instance: SearchSettings, - secondary_index_building: bool, - db_session: Session, -) -> bool: - connector = cc_pair.connector - - # don't kick off indexing for `NOT_APPLICABLE` sources - if connector.source == DocumentSource.NOT_APPLICABLE: - return False - - # User can still manually create single indexing attempts via the UI for the - # currently in use index - if DISABLE_INDEX_UPDATE_ON_SWAP: - if ( - search_settings_instance.status == IndexModelStatus.PRESENT - and secondary_index_building - ): - return False - - # When switching over models, always index at least once - if search_settings_instance.status == IndexModelStatus.FUTURE: - if last_index: - # No new index if the last index attempt succeeded - # Once is enough. The model will never be able to swap otherwise. 
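The scheduling rule being removed in this block boils down to one time comparison: only create a new index attempt once the connector's refresh_freq has elapsed since the last attempt was updated. A self-contained condensation of just that check follows; the function name and arguments are illustrative only, not part of the codebase.

# Condensed sketch of the time-based rule from the removed _should_create_new_indexing;
# names and signature below are illustrative assumptions.
from datetime import datetime, timedelta


def refresh_period_elapsed(
    last_index_time_updated: datetime,
    refresh_freq_seconds: int,
    current_db_time: datetime,
) -> bool:
    # mirrors: time_since_index.total_seconds() >= connector.refresh_freq
    return current_db_time - last_index_time_updated >= timedelta(seconds=refresh_freq_seconds)


# e.g. an hourly connector last touched 90 minutes ago is due for another attempt
assert refresh_period_elapsed(datetime(2024, 1, 1, 0, 0), 3600, datetime(2024, 1, 1, 1, 30))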
- if last_index.status == IndexingStatus.SUCCESS: - return False - - # No new index if the last index attempt is waiting to start - if last_index.status == IndexingStatus.NOT_STARTED: - return False - - # No new index if the last index attempt is running - if last_index.status == IndexingStatus.IN_PROGRESS: - return False - else: - if connector.id == 0: # Ingestion API - return False - return True - - # If the connector is paused or is the ingestion API, don't index - # NOTE: during an embedding model switch over, the following logic - # is bypassed by the above check for a future model - if not cc_pair.status.is_active() or connector.id == 0: - return False - - if not last_index: - return True - - if connector.refresh_freq is None: - return False - - # Only one scheduled/ongoing job per connector at a time - # this prevents cases where - # (1) the "latest" index_attempt is scheduled so we show - # that in the UI despite another index_attempt being in-progress - # (2) multiple scheduled index_attempts at a time - if ( - last_index.status == IndexingStatus.NOT_STARTED - or last_index.status == IndexingStatus.IN_PROGRESS - ): - return False - - current_db_time = get_db_current_time(db_session) - time_since_index = current_db_time - last_index.time_updated - return time_since_index.total_seconds() >= connector.refresh_freq - - -def _mark_run_failed( - db_session: Session, index_attempt: IndexAttempt, failure_reason: str -) -> None: - """Marks the `index_attempt` row as failed + updates the ` - connector_credential_pair` to reflect that the run failed""" - logger.warning( - f"Marking in-progress attempt 'connector: {index_attempt.connector_credential_pair.connector_id}, " - f"credential: {index_attempt.connector_credential_pair.credential_id}' as failed due to {failure_reason}" - ) - mark_attempt_failed( - index_attempt=index_attempt, - db_session=db_session, - failure_reason=failure_reason, - ) - - -"""Main funcs""" - - -def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None: - """Creates new indexing jobs for each connector / credential pair which is: - 1. Enabled - 2. `refresh_frequency` time has passed since the last indexing run for this pair - 3. 
There is not already an ongoing indexing attempt for this pair - """ - with Session(get_sqlalchemy_engine()) as db_session: - ongoing: set[tuple[int | None, int]] = set() - for attempt_id in existing_jobs: - attempt = get_index_attempt( - db_session=db_session, index_attempt_id=attempt_id - ) - if attempt is None: - logger.error( - f"Unable to find IndexAttempt for ID '{attempt_id}' when creating " - "indexing jobs" - ) - continue - ongoing.add( - ( - attempt.connector_credential_pair_id, - attempt.search_settings_id, - ) - ) - - # Get the primary search settings - primary_search_settings = get_current_search_settings(db_session) - search_settings = [primary_search_settings] - - # Check for secondary search settings - secondary_search_settings = get_secondary_search_settings(db_session) - if secondary_search_settings is not None: - # If secondary settings exist, add them to the list - search_settings.append(secondary_search_settings) - - all_connector_credential_pairs = fetch_connector_credential_pairs(db_session) - for cc_pair in all_connector_credential_pairs: - for search_settings_instance in search_settings: - # Check if there is an ongoing indexing attempt for this connector credential pair - if (cc_pair.id, search_settings_instance.id) in ongoing: - continue - - last_attempt = get_last_attempt_for_cc_pair( - cc_pair.id, search_settings_instance.id, db_session - ) - if not _should_create_new_indexing( - cc_pair=cc_pair, - last_index=last_attempt, - search_settings_instance=search_settings_instance, - secondary_index_building=len(search_settings) > 1, - db_session=db_session, - ): - continue - - create_index_attempt( - cc_pair.id, search_settings_instance.id, db_session - ) - - -def cleanup_indexing_jobs( - existing_jobs: dict[int, Future | SimpleJob], - timeout_hours: int = CLEANUP_INDEXING_JOBS_TIMEOUT, -) -> dict[int, Future | SimpleJob]: - existing_jobs_copy = existing_jobs.copy() - - # clean up completed jobs - with Session(get_sqlalchemy_engine()) as db_session: - for attempt_id, job in existing_jobs.items(): - index_attempt = get_index_attempt( - db_session=db_session, index_attempt_id=attempt_id - ) - - # do nothing for ongoing jobs that haven't been stopped - if not job.done(): - if not index_attempt: - continue - - if not index_attempt.is_finished(): - continue - - if job.status == "error": - logger.error(job.exception()) - - job.release() - del existing_jobs_copy[attempt_id] - - if not index_attempt: - logger.error( - f"Unable to find IndexAttempt for ID '{attempt_id}' when cleaning " - "up indexing jobs" - ) - continue - - if ( - index_attempt.status == IndexingStatus.IN_PROGRESS - or job.status == "error" - ): - _mark_run_failed( - db_session=db_session, - index_attempt=index_attempt, - failure_reason=_UNEXPECTED_STATE_FAILURE_REASON, - ) - - # clean up in-progress jobs that were never completed - connectors = fetch_connectors(db_session) - for connector in connectors: - in_progress_indexing_attempts = get_inprogress_index_attempts( - connector.id, db_session - ) - for index_attempt in in_progress_indexing_attempts: - if index_attempt.id in existing_jobs: - # If index attempt is canceled, stop the run - if index_attempt.status == IndexingStatus.FAILED: - existing_jobs[index_attempt.id].cancel() - # check to see if the job has been updated in last `timeout_hours` hours, if not - # assume it to frozen in some bad state and just mark it as failed. 
Note: this relies - # on the fact that the `time_updated` field is constantly updated every - # batch of documents indexed - current_db_time = get_db_current_time(db_session=db_session) - time_since_update = current_db_time - index_attempt.time_updated - if time_since_update.total_seconds() > 60 * 60 * timeout_hours: - existing_jobs[index_attempt.id].cancel() - _mark_run_failed( - db_session=db_session, - index_attempt=index_attempt, - failure_reason="Indexing run frozen - no updates in the last three hours. " - "The run will be re-attempted at next scheduled indexing time.", - ) - else: - # If job isn't known, simply mark it as failed - _mark_run_failed( - db_session=db_session, - index_attempt=index_attempt, - failure_reason=_UNEXPECTED_STATE_FAILURE_REASON, - ) - - return existing_jobs_copy - - -def kickoff_indexing_jobs( - existing_jobs: dict[int, Future | SimpleJob], - client: Client | SimpleJobClient, - secondary_client: Client | SimpleJobClient, -) -> dict[int, Future | SimpleJob]: - existing_jobs_copy = existing_jobs.copy() - engine = get_sqlalchemy_engine() - - # Don't include jobs waiting in the Dask queue that just haven't started running - # Also (rarely) don't include for jobs that started but haven't updated the indexing tables yet - with Session(engine) as db_session: - # get_not_started_index_attempts orders its returned results from oldest to newest - # we must process attempts in a FIFO manner to prevent connector starvation - new_indexing_attempts = [ - (attempt, attempt.search_settings) - for attempt in get_not_started_index_attempts(db_session) - if attempt.id not in existing_jobs - ] - - logger.debug(f"Found {len(new_indexing_attempts)} new indexing task(s).") - - if not new_indexing_attempts: - return existing_jobs - - indexing_attempt_count = 0 - - for attempt, search_settings in new_indexing_attempts: - use_secondary_index = ( - search_settings.status == IndexModelStatus.FUTURE - if search_settings is not None - else False - ) - if attempt.connector_credential_pair.connector is None: - logger.warning( - f"Skipping index attempt as Connector has been deleted: {attempt}" - ) - with Session(engine) as db_session: - mark_attempt_failed( - attempt, db_session, failure_reason="Connector is null" - ) - continue - if attempt.connector_credential_pair.credential is None: - logger.warning( - f"Skipping index attempt as Credential has been deleted: {attempt}" - ) - with Session(engine) as db_session: - mark_attempt_failed( - attempt, db_session, failure_reason="Credential is null" - ) - continue - - if use_secondary_index: - run = secondary_client.submit( - run_indexing_entrypoint, - attempt.id, - attempt.connector_credential_pair_id, - global_version.get_is_ee_version(), - pure=False, - ) - else: - run = client.submit( - run_indexing_entrypoint, - attempt.id, - attempt.connector_credential_pair_id, - global_version.get_is_ee_version(), - pure=False, - ) - - if run: - if indexing_attempt_count == 0: - logger.info( - f"Indexing dispatch starts: pending={len(new_indexing_attempts)}" - ) - - indexing_attempt_count += 1 - secondary_str = " (secondary index)" if use_secondary_index else "" - logger.info( - f"Indexing dispatched{secondary_str}: " - f"attempt_id={attempt.id} " - f"connector='{attempt.connector_credential_pair.connector.name}' " - f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' " - f"credentials='{attempt.connector_credential_pair.credential_id}'" - ) - existing_jobs_copy[attempt.id] = run - - if indexing_attempt_count > 0: - 
logger.info( - f"Indexing dispatch results: " - f"initial_pending={len(new_indexing_attempts)} " - f"started={indexing_attempt_count} " - f"remaining={len(new_indexing_attempts) - indexing_attempt_count}" - ) - - return existing_jobs_copy - - -def update_loop( - delay: int = 10, - num_workers: int = NUM_INDEXING_WORKERS, - num_secondary_workers: int = NUM_SECONDARY_INDEXING_WORKERS, -) -> None: - engine = get_sqlalchemy_engine() - with Session(engine) as db_session: - check_index_swap(db_session=db_session) - search_settings = get_current_search_settings(db_session) - - # So that the first time users aren't surprised by really slow speed of first - # batch of documents indexed - - if search_settings.provider_type is None: - logger.notice("Running a first inference to warm up embedding model") - embedding_model = EmbeddingModel.from_db_model( - search_settings=search_settings, - server_host=INDEXING_MODEL_SERVER_HOST, - server_port=MODEL_SERVER_PORT, - ) - - warm_up_bi_encoder( - embedding_model=embedding_model, - ) - - client_primary: Client | SimpleJobClient - client_secondary: Client | SimpleJobClient - if DASK_JOB_CLIENT_ENABLED: - cluster_primary = LocalCluster( - n_workers=num_workers, - threads_per_worker=1, - # there are warning about high memory usage + "Event loop unresponsive" - # which are not relevant to us since our workers are expected to use a - # lot of memory + involve CPU intensive tasks that will not relinquish - # the event loop - silence_logs=logging.ERROR, - ) - cluster_secondary = LocalCluster( - n_workers=num_secondary_workers, - threads_per_worker=1, - silence_logs=logging.ERROR, - ) - client_primary = Client(cluster_primary) - client_secondary = Client(cluster_secondary) - if LOG_LEVEL.lower() == "debug": - client_primary.register_worker_plugin(ResourceLogger()) - else: - client_primary = SimpleJobClient(n_workers=num_workers) - client_secondary = SimpleJobClient(n_workers=num_secondary_workers) - - existing_jobs: dict[int, Future | SimpleJob] = {} - - while True: - start = time.time() - start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S") - logger.debug(f"Running update, current UTC time: {start_time_utc}") - - if existing_jobs: - # TODO: make this debug level once the "no jobs are being scheduled" issue is resolved - logger.debug( - "Found existing indexing jobs: " - f"{[(attempt_id, job.status) for attempt_id, job in existing_jobs.items()]}" - ) - - try: - with Session(get_sqlalchemy_engine()) as db_session: - check_index_swap(db_session) - existing_jobs = cleanup_indexing_jobs(existing_jobs=existing_jobs) - create_indexing_jobs(existing_jobs=existing_jobs) - existing_jobs = kickoff_indexing_jobs( - existing_jobs=existing_jobs, - client=client_primary, - secondary_client=client_secondary, - ) - except Exception as e: - logger.exception(f"Failed to run update due to {e}") - sleep_time = delay - (time.time() - start) - if sleep_time > 0: - time.sleep(sleep_time) - - -def update__main() -> None: - set_is_ee_based_on_env_variable() - init_sqlalchemy_engine(POSTGRES_INDEXER_APP_NAME) - - logger.notice("Starting indexing service") - update_loop() - - -if __name__ == "__main__": - update__main() diff --git a/backend/danswer/chat/chat_utils.py b/backend/danswer/chat/chat_utils.py index b1e4132779b..5e42ae23f5a 100644 --- a/backend/danswer/chat/chat_utils.py +++ b/backend/danswer/chat/chat_utils.py @@ -1,14 +1,16 @@ import re from typing import cast +from uuid import UUID +from fastapi.datastructures import Headers from sqlalchemy.orm import 
Session from danswer.chat.models import CitationInfo from danswer.chat.models import LlmDoc +from danswer.context.search.models import InferenceSection from danswer.db.chat import get_chat_messages_by_session from danswer.db.models import ChatMessage from danswer.llm.answering.models import PreviousMessage -from danswer.search.models import InferenceSection from danswer.utils.logger import setup_logger logger = setup_logger() @@ -33,7 +35,7 @@ def llm_doc_from_inference_section(inference_section: InferenceSection) -> LlmDo def create_chat_chain( - chat_session_id: int, + chat_session_id: UUID, db_session: Session, prefetch_tool_calls: bool = True, # Optional id at which we finish processing @@ -166,3 +168,31 @@ def slack_link_format(match: re.Match) -> str: new_citation_info[citation.citation_num] = citation return new_answer, list(new_citation_info.values()) + + +def extract_headers( + headers: dict[str, str] | Headers, pass_through_headers: list[str] | None +) -> dict[str, str]: + """ + Extract headers specified in pass_through_headers from input headers. + Handles both dict and FastAPI Headers objects, accounting for lowercase keys. + + Args: + headers: Input headers as dict or Headers object. + + Returns: + dict: Filtered headers based on pass_through_headers. + """ + if not pass_through_headers: + return {} + + extracted_headers: dict[str, str] = {} + for key in pass_through_headers: + if key in headers: + extracted_headers[key] = headers[key] + else: + # fastapi makes all header keys lowercase, handling that here + lowercase_key = key.lower() + if lowercase_key in headers: + extracted_headers[lowercase_key] = headers[lowercase_key] + return extracted_headers diff --git a/backend/danswer/chat/load_yamls.py b/backend/danswer/chat/load_yamls.py deleted file mode 100644 index 1839b3a5f23..00000000000 --- a/backend/danswer/chat/load_yamls.py +++ /dev/null @@ -1,175 +0,0 @@ -import yaml -from sqlalchemy.orm import Session - -from danswer.configs.chat_configs import INPUT_PROMPT_YAML -from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT -from danswer.configs.chat_configs import PERSONAS_YAML -from danswer.configs.chat_configs import PROMPTS_YAML -from danswer.db.document_set import get_or_create_document_set_by_name -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.input_prompt import insert_input_prompt_if_not_exists -from danswer.db.models import DocumentSet as DocumentSetDBModel -from danswer.db.models import Persona -from danswer.db.models import Prompt as PromptDBModel -from danswer.db.models import Tool as ToolDBModel -from danswer.db.persona import get_prompt_by_name -from danswer.db.persona import upsert_persona -from danswer.db.persona import upsert_prompt -from danswer.search.enums import RecencyBiasSetting - - -def load_prompts_from_yaml(prompts_yaml: str = PROMPTS_YAML) -> None: - with open(prompts_yaml, "r") as file: - data = yaml.safe_load(file) - - all_prompts = data.get("prompts", []) - with Session(get_sqlalchemy_engine()) as db_session: - for prompt in all_prompts: - upsert_prompt( - user=None, - prompt_id=prompt.get("id"), - name=prompt["name"], - description=prompt["description"].strip(), - system_prompt=prompt["system"].strip(), - task_prompt=prompt["task"].strip(), - include_citations=prompt["include_citations"], - datetime_aware=prompt.get("datetime_aware", True), - default_prompt=True, - personas=None, - db_session=db_session, - commit=True, - ) - - -def load_personas_from_yaml( - personas_yaml: str = PERSONAS_YAML, - default_chunks: 
float = MAX_CHUNKS_FED_TO_CHAT, -) -> None: - with open(personas_yaml, "r") as file: - data = yaml.safe_load(file) - - all_personas = data.get("personas", []) - with Session(get_sqlalchemy_engine()) as db_session: - for persona in all_personas: - doc_set_names = persona["document_sets"] - doc_sets: list[DocumentSetDBModel] = [ - get_or_create_document_set_by_name(db_session, name) - for name in doc_set_names - ] - - # Assume if user hasn't set any document sets for the persona, the user may want - # to later attach document sets to the persona manually, therefore, don't overwrite/reset - # the document sets for the persona - doc_set_ids: list[int] | None = None - if doc_sets: - doc_set_ids = [doc_set.id for doc_set in doc_sets] - else: - doc_set_ids = None - - prompt_ids: list[int] | None = None - prompt_set_names = persona["prompts"] - if prompt_set_names: - prompts: list[PromptDBModel | None] = [ - get_prompt_by_name(prompt_name, user=None, db_session=db_session) - for prompt_name in prompt_set_names - ] - if any([prompt is None for prompt in prompts]): - raise ValueError("Invalid Persona configs, not all prompts exist") - - if prompts: - prompt_ids = [prompt.id for prompt in prompts if prompt is not None] - - p_id = persona.get("id") - tool_ids = [] - if persona.get("image_generation"): - image_gen_tool = ( - db_session.query(ToolDBModel) - .filter(ToolDBModel.name == "ImageGenerationTool") - .first() - ) - if image_gen_tool: - tool_ids.append(image_gen_tool.id) - - llm_model_provider_override = persona.get("llm_model_provider_override") - llm_model_version_override = persona.get("llm_model_version_override") - - # Set specific overrides for image generation persona - if persona.get("image_generation"): - llm_model_version_override = "gpt-4o" - - # Load Internet Search Tool. 
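As an aside on the extract_headers helper added to chat_utils.py earlier in this diff: it filters an incoming header mapping down to an allow-list, falling back to lowercase lookups because FastAPI lowercases header keys, and it returns the allow-list's original casing. A small usage sketch (the header names and values below are made up for illustration):

# Illustrative usage of the extract_headers helper added above in chat_utils.py.
from danswer.chat.chat_utils import extract_headers

incoming = {"authorization": "Bearer abc123", "x-trace-id": "trace-1", "accept": "*/*"}

# FastAPI lowercases header keys, so the mixed-case allow-list still matches,
# and the result keeps the allow-list's casing.
filtered = extract_headers(incoming, ["Authorization", "X-Trace-Id"])
assert filtered == {"Authorization": "Bearer abc123", "X-Trace-Id": "trace-1"}

# With no pass-through list configured, nothing is forwarded.
assert extract_headers(incoming, None) == {}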
- if persona.get("internet_search"): - internet_search_tool = ( - db_session.query(ToolDBModel) - .filter(ToolDBModel.name == "InternetSearchTool") - .first() - ) - if internet_search_tool: - tool_ids.append(internet_search_tool.id) - - existing_persona = ( - db_session.query(Persona) - .filter(Persona.name == persona["name"]) - .first() - ) - - upsert_persona( - user=None, - persona_id=(-1 * p_id) if p_id is not None else None, - name=persona["name"], - description=persona["description"], - num_chunks=persona.get("num_chunks") - if persona.get("num_chunks") is not None - else default_chunks, - llm_relevance_filter=persona.get("llm_relevance_filter"), - starter_messages=persona.get("starter_messages"), - llm_filter_extraction=persona.get("llm_filter_extraction"), - icon_shape=persona.get("icon_shape"), - icon_color=persona.get("icon_color"), - llm_model_provider_override=llm_model_provider_override, - llm_model_version_override=llm_model_version_override, - recency_bias=RecencyBiasSetting(persona["recency_bias"]), - prompt_ids=prompt_ids, - document_set_ids=doc_set_ids, - tool_ids=tool_ids, - default_persona=True, - is_public=True, - display_priority=existing_persona.display_priority - if existing_persona is not None - else persona.get("display_priority"), - is_visible=existing_persona.is_visible - if existing_persona is not None - else persona.get("is_visible"), - db_session=db_session, - ) - - -def load_input_prompts_from_yaml(input_prompts_yaml: str = INPUT_PROMPT_YAML) -> None: - with open(input_prompts_yaml, "r") as file: - data = yaml.safe_load(file) - - all_input_prompts = data.get("input_prompts", []) - with Session(get_sqlalchemy_engine()) as db_session: - for input_prompt in all_input_prompts: - # If these prompts are deleted (which is a hard delete in the DB), on server startup - # they will be recreated, but the user can always just deactivate them, just a light inconvenience - insert_input_prompt_if_not_exists( - user=None, - input_prompt_id=input_prompt.get("id"), - prompt=input_prompt["prompt"], - content=input_prompt["content"], - is_public=input_prompt["is_public"], - active=input_prompt.get("active", True), - db_session=db_session, - commit=True, - ) - - -def load_chat_yamls( - prompt_yaml: str = PROMPTS_YAML, - personas_yaml: str = PERSONAS_YAML, - input_prompts_yaml: str = INPUT_PROMPT_YAML, -) -> None: - load_prompts_from_yaml(prompt_yaml) - load_personas_from_yaml(personas_yaml) - load_input_prompts_from_yaml(input_prompts_yaml) diff --git a/backend/danswer/chat/models.py b/backend/danswer/chat/models.py index 97d5b9e7275..3852029c47b 100644 --- a/backend/danswer/chat/models.py +++ b/backend/danswer/chat/models.py @@ -6,11 +6,11 @@ from pydantic import BaseModel from danswer.configs.constants import DocumentSource -from danswer.search.enums import QueryFlow -from danswer.search.enums import SearchType -from danswer.search.models import RetrievalDocs -from danswer.search.models import SearchResponse -from danswer.tools.custom.base_tool_types import ToolResultType +from danswer.context.search.enums import QueryFlow +from danswer.context.search.enums import SearchType +from danswer.context.search.models import RetrievalDocs +from danswer.context.search.models import SearchResponse +from danswer.tools.tool_implementations.custom.base_tool_types import ToolResultType class LlmDoc(BaseModel): @@ -156,7 +156,7 @@ class QAResponse(SearchResponse, DanswerAnswer): error_msg: str | None = None -class ImageGenerationDisplay(BaseModel): +class FileChatDisplay(BaseModel): file_ids: 
list[str] @@ -170,7 +170,7 @@ class CustomToolResponse(BaseModel): | DanswerQuotes | CitationInfo | DanswerContexts - | ImageGenerationDisplay + | FileChatDisplay | CustomToolResponse | StreamingError | StreamStopInfo diff --git a/backend/danswer/chat/process_message.py b/backend/danswer/chat/process_message.py index f4bd1fd6db9..aba88b7d099 100644 --- a/backend/danswer/chat/process_message.py +++ b/backend/danswer/chat/process_message.py @@ -9,19 +9,28 @@ from danswer.chat.models import CitationInfo from danswer.chat.models import CustomToolResponse from danswer.chat.models import DanswerAnswerPiece +from danswer.chat.models import FileChatDisplay from danswer.chat.models import FinalUsedContextDocsResponse -from danswer.chat.models import ImageGenerationDisplay from danswer.chat.models import LLMRelevanceFilterResponse from danswer.chat.models import MessageResponseIDInfo from danswer.chat.models import MessageSpecificCitations from danswer.chat.models import QADocsResponse from danswer.chat.models import StreamingError -from danswer.configs.chat_configs import BING_API_KEY +from danswer.chat.models import StreamStopInfo from danswer.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE from danswer.configs.chat_configs import DISABLE_LLM_CHOOSE_SEARCH from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT from danswer.configs.constants import MessageType -from danswer.configs.model_configs import GEN_AI_TEMPERATURE +from danswer.context.search.enums import OptionalSearchSetting +from danswer.context.search.enums import QueryFlow +from danswer.context.search.enums import SearchType +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import RetrievalDetails +from danswer.context.search.retrieval.search_runner import inference_sections_from_ids +from danswer.context.search.utils import chunks_or_sections_to_search_docs +from danswer.context.search.utils import dedupe_documents +from danswer.context.search.utils import drop_llm_indices +from danswer.context.search.utils import relevant_sections_to_indices from danswer.db.chat import attach_files_to_chat_message from danswer.db.chat import create_db_search_doc from danswer.db.chat import create_new_chat_message @@ -34,7 +43,6 @@ from danswer.db.chat import translate_db_message_to_chat_message_detail from danswer.db.chat import translate_db_search_doc_to_server_search_doc from danswer.db.engine import get_session_context_manager -from danswer.db.llm import fetch_existing_llm_providers from danswer.db.models import SearchDoc as DbSearchDoc from danswer.db.models import ToolCall from danswer.db.models import User @@ -54,46 +62,35 @@ from danswer.llm.exceptions import GenAIDisabledException from danswer.llm.factory import get_llms_for_persona from danswer.llm.factory import get_main_llm_from_tuple -from danswer.llm.interfaces import LLMConfig from danswer.llm.utils import litellm_exception_to_error_msg from danswer.natural_language_processing.utils import get_tokenizer -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import OptionalSearchSetting -from danswer.search.enums import QueryFlow -from danswer.search.enums import SearchType -from danswer.search.models import InferenceSection -from danswer.search.retrieval.search_runner import inference_sections_from_ids -from danswer.search.utils import chunks_or_sections_to_search_docs -from danswer.search.utils import dedupe_documents -from danswer.search.utils import drop_llm_indices -from danswer.search.utils 
import relevant_sections_to_indices from danswer.server.query_and_chat.models import ChatMessageDetail from danswer.server.query_and_chat.models import CreateChatMessageRequest from danswer.server.utils import get_json_line -from danswer.tools.built_in_tools import get_built_in_tool_by_id -from danswer.tools.custom.custom_tool import build_custom_tools_from_openapi_schema -from danswer.tools.custom.custom_tool import CUSTOM_TOOL_RESPONSE_ID -from danswer.tools.custom.custom_tool import CustomToolCallSummary from danswer.tools.force import ForceUseTool -from danswer.tools.images.image_generation_tool import IMAGE_GENERATION_RESPONSE_ID -from danswer.tools.images.image_generation_tool import ImageGenerationResponse -from danswer.tools.images.image_generation_tool import ImageGenerationTool -from danswer.tools.internet_search.internet_search_tool import INTERNET_SEARCH_RESPONSE_ID -from danswer.tools.internet_search.internet_search_tool import internet_search_response_to_search_docs -from danswer.tools.internet_search.internet_search_tool import InternetSearchResponse -from danswer.tools.internet_search.internet_search_tool import InternetSearchTool -from danswer.tools.models import DynamicSchemaInfo -from danswer.tools.search.search_tool import FINAL_CONTEXT_DOCUMENTS_ID -from danswer.tools.search.search_tool import SEARCH_RESPONSE_SUMMARY_ID -from danswer.tools.search.search_tool import SearchResponseSummary -from danswer.tools.search.search_tool import SearchTool -from danswer.tools.search.search_tool import SECTION_RELEVANCE_LIST_ID +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse +from danswer.tools.tool_constructor import construct_tools +from danswer.tools.tool_constructor import CustomToolConfig +from danswer.tools.tool_constructor import ImageGenerationToolConfig +from danswer.tools.tool_constructor import InternetSearchToolConfig +from danswer.tools.tool_constructor import SearchToolConfig +from danswer.tools.tool_implementations.custom.custom_tool import CUSTOM_TOOL_RESPONSE_ID +from danswer.tools.tool_implementations.custom.custom_tool import CustomToolCallSummary +from danswer.tools.tool_implementations.images.image_generation_tool import IMAGE_GENERATION_RESPONSE_ID +from danswer.tools.tool_implementations.images.image_generation_tool import ImageGenerationResponse +from danswer.tools.tool_implementations.internet_search.internet_search_tool import INTERNET_SEARCH_RESPONSE_ID +from danswer.tools.tool_implementations.internet_search.internet_search_tool import internet_search_response_to_search_docs +from danswer.tools.tool_implementations.internet_search.internet_search_tool import InternetSearchResponse +from danswer.tools.tool_implementations.internet_search.internet_search_tool import InternetSearchTool +from danswer.tools.tool_implementations.search.search_tool import FINAL_CONTEXT_DOCUMENTS_ID +from danswer.tools.tool_implementations.search.search_tool import SEARCH_RESPONSE_SUMMARY_ID +from danswer.tools.tool_implementations.search.search_tool import SearchResponseSummary +from danswer.tools.tool_implementations.search.search_tool import SearchTool +from danswer.tools.tool_implementations.search.search_tool import SECTION_RELEVANCE_LIST_ID from danswer.tools.tool_runner import ToolCallFinalResult -from danswer.tools.utils import compute_all_tool_tokens -from danswer.tools.utils import explicit_tool_calling_supported from danswer.utils.logger import setup_logger +from danswer.utils.long_term_log 
import LongTermLogger from danswer.utils.timing import log_generator_function_time from sqlalchemy.orm import Session @@ -244,10 +241,11 @@ def _get_force_search_settings( | DanswerAnswerPiece | AllCitations | CitationInfo - | ImageGenerationDisplay + | FileChatDisplay | CustomToolResponse | MessageSpecificCitations | MessageResponseIDInfo + | StreamStopInfo ) ChatPacketStream = Iterator[ChatPacket] @@ -263,9 +261,10 @@ def stream_chat_message_objects( max_document_percentage: float = CHAT_TARGET_CHUNK_PERCENTAGE, # if specified, uses the last user message and does not create a new user message based # on the `new_msg_req.message`. Currently, requires a state where the last message is a - use_existing_user_message: bool = False, litellm_additional_headers: dict[str, str] | None = None, + custom_tool_additional_headers: dict[str, str] | None = None, is_connected: Callable[[], bool] | None = None, + enforce_chat_session_id_for_search_docs: bool = True, ) -> ChatPacketStream: """Streams in order: 1. [conditional] Retrieved documents if a search needs to be run @@ -273,6 +272,9 @@ def stream_chat_message_objects( 3. [always] A set of streamed LLM tokens or an error anywhere along the line if something fails 4. [always] Details on the final AI response message that is created """ + use_existing_user_message = new_msg_req.use_existing_user_message + existing_assistant_message_id = new_msg_req.existing_assistant_message_id + # Currently surrounding context is not supported for chat # Chat is already token heavy and harder for the model to process plus it would roll history over much faster new_msg_req.chunks_above = 0 @@ -296,6 +298,11 @@ def stream_chat_message_objects( retrieval_options = new_msg_req.retrieval_options alternate_assistant_id = new_msg_req.alternate_assistant_id + # permanent "log" store, used primarily for debugging + long_term_logger = LongTermLogger( + metadata={"user_id": str(user_id), "chat_session_id": str(chat_session_id)} + ) + # use alternate persona if alternative assistant id is passed in if alternate_assistant_id is not None: persona = get_persona_by_id( @@ -321,6 +328,7 @@ def stream_chat_message_objects( persona=persona, llm_override=new_msg_req.llm_override or chat_session.llm_override, additional_headers=litellm_additional_headers, + long_term_logger=long_term_logger, ) except GenAIDisabledException: raise RuntimeError("LLM is disabled. Can't use chat flow without LLM.") @@ -396,12 +404,20 @@ def stream_chat_message_objects( final_msg, history_msgs = create_chat_chain( chat_session_id=chat_session_id, db_session=db_session ) - if final_msg.message_type != MessageType.USER: - raise RuntimeError( - "The last message was not a user message. Cannot call " - "`stream_chat_message_objects` with `is_regenerate=True` " - "when the last message is not a user message." - ) + if existing_assistant_message_id is None: + if final_msg.message_type != MessageType.USER: + raise RuntimeError( + "The last message was not a user message. Cannot call " + "`stream_chat_message_objects` with `is_regenerate=True` " + "when the last message is not a user message." + ) + else: + if final_msg.id != existing_assistant_message_id: + raise RuntimeError( + "The last message was not the existing assistant message. 
" + f"Final message id: {final_msg.id}, " + f"existing assistant message id: {existing_assistant_message_id}" + ) # Disable Query Rephrasing for the first message # This leads to a better first response since the LLM rephrasing the question @@ -439,6 +455,7 @@ def stream_chat_message_objects( chat_session=chat_session, user_id=user_id, db_session=db_session, + enforce_chat_session_id_for_search_docs=enforce_chat_session_id_for_search_docs, ) # Generates full documents currently @@ -471,13 +488,19 @@ def stream_chat_message_objects( ), max_window_percentage=max_document_percentage, ) - reserved_message_id = reserve_message_id( - db_session=db_session, - chat_session_id=chat_session_id, - parent_message=user_message.id - if user_message is not None - else parent_message.id, - message_type=MessageType.ASSISTANT, + + # we don't need to reserve a message id if we're using an existing assistant message + reserved_message_id = ( + final_msg.id + if existing_assistant_message_id is not None + else reserve_message_id( + db_session=db_session, + chat_session_id=chat_session_id, + parent_message=user_message.id + if user_message is not None + else parent_message.id, + message_type=MessageType.ASSISTANT, + ) ) yield MessageResponseIDInfo( user_message_id=user_message.id if user_message else None, @@ -492,7 +515,13 @@ def stream_chat_message_objects( partial_response = partial( create_new_chat_message, chat_session_id=chat_session_id, - parent_message=final_msg, + # if we're using an existing assistant message, then this will just be an + # update operation, in which case the parent should be the parent of + # the latest. If we're creating a new assistant message, then the parent + # should be the latest message (latest user message) + parent_message=( + final_msg if existing_assistant_message_id is None else parent_message + ), prompt_id=prompt_id, overridden_model=overridden_model, # message=, @@ -504,6 +533,7 @@ def stream_chat_message_objects( # reference_docs=, db_session=db_session, commit=False, + reserved_message_id=reserved_message_id, ) if not final_msg.prompt: @@ -519,122 +549,54 @@ def stream_chat_message_objects( if not persona else PromptConfig.from_model(persona.prompts[0]) ) + answer_style_config = AnswerStyleConfig( + citation_config=CitationConfig( + all_docs_useful=selected_db_search_docs is not None + ), + document_pruning_config=document_pruning_config, + structured_response_format=new_msg_req.structured_response_format, + ) - # find out what tools to use - search_tool: SearchTool | None = None - tool_dict: dict[int, list[Tool]] = {} # tool_id to tool - for db_tool_model in persona.tools: - # handle in-code tools specially - if db_tool_model.in_code_tool_id: - tool_cls = get_built_in_tool_by_id(db_tool_model.id, db_session) - if tool_cls.__name__ == SearchTool.__name__ and not latest_query_files: - search_tool = SearchTool( - db_session=db_session, - user=user, - persona=persona, - retrieval_options=retrieval_options, - prompt_config=prompt_config, - llm=llm, - fast_llm=fast_llm, - pruning_config=document_pruning_config, - selected_sections=selected_sections, - chunks_above=new_msg_req.chunks_above, - chunks_below=new_msg_req.chunks_below, - full_doc=new_msg_req.full_doc, - evaluation_type=LLMEvaluationType.BASIC - if persona.llm_relevance_filter - else LLMEvaluationType.SKIP, - ) - tool_dict[db_tool_model.id] = [search_tool] - elif tool_cls.__name__ == ImageGenerationTool.__name__: - img_generation_llm_config: LLMConfig | None = None - if ( - llm - and llm.config.api_key - and 
llm.config.model_provider == "openai" - ): - img_generation_llm_config = llm.config - else: - llm_providers = fetch_existing_llm_providers(db_session) - openai_provider = next( - iter( - [ - llm_provider - for llm_provider in llm_providers - if llm_provider.provider == "openai" - ] - ), - None, - ) - if not openai_provider or not openai_provider.api_key: - raise ValueError( - "Image generation tool requires an OpenAI API key" - ) - img_generation_llm_config = LLMConfig( - model_provider=openai_provider.provider, - model_name=openai_provider.default_model_name, - temperature=GEN_AI_TEMPERATURE, - api_key=openai_provider.api_key, - api_base=openai_provider.api_base, - api_version=openai_provider.api_version, - ) - tool_dict[db_tool_model.id] = [ - ImageGenerationTool( - api_key=cast(str, img_generation_llm_config.api_key), - api_base=img_generation_llm_config.api_base, - api_version=img_generation_llm_config.api_version, - additional_headers=litellm_additional_headers, - ) - ] - elif tool_cls.__name__ == InternetSearchTool.__name__: - bing_api_key = BING_API_KEY - if not bing_api_key: - raise ValueError( - "Internet search tool requires a Bing API key, please contact your Danswer admin to get it added!" - ) - tool_dict[db_tool_model.id] = [ - InternetSearchTool(api_key=bing_api_key) - ] - - continue - - # handle all custom tools - if db_tool_model.openapi_schema: - tool_dict[db_tool_model.id] = cast( - list[Tool], - build_custom_tools_from_openapi_schema( - db_tool_model.openapi_schema, - dynamic_schema_info=DynamicSchemaInfo( - chat_session_id=chat_session_id, - message_id=user_message.id if user_message else None, - ), - ), - ) - + tool_dict = construct_tools( + persona=persona, + prompt_config=prompt_config, + db_session=db_session, + user=user, + llm=llm, + fast_llm=fast_llm, + search_tool_config=SearchToolConfig( + answer_style_config=answer_style_config, + document_pruning_config=document_pruning_config, + retrieval_options=retrieval_options or RetrievalDetails(), + selected_sections=selected_sections, + chunks_above=new_msg_req.chunks_above, + chunks_below=new_msg_req.chunks_below, + full_doc=new_msg_req.full_doc, + latest_query_files=latest_query_files, + ), + internet_search_tool_config=InternetSearchToolConfig( + answer_style_config=answer_style_config, + ), + image_generation_tool_config=ImageGenerationToolConfig( + additional_headers=litellm_additional_headers, + ), + custom_tool_config=CustomToolConfig( + chat_session_id=chat_session_id, + message_id=user_message.id if user_message else None, + additional_headers=custom_tool_additional_headers, + ), + ) tools: list[Tool] = [] for tool_list in tool_dict.values(): tools.extend(tool_list) - # factor in tool definition size when pruning - document_pruning_config.tool_num_tokens = compute_all_tool_tokens( - tools, llm_tokenizer - ) - document_pruning_config.using_tool_message = explicit_tool_calling_supported( - llm_provider, llm_model_name - ) - # LLM prompt building, response capturing, etc. 
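Editor's note on the refactor above: the per-tool branching that previously lived inline in stream_chat_message_objects is now delegated to construct_tools, which takes one small config object per tool family and returns a mapping from tool id to instantiated tools. The snippet below is a self-contained, simplified illustration of that pattern only; the Demo* names are stand-ins and do not reflect the actual contents of danswer/tools/tool_constructor.py.

from dataclasses import dataclass


@dataclass
class DemoDBTool:
    """Stand-in for a persona's tool row: id, built-in marker, optional OpenAPI schema."""
    id: int
    in_code_tool_id: str | None = None
    openapi_schema: dict | None = None


@dataclass
class DemoSearchToolConfig:
    """Stand-in for SearchToolConfig-style settings."""
    chunks_above: int = 0
    chunks_below: int = 0


def demo_construct_tools(
    db_tools: list[DemoDBTool], search_config: DemoSearchToolConfig
) -> dict[int, list[str]]:
    """Return a tool_id -> tools mapping, mirroring the tool_dict built above."""
    tool_dict: dict[int, list[str]] = {}
    for db_tool in db_tools:
        if db_tool.in_code_tool_id == "search":
            # built-in tool: hydrated from its config object rather than ad-hoc kwargs
            tool_dict[db_tool.id] = [f"SearchTool(chunks_above={search_config.chunks_above})"]
        elif db_tool.openapi_schema is not None:
            # custom tool: expanded from the stored OpenAPI schema
            tool_dict[db_tool.id] = ["CustomTool(built from OpenAPI schema)"]
    return tool_dict


print(demo_construct_tools([DemoDBTool(1, "search"), DemoDBTool(2, None, {})], DemoSearchToolConfig(1)))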
answer = Answer( user_email=user_email, is_connected=is_connected, question=final_msg.message, latest_query_files=latest_query_files, - answer_style_config=AnswerStyleConfig( - citation_config=CitationConfig( - all_docs_useful=selected_db_search_docs is not None - ), - document_pruning_config=document_pruning_config, - ), + answer_style_config=answer_style_config, prompt_config=prompt_config, llm=( llm @@ -702,11 +664,11 @@ def stream_chat_message_objects( yield LLMRelevanceFilterResponse( llm_selected_doc_indices=llm_indices ) - elif packet.id == FINAL_CONTEXT_DOCUMENTS_ID: yield FinalUsedContextDocsResponse( final_context_docs=packet.response ) + elif packet.id == IMAGE_GENERATION_RESPONSE_ID: img_generation_response = cast( list[ImageGenerationResponse], packet.response @@ -719,7 +681,7 @@ def stream_chat_message_objects( FileDescriptor(id=str(file_id), type=ChatFileType.IMAGE) for file_id in file_ids ] - yield ImageGenerationDisplay( + yield FileChatDisplay( file_ids=[str(file_id) for file_id in file_ids] ) elif packet.id == INTERNET_SEARCH_RESPONSE_ID: @@ -733,11 +695,32 @@ def stream_chat_message_objects( yield qa_docs_response elif packet.id == CUSTOM_TOOL_RESPONSE_ID: custom_tool_response = cast(CustomToolCallSummary, packet.response) - yield CustomToolResponse( - response=custom_tool_response.tool_result, - tool_name=custom_tool_response.tool_name, - ) + if ( + custom_tool_response.response_type == "image" + or custom_tool_response.response_type == "csv" + ): + file_ids = custom_tool_response.tool_result.file_ids + ai_message_files = [ + FileDescriptor( + id=str(file_id), + type=ChatFileType.IMAGE + if custom_tool_response.response_type == "image" + else ChatFileType.CSV, + ) + for file_id in file_ids + ] + yield FileChatDisplay( + file_ids=[str(file_id) for file_id in file_ids] + ) + else: + yield CustomToolResponse( + response=custom_tool_response.tool_result, + tool_name=custom_tool_response.tool_name, + ) + + elif isinstance(packet, StreamStopInfo): + pass else: if isinstance(packet, ToolCallFinalResult): tool_result = packet @@ -767,6 +750,7 @@ def stream_chat_message_objects( # Post-LLM answer processing try: + logger.debug("Post-LLM answer processing") message_specific_citations: MessageSpecificCitations | None = None if reference_db_search_docs: message_specific_citations = _translate_citations( @@ -782,7 +766,6 @@ def stream_chat_message_objects( tool_name_to_tool_id[tool.name] = tool_id gen_ai_response_message = partial_response( - reserved_message_id=reserved_message_id, message=answer.llm_answer, rephrased_query=( qa_docs_response.rephrased_query if qa_docs_response else None @@ -790,21 +773,21 @@ def stream_chat_message_objects( reference_docs=reference_db_search_docs, files=ai_message_files, token_count=len(llm_tokenizer_encode_func(answer.llm_answer)), - citations=message_specific_citations.citation_map - if message_specific_citations - else None, + citations=( + message_specific_citations.citation_map + if message_specific_citations + else None + ), error=None, - tool_calls=( - [ - ToolCall( - tool_id=tool_name_to_tool_id[tool_result.tool_name], - tool_name=tool_result.tool_name, - tool_arguments=tool_result.tool_args, - tool_result=tool_result.tool_result, - ) - ] + tool_call=( + ToolCall( + tool_id=tool_name_to_tool_id[tool_result.tool_name], + tool_name=tool_result.tool_name, + tool_arguments=tool_result.tool_args, + tool_result=tool_result.tool_result, + ) if tool_result - else [] + else None ), ) @@ -828,8 +811,8 @@ def stream_chat_message_objects( def 
stream_chat_message( new_msg_req: CreateChatMessageRequest, user: User | None, - use_existing_user_message: bool = False, litellm_additional_headers: dict[str, str] | None = None, + custom_tool_additional_headers: dict[str, str] | None = None, is_connected: Callable[[], bool] | None = None, ) -> Iterator[str]: with get_session_context_manager() as db_session: @@ -837,8 +820,8 @@ def stream_chat_message( new_msg_req=new_msg_req, user=user, db_session=db_session, - use_existing_user_message=use_existing_user_message, litellm_additional_headers=litellm_additional_headers, + custom_tool_additional_headers=custom_tool_additional_headers, is_connected=is_connected, ) for obj in objects: diff --git a/backend/danswer/chat/tools.py b/backend/danswer/chat/tools.py deleted file mode 100644 index 11b40592973..00000000000 --- a/backend/danswer/chat/tools.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing_extensions import TypedDict # noreorder - -from pydantic import BaseModel - -from danswer.prompts.chat_tools import DANSWER_TOOL_DESCRIPTION -from danswer.prompts.chat_tools import DANSWER_TOOL_NAME -from danswer.prompts.chat_tools import TOOL_FOLLOWUP -from danswer.prompts.chat_tools import TOOL_LESS_FOLLOWUP -from danswer.prompts.chat_tools import TOOL_LESS_PROMPT -from danswer.prompts.chat_tools import TOOL_TEMPLATE -from danswer.prompts.chat_tools import USER_INPUT - - -class ToolInfo(TypedDict): - name: str - description: str - - -class DanswerChatModelOut(BaseModel): - model_raw: str - action: str - action_input: str - - -def call_tool( - model_actions: DanswerChatModelOut, -) -> str: - raise NotImplementedError("There are no additional tool integrations right now") - - -def form_user_prompt_text( - query: str, - tool_text: str | None, - hint_text: str | None, - user_input_prompt: str = USER_INPUT, - tool_less_prompt: str = TOOL_LESS_PROMPT, -) -> str: - user_prompt = tool_text or tool_less_prompt - - user_prompt += user_input_prompt.format(user_input=query) - - if hint_text: - if user_prompt[-1] != "\n": - user_prompt += "\n" - user_prompt += "\nHint: " + hint_text - - return user_prompt.strip() - - -def form_tool_section_text( - tools: list[ToolInfo] | None, retrieval_enabled: bool, template: str = TOOL_TEMPLATE -) -> str | None: - if not tools and not retrieval_enabled: - return None - - if retrieval_enabled and tools: - tools.append( - {"name": DANSWER_TOOL_NAME, "description": DANSWER_TOOL_DESCRIPTION} - ) - - tools_intro = [] - if tools: - num_tools = len(tools) - for tool in tools: - description_formatted = tool["description"].replace("\n", " ") - tools_intro.append(f"> {tool['name']}: {description_formatted}") - - prefix = "Must be one of " if num_tools > 1 else "Must be " - - tools_intro_text = "\n".join(tools_intro) - tool_names_text = prefix + ", ".join([tool["name"] for tool in tools]) - - else: - return None - - return template.format( - tool_overviews=tools_intro_text, tool_names=tool_names_text - ).strip() - - -def form_tool_followup_text( - tool_output: str, - query: str, - hint_text: str | None, - tool_followup_prompt: str = TOOL_FOLLOWUP, - ignore_hint: bool = False, -) -> str: - # If multi-line query, it likely confuses the model more than helps - if "\n" not in query: - optional_reminder = f"\nAs a reminder, my query was: {query}\n" - else: - optional_reminder = "" - - if not ignore_hint and hint_text: - hint_text_spaced = f"\nHint: {hint_text}\n" - else: - hint_text_spaced = "" - - return tool_followup_prompt.format( - tool_output=tool_output, - 
optional_reminder=optional_reminder, - hint=hint_text_spaced, - ).strip() - - -def form_tool_less_followup_text( - tool_output: str, - query: str, - hint_text: str | None, - tool_followup_prompt: str = TOOL_LESS_FOLLOWUP, -) -> str: - hint = f"Hint: {hint_text}" if hint_text else "" - return tool_followup_prompt.format( - context_str=tool_output, user_query=query, hint_text=hint - ).strip() diff --git a/backend/danswer/configs/app_configs.py b/backend/danswer/configs/app_configs.py index 32fee1714bc..d5dac5e6e3b 100644 --- a/backend/danswer/configs/app_configs.py +++ b/backend/danswer/configs/app_configs.py @@ -43,6 +43,9 @@ AUTH_TYPE = AuthType((os.environ.get("AUTH_TYPE") or AuthType.DISABLED.value).lower()) DISABLE_AUTH = AUTH_TYPE == AuthType.DISABLED +# Necessary for cloud integration tests +DISABLE_VERIFICATION = os.environ.get("DISABLE_VERIFICATION", "").lower() == "true" + # Encryption key secret is used to encrypt connector credentials, api keys, and other sensitive # information. This provides an extra layer of security on top of Postgres access controls # and is available in Danswer EE @@ -53,7 +56,6 @@ os.environ.get("MASK_CREDENTIAL_PREFIX", "True").lower() != "false" ) - SESSION_EXPIRE_TIME_SECONDS = int( os.environ.get("SESSION_EXPIRE_TIME_SECONDS") or 86400 * 7 ) # 7 days @@ -122,17 +124,22 @@ VESPA_CONFIG_SERVER_HOST = os.environ.get("VESPA_CONFIG_SERVER_HOST") or VESPA_HOST VESPA_PORT = os.environ.get("VESPA_PORT") or "8081" VESPA_TENANT_PORT = os.environ.get("VESPA_TENANT_PORT") or "19071" + +VESPA_CLOUD_URL = os.environ.get("VESPA_CLOUD_URL", "") + # The default below is for dockerized deployment VESPA_DEPLOYMENT_ZIP = ( os.environ.get("VESPA_DEPLOYMENT_ZIP") or "/app/danswer/vespa-app.zip" ) +VESPA_CLOUD_CERT_PATH = os.environ.get("VESPA_CLOUD_CERT_PATH") +VESPA_CLOUD_KEY_PATH = os.environ.get("VESPA_CLOUD_KEY_PATH") + # Number of documents in a batch during indexing (further batching done by chunks before passing to bi-encoder) try: INDEX_BATCH_SIZE = int(os.environ.get("INDEX_BATCH_SIZE", 16)) except ValueError: INDEX_BATCH_SIZE = 16 - # Below are intended to match the env variables names used by the official postgres docker image # https://hub.docker.com/_/postgres POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres" @@ -144,6 +151,12 @@ POSTGRES_PORT = os.environ.get("POSTGRES_PORT") or "5432" POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres" +POSTGRES_API_SERVER_POOL_SIZE = int( + os.environ.get("POSTGRES_API_SERVER_POOL_SIZE") or 40 +) +POSTGRES_API_SERVER_POOL_OVERFLOW = int( + os.environ.get("POSTGRES_API_SERVER_POOL_OVERFLOW") or 10 +) # defaults to False POSTGRES_POOL_PRE_PING = os.environ.get("POSTGRES_POOL_PRE_PING", "").lower() == "true" @@ -156,6 +169,17 @@ except ValueError: POSTGRES_POOL_RECYCLE = POSTGRES_POOL_RECYCLE_DEFAULT +# Experimental setting to control idle transactions +POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT = 0 # milliseconds +try: + POSTGRES_IDLE_SESSIONS_TIMEOUT = int( + os.environ.get( + "POSTGRES_IDLE_SESSIONS_TIMEOUT", POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT + ) + ) +except ValueError: + POSTGRES_IDLE_SESSIONS_TIMEOUT = POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT + REDIS_SSL = os.getenv("REDIS_SSL", "").lower() == "true" REDIS_HOST = os.environ.get("REDIS_HOST") or "localhost" REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) @@ -170,13 +194,64 @@ ) REDIS_DB_NUMBER_CELERY = int(os.environ.get("REDIS_DB_NUMBER_CELERY", 15)) # broker +# will propagate to both our redis client as well as celery's redis client 
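Editor's note on the new POSTGRES_IDLE_SESSIONS_TIMEOUT knob above: Postgres itself exposes idle_in_transaction_session_timeout (in milliseconds), so one plausible way a value like this gets applied is a per-connection SET issued when the pool hands out connections. The sketch below illustrates that idea with plain SQLAlchemy under that assumption; the DSN is a placeholder and the wiring is not Danswer's actual engine setup.

from sqlalchemy import create_engine, event

# Mirrors the env-derived setting above; 0 (the default) leaves the timeout disabled.
POSTGRES_IDLE_SESSIONS_TIMEOUT = 10_000  # milliseconds

# Placeholder DSN for illustration only.
engine = create_engine("postgresql+psycopg2://postgres:password@localhost:5432/postgres")


@event.listens_for(engine, "connect")
def set_idle_in_transaction_timeout(dbapi_connection, connection_record):
    """Apply the idle-in-transaction timeout on every new connection."""
    if POSTGRES_IDLE_SESSIONS_TIMEOUT > 0:
        cursor = dbapi_connection.cursor()
        cursor.execute(
            f"SET idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
        )
        cursor.close()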
+REDIS_HEALTH_CHECK_INTERVAL = int(os.environ.get("REDIS_HEALTH_CHECK_INTERVAL", 60)) + +# our redis client only, not celery's +REDIS_POOL_MAX_CONNECTIONS = int(os.environ.get("REDIS_POOL_MAX_CONNECTIONS", 128)) + # https://docs.celeryq.dev/en/stable/userguide/configuration.html#redis-backend-settings # should be one of "required", "optional", or "none" REDIS_SSL_CERT_REQS = os.getenv("REDIS_SSL_CERT_REQS", "none") -REDIS_SSL_CA_CERTS = os.getenv("REDIS_SSL_CA_CERTS", "") +REDIS_SSL_CA_CERTS = os.getenv("REDIS_SSL_CA_CERTS", None) CELERY_RESULT_EXPIRES = int(os.environ.get("CELERY_RESULT_EXPIRES", 86400)) # seconds +# https://docs.celeryq.dev/en/stable/userguide/configuration.html#broker-pool-limit +# Setting to None may help when there is a proxy in the way closing idle connections +CELERY_BROKER_POOL_LIMIT_DEFAULT = 10 +try: + CELERY_BROKER_POOL_LIMIT = int( + os.environ.get("CELERY_BROKER_POOL_LIMIT", CELERY_BROKER_POOL_LIMIT_DEFAULT) + ) +except ValueError: + CELERY_BROKER_POOL_LIMIT = CELERY_BROKER_POOL_LIMIT_DEFAULT + +CELERY_WORKER_LIGHT_CONCURRENCY_DEFAULT = 24 +try: + CELERY_WORKER_LIGHT_CONCURRENCY = int( + os.environ.get( + "CELERY_WORKER_LIGHT_CONCURRENCY", CELERY_WORKER_LIGHT_CONCURRENCY_DEFAULT + ) + ) +except ValueError: + CELERY_WORKER_LIGHT_CONCURRENCY = CELERY_WORKER_LIGHT_CONCURRENCY_DEFAULT + +CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER_DEFAULT = 8 +try: + CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER = int( + os.environ.get( + "CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER", + CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER_DEFAULT, + ) + ) +except ValueError: + CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER = ( + CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER_DEFAULT + ) + +CELERY_WORKER_INDEXING_CONCURRENCY_DEFAULT = 3 +try: + env_value = os.environ.get("CELERY_WORKER_INDEXING_CONCURRENCY") + if not env_value: + env_value = os.environ.get("NUM_INDEXING_WORKERS") + + if not env_value: + env_value = str(CELERY_WORKER_INDEXING_CONCURRENCY_DEFAULT) + CELERY_WORKER_INDEXING_CONCURRENCY = int(env_value) +except ValueError: + CELERY_WORKER_INDEXING_CONCURRENCY = CELERY_WORKER_INDEXING_CONCURRENCY_DEFAULT + ##### # Connector Configs ##### @@ -193,9 +268,6 @@ # for some connectors ENABLE_EXPENSIVE_EXPERT_CALLS = False -GOOGLE_DRIVE_INCLUDE_SHARED = False -GOOGLE_DRIVE_FOLLOW_SHORTCUTS = False -GOOGLE_DRIVE_ONLY_ORG_PUBLIC = False # TODO these should be available for frontend configuration, via advanced options expandable WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get( @@ -232,12 +304,6 @@ os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES", "").lower() == "true" ) -# Save pages labels as Danswer metadata tags -# The reason to skip this would be to reduce the number of calls to Confluence due to rate limit concerns -CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = ( - os.environ.get("CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING", "").lower() == "true" -) - # Attachments exceeding this size will not be retrieved (in bytes) CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = int( os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD", 10 * 1024 * 1024) @@ -253,6 +319,10 @@ for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",") if ignored_tag ] +# Maximum size for Jira tickets in bytes (default: 100KB) +JIRA_CONNECTOR_MAX_TICKET_SIZE = int( + os.environ.get("JIRA_CONNECTOR_MAX_TICKET_SIZE", 100 * 1024) +) GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME") @@ -276,7 +346,7 @@ os.environ.get("ALLOW_SIMULTANEOUS_PRUNING", "").lower() == "true" ) -# This is the maxiumum 
rate at which documents are queried for a pruning job. 0 disables the limitation. +# This is the maximum rate at which documents are queried for a pruning job. 0 disables the limitation. MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE = int( os.environ.get("MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE", 0) ) @@ -340,12 +410,10 @@ # exception without aborting the attempt. INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT", 0)) + ##### # Miscellaneous ##### -# File based Key Value store no longer used -DYNAMIC_CONFIG_STORE = "PostgresBackedDynamicConfigStore" - JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default # used to allow the background indexing jobs to use a different embedding # model server than the API server @@ -360,6 +428,9 @@ LOG_DANSWER_MODEL_INTERACTIONS = ( os.environ.get("LOG_DANSWER_MODEL_INTERACTIONS", "").lower() == "true" ) +LOG_INDIVIDUAL_MODEL_TOKENS = ( + os.environ.get("LOG_INDIVIDUAL_MODEL_TOKENS", "").lower() == "true" +) # If set to `true` will enable additional logs about Vespa query performance # (time spent on finding the right docs + time spent fetching summaries from disk) LOG_VESPA_TIMING_INFORMATION = ( @@ -383,6 +454,11 @@ os.environ.get("CUSTOM_ANSWER_VALIDITY_CONDITIONS", "[]") ) +VESPA_REQUEST_TIMEOUT = int(os.environ.get("VESPA_REQUEST_TIMEOUT") or "15") + +SYSTEM_RECURSION_LIMIT = int(os.environ.get("SYSTEM_RECURSION_LIMIT") or "1000") + +PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true" ##### # Enterprise Edition Configs @@ -394,3 +470,49 @@ ENTERPRISE_EDITION_ENABLED = ( os.environ.get("ENABLE_PAID_ENTERPRISE_EDITION_FEATURES", "").lower() == "true" ) + +# Azure DALL-E Configurations +AZURE_DALLE_API_VERSION = os.environ.get("AZURE_DALLE_API_VERSION") +AZURE_DALLE_API_KEY = os.environ.get("AZURE_DALLE_API_KEY") +AZURE_DALLE_API_BASE = os.environ.get("AZURE_DALLE_API_BASE") +AZURE_DALLE_DEPLOYMENT_NAME = os.environ.get("AZURE_DALLE_DEPLOYMENT_NAME") + + +# Use managed Vespa (Vespa Cloud). 
If set, must also set VESPA_CLOUD_URL, VESPA_CLOUD_CERT_PATH and VESPA_CLOUD_KEY_PATH +MANAGED_VESPA = os.environ.get("MANAGED_VESPA", "").lower() == "true" + +ENABLE_EMAIL_INVITES = os.environ.get("ENABLE_EMAIL_INVITES", "").lower() == "true" + +# Security and authentication +DATA_PLANE_SECRET = os.environ.get( + "DATA_PLANE_SECRET", "" +) # Used for secure communication between the control and data plane +EXPECTED_API_KEY = os.environ.get( + "EXPECTED_API_KEY", "" +) # Additional security check for the control plane API + +# API configuration +CONTROL_PLANE_API_BASE_URL = os.environ.get( + "CONTROL_PLANE_API_BASE_URL", "http://localhost:8082" +) + +# JWT configuration +JWT_ALGORITHM = "HS256" + +# Super Users +SUPER_USERS = json.loads(os.environ.get("SUPER_USERS", '["pablo@danswer.ai"]')) +SUPER_CLOUD_API_KEY = os.environ.get("SUPER_CLOUD_API_KEY", "api_key") + + +##### +# API Key Configs +##### +# refers to the rounds described here: https://passlib.readthedocs.io/en/stable/lib/passlib.hash.sha256_crypt.html +_API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS") +API_KEY_HASH_ROUNDS = ( + int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None +) + + +POD_NAME = os.environ.get("POD_NAME") +POD_NAMESPACE = os.environ.get("POD_NAMESPACE") diff --git a/backend/danswer/configs/chat_configs.py b/backend/danswer/configs/chat_configs.py index e67e4258fec..2d72bed0f5a 100644 --- a/backend/danswer/configs/chat_configs.py +++ b/backend/danswer/configs/chat_configs.py @@ -1,9 +1,9 @@ import os -PROMPTS_YAML = "./danswer/chat/prompts.yaml" -PERSONAS_YAML = "./danswer/chat/personas.yaml" -INPUT_PROMPT_YAML = "./danswer/chat/input_prompts.yaml" +PROMPTS_YAML = "./danswer/seeding/prompts.yaml" +PERSONAS_YAML = "./danswer/seeding/personas.yaml" +INPUT_PROMPT_YAML = "./danswer/seeding/input_prompts.yaml" NUM_RETURNED_HITS = 50 # Used for LLM filtering and reranking @@ -17,9 +17,6 @@ # ~3k input, half for docs, half for chat history + prompts CHAT_TARGET_CHUNK_PERCENTAGE = 512 * 3 / 3072 -# For selecting a different LLM question-answering prompt format -# Valid values: default, cot, weak -QA_PROMPT_OVERRIDE = os.environ.get("QA_PROMPT_OVERRIDE") or None # 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay # Capped in Vespa at 0.5 DOC_TIME_DECAY = float( @@ -27,8 +24,6 @@ ) BASE_RECENCY_DECAY = 0.5 FAVOR_RECENT_DECAY_MULTIPLIER = 2.0 -# Currently this next one is not configurable via env -DISABLE_LLM_QUERY_ANSWERABILITY = QA_PROMPT_OVERRIDE == "weak" # For the highest matching base size chunk, how many chunks above and below do we pull in by default # Note this is not in any of the deployment configs yet # Currently only applies to search flow not chat diff --git a/backend/danswer/configs/constants.py b/backend/danswer/configs/constants.py index 3e22e29df39..d7b887a0e5c 100644 --- a/backend/danswer/configs/constants.py +++ b/backend/danswer/configs/constants.py @@ -1,3 +1,5 @@ +import platform +import socket from enum import auto from enum import Enum @@ -34,7 +36,11 @@ POSTGRES_INDEXER_APP_NAME = "indexer" POSTGRES_CELERY_APP_NAME = "celery" POSTGRES_CELERY_BEAT_APP_NAME = "celery_beat" -POSTGRES_CELERY_WORKER_APP_NAME = "celery_worker" +POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME = "celery_worker_primary" +POSTGRES_CELERY_WORKER_LIGHT_APP_NAME = "celery_worker_light" +POSTGRES_CELERY_WORKER_HEAVY_APP_NAME = "celery_worker_heavy" +POSTGRES_CELERY_WORKER_INDEXING_APP_NAME = "celery_worker_indexing" +POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME = 
"celery_worker_indexing_child" POSTGRES_PERMISSIONS_APP_NAME = "permissions" POSTGRES_UNKNOWN_APP_NAME = "unknown" @@ -46,6 +52,7 @@ # Key-Value store keys KV_REINDEX_KEY = "needs_reindexing" KV_SEARCH_SETTINGS = "search_settings" +KV_UNSTRUCTURED_API_KEY = "unstructured_api_key" KV_USER_STORE_KEY = "INVITED_USERS" KV_NO_AUTH_USER_PREFERENCES_KEY = "no_auth_user_preferences" KV_CRED_KEY = "credential_id_{}" @@ -53,15 +60,30 @@ KV_GMAIL_SERVICE_ACCOUNT_KEY = "gmail_service_account_key" KV_GOOGLE_DRIVE_CRED_KEY = "google_drive_app_credential" KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key" -KV_SLACK_BOT_TOKENS_CONFIG_KEY = "slack_bot_tokens_config_key" KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time" KV_SETTINGS_KEY = "danswer_settings" KV_CUSTOMER_UUID_KEY = "customer_uuid" KV_INSTANCE_DOMAIN_KEY = "instance_domain" KV_ENTERPRISE_SETTINGS_KEY = "danswer_enterprise_settings" KV_CUSTOM_ANALYTICS_SCRIPT_KEY = "__custom_analytics_script__" +KV_DOCUMENTS_SEEDED_KEY = "documents_seeded" CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 60 +CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120 + +# needs to be long enough to cover the maximum time it takes to download an object +# if we can get callbacks as object bytes download, we could lower this a lot. +CELERY_INDEXING_LOCK_TIMEOUT = 3 * 60 * 60 # 60 min + +# needs to be long enough to cover the maximum time it takes to download an object +# if we can get callbacks as object bytes download, we could lower this a lot. +CELERY_PRUNING_LOCK_TIMEOUT = 300 # 5 min + +CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT = 300 # 5 min + +CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT = 300 # 5 min + +DANSWER_REDIS_FUNCTION_LOCK_PREFIX = "da_function_lock:" class DocumentSource(str, Enum): @@ -100,15 +122,24 @@ class DocumentSource(str, Enum): CLICKUP = "clickup" MEDIAWIKI = "mediawiki" WIKIPEDIA = "wikipedia" + ASANA = "asana" S3 = "s3" R2 = "r2" GOOGLE_CLOUD_STORAGE = "google_cloud_storage" OCI_STORAGE = "oci_storage" + XENFORO = "xenforo" NOT_APPLICABLE = "not_applicable" + FRESHDESK = "freshdesk" + FIREFLIES = "fireflies" + + +DocumentSourceRequiringTenantContext: list[DocumentSource] = [DocumentSource.FILE] class NotificationType(str, Enum): REINDEX = "reindex" + PERSONA_SHARED = "persona_shared" + TRIAL_ENDS_TWO_DAYS = "two_day_trial_ending" # 2 days left in trial class BlobType(str, Enum): @@ -133,6 +164,9 @@ class AuthType(str, Enum): OIDC = "oidc" SAML = "saml" + # google auth and basic + CLOUD = "cloud" + class SessionType(str, Enum): CHAT = "Chat" @@ -179,17 +213,43 @@ class PostgresAdvisoryLocks(Enum): class DanswerCeleryQueues: - VESPA_DOCSET_SYNC_GENERATOR = "vespa_docset_sync_generator" - VESPA_USERGROUP_SYNC_GENERATOR = "vespa_usergroup_sync_generator" + # Light queue VESPA_METADATA_SYNC = "vespa_metadata_sync" + DOC_PERMISSIONS_UPSERT = "doc_permissions_upsert" CONNECTOR_DELETION = "connector_deletion" + # Heavy queue + CONNECTOR_PRUNING = "connector_pruning" + CONNECTOR_DOC_PERMISSIONS_SYNC = "connector_doc_permissions_sync" + CONNECTOR_EXTERNAL_GROUP_SYNC = "connector_external_group_sync" + + # Indexing queue + CONNECTOR_INDEXING = "connector_indexing" + class DanswerRedisLocks: + PRIMARY_WORKER = "da_lock:primary_worker" CHECK_VESPA_SYNC_BEAT_LOCK = "da_lock:check_vespa_sync_beat" - MONITOR_VESPA_SYNC_BEAT_LOCK = "da_lock:monitor_vespa_sync_beat" CHECK_CONNECTOR_DELETION_BEAT_LOCK = "da_lock:check_connector_deletion_beat" - MONITOR_CONNECTOR_DELETION_BEAT_LOCK = "da_lock:monitor_connector_deletion_beat" + CHECK_PRUNE_BEAT_LOCK = 
"da_lock:check_prune_beat" + CHECK_INDEXING_BEAT_LOCK = "da_lock:check_indexing_beat" + CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK = ( + "da_lock:check_connector_doc_permissions_sync_beat" + ) + CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK = ( + "da_lock:check_connector_external_group_sync_beat" + ) + MONITOR_VESPA_SYNC_BEAT_LOCK = "da_lock:monitor_vespa_sync_beat" + + CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX = ( + "da_lock:connector_doc_permissions_sync" + ) + CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX = "da_lock:connector_external_group_sync" + PRUNING_LOCK_PREFIX = "da_lock:pruning" + INDEXING_METADATA_PREFIX = "da_metadata:indexing" + + SLACK_BOT_LOCK = "da_lock:slack_bot" + SLACK_BOT_HEARTBEAT_PREFIX = "da_heartbeat:slack_bot" class DanswerCeleryPriority(int, Enum): @@ -198,3 +258,13 @@ class DanswerCeleryPriority(int, Enum): MEDIUM = auto() LOW = auto() LOWEST = auto() + + +REDIS_SOCKET_KEEPALIVE_OPTIONS = {} +REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15 +REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3 + +if platform.system() == "Darwin": + REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPALIVE] = 60 # type: ignore +else: + REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPIDLE] = 60 # type: ignore diff --git a/backend/danswer/configs/model_configs.py b/backend/danswer/configs/model_configs.py index c9668cd8136..0618bf5f684 100644 --- a/backend/danswer/configs/model_configs.py +++ b/backend/danswer/configs/model_configs.py @@ -119,3 +119,14 @@ logger.error( "Failed to parse LITELLM_PASS_THROUGH_HEADERS, must be a valid JSON object" ) + + +# if specified, will merge the specified JSON with the existing body of the +# request before sending it to the LLM +LITELLM_EXTRA_BODY: dict | None = None +_LITELLM_EXTRA_BODY_RAW = os.environ.get("LITELLM_EXTRA_BODY") +if _LITELLM_EXTRA_BODY_RAW: + try: + LITELLM_EXTRA_BODY = json.loads(_LITELLM_EXTRA_BODY_RAW) + except Exception: + pass diff --git a/backend/danswer/configs/tool_configs.py b/backend/danswer/configs/tool_configs.py new file mode 100644 index 00000000000..3170cb31ff9 --- /dev/null +++ b/backend/danswer/configs/tool_configs.py @@ -0,0 +1,22 @@ +import json +import os + + +# if specified, will pass through request headers to the call to API calls made by custom tools +CUSTOM_TOOL_PASS_THROUGH_HEADERS: list[str] | None = None +_CUSTOM_TOOL_PASS_THROUGH_HEADERS_RAW = os.environ.get( + "CUSTOM_TOOL_PASS_THROUGH_HEADERS" +) +if _CUSTOM_TOOL_PASS_THROUGH_HEADERS_RAW: + try: + CUSTOM_TOOL_PASS_THROUGH_HEADERS = json.loads( + _CUSTOM_TOOL_PASS_THROUGH_HEADERS_RAW + ) + except Exception: + # need to import here to avoid circular imports + from danswer.utils.logger import setup_logger + + logger = setup_logger() + logger.error( + "Failed to parse CUSTOM_TOOL_PASS_THROUGH_HEADERS, must be a valid JSON object" + ) diff --git a/backend/danswer/connectors/README.md b/backend/danswer/connectors/README.md index ef6c63d2697..bb7f5a5fe4f 100644 --- a/backend/danswer/connectors/README.md +++ b/backend/danswer/connectors/README.md @@ -13,8 +13,8 @@ Connectors come in 3 different flows: documents via a connector's API or loads the documents from some sort of a dump file. - Poll connector: - Incrementally updates documents based on a provided time range. It is used by the background job to pull the latest - changes additions and changes since the last round of polling. 
This connector helps keep the document index up to date - without needing to fetch/embed/index every document which generally be too slow to do frequently on large sets of + changes and additions since the last round of polling. This connector helps keep the document index up to date + without needing to fetch/embed/index every document which would be too slow to do frequently on large sets of documents. - Event Based connectors: - Connectors that listen to events and update documents accordingly. diff --git a/backend/danswer/search/__init__.py b/backend/danswer/connectors/asana/__init__.py similarity index 100% rename from backend/danswer/search/__init__.py rename to backend/danswer/connectors/asana/__init__.py diff --git a/backend/danswer/connectors/asana/asana_api.py b/backend/danswer/connectors/asana/asana_api.py new file mode 100755 index 00000000000..57c470c4531 --- /dev/null +++ b/backend/danswer/connectors/asana/asana_api.py @@ -0,0 +1,233 @@ +import time +from collections.abc import Iterator +from datetime import datetime +from typing import Dict + +import asana # type: ignore + +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +# https://github.com/Asana/python-asana/tree/master?tab=readme-ov-file#documentation-for-api-endpoints +class AsanaTask: + def __init__( + self, + id: str, + title: str, + text: str, + link: str, + last_modified: datetime, + project_gid: str, + project_name: str, + ) -> None: + self.id = id + self.title = title + self.text = text + self.link = link + self.last_modified = last_modified + self.project_gid = project_gid + self.project_name = project_name + + def __str__(self) -> str: + return f"ID: {self.id}\nTitle: {self.title}\nLast modified: {self.last_modified}\nText: {self.text}" + + +class AsanaAPI: + def __init__( + self, api_token: str, workspace_gid: str, team_gid: str | None + ) -> None: + self._user = None # type: ignore + self.workspace_gid = workspace_gid + self.team_gid = team_gid + + self.configuration = asana.Configuration() + self.api_client = asana.ApiClient(self.configuration) + self.tasks_api = asana.TasksApi(self.api_client) + self.stories_api = asana.StoriesApi(self.api_client) + self.users_api = asana.UsersApi(self.api_client) + self.project_api = asana.ProjectsApi(self.api_client) + self.workspaces_api = asana.WorkspacesApi(self.api_client) + + self.api_error_count = 0 + self.configuration.access_token = api_token + self.task_count = 0 + + def get_tasks( + self, project_gids: list[str] | None, start_date: str + ) -> Iterator[AsanaTask]: + """Get all tasks from the projects with the given gids that were modified since the given date. 
+ If project_gids is None, get all tasks from all projects in the workspace.""" + logger.info("Starting to fetch Asana projects") + projects = self.project_api.get_projects( + opts={ + "workspace": self.workspace_gid, + "opt_fields": "gid,name,archived,modified_at", + } + ) + start_seconds = int(time.mktime(datetime.now().timetuple())) + projects_list = [] + project_count = 0 + for project_info in projects: + project_gid = project_info["gid"] + if project_gids is None or project_gid in project_gids: + projects_list.append(project_gid) + else: + logger.debug( + f"Skipping project: {project_gid} - not in accepted project_gids" + ) + project_count += 1 + if project_count % 100 == 0: + logger.info(f"Processed {project_count} projects") + + logger.info(f"Found {len(projects_list)} projects to process") + for project_gid in projects_list: + for task in self._get_tasks_for_project( + project_gid, start_date, start_seconds + ): + yield task + logger.info(f"Completed fetching {self.task_count} tasks from Asana") + if self.api_error_count > 0: + logger.warning( + f"Encountered {self.api_error_count} API errors during task fetching" + ) + + def _get_tasks_for_project( + self, project_gid: str, start_date: str, start_seconds: int + ) -> Iterator[AsanaTask]: + project = self.project_api.get_project(project_gid, opts={}) + if project["archived"]: + logger.info(f"Skipping archived project: {project['name']} ({project_gid})") + return [] + if not project["team"] or not project["team"]["gid"]: + logger.info( + f"Skipping project without a team: {project['name']} ({project_gid})" + ) + return [] + if project["privacy_setting"] == "private": + if self.team_gid and project["team"]["gid"] != self.team_gid: + logger.info( + f"Skipping private project not in configured team: {project['name']} ({project_gid})" + ) + return [] + else: + logger.info( + f"Processing private project in configured team: {project['name']} ({project_gid})" + ) + + simple_start_date = start_date.split(".")[0].split("+")[0] + logger.info( + f"Fetching tasks modified since {simple_start_date} for project: {project['name']} ({project_gid})" + ) + + opts = { + "opt_fields": "name,memberships,memberships.project,completed_at,completed_by,created_at," + "created_by,custom_fields,dependencies,due_at,due_on,external,html_notes,liked,likes," + "modified_at,notes,num_hearts,parent,projects,resource_subtype,resource_type,start_on," + "workspace,permalink_url", + "modified_since": start_date, + } + tasks_from_api = self.tasks_api.get_tasks_for_project(project_gid, opts) + for data in tasks_from_api: + self.task_count += 1 + if self.task_count % 10 == 0: + end_seconds = time.mktime(datetime.now().timetuple()) + runtime_seconds = end_seconds - start_seconds + if runtime_seconds > 0: + logger.info( + f"Processed {self.task_count} tasks in {runtime_seconds:.0f} seconds " + f"({self.task_count / runtime_seconds:.2f} tasks/second)" + ) + + logger.debug(f"Processing Asana task: {data['name']}") + + text = self._construct_task_text(data) + + try: + text += self._fetch_and_add_comments(data["gid"]) + + last_modified_date = self.format_date(data["modified_at"]) + text += f"Last modified: {last_modified_date}\n" + + task = AsanaTask( + id=data["gid"], + title=data["name"], + text=text, + link=data["permalink_url"], + last_modified=datetime.fromisoformat(data["modified_at"]), + project_gid=project_gid, + project_name=project["name"], + ) + yield task + except Exception: + logger.error( + f"Error processing task {data['gid']} in project {project_gid}", + 
exc_info=True, + ) + self.api_error_count += 1 + + def _construct_task_text(self, data: Dict) -> str: + text = f"{data['name']}\n\n" + + if data["notes"]: + text += f"{data['notes']}\n\n" + + if data["created_by"] and data["created_by"]["gid"]: + creator = self.get_user(data["created_by"]["gid"])["name"] + created_date = self.format_date(data["created_at"]) + text += f"Created by: {creator} on {created_date}\n" + + if data["due_on"]: + due_date = self.format_date(data["due_on"]) + text += f"Due date: {due_date}\n" + + if data["completed_at"]: + completed_date = self.format_date(data["completed_at"]) + text += f"Completed on: {completed_date}\n" + + text += "\n" + return text + + def _fetch_and_add_comments(self, task_gid: str) -> str: + text = "" + stories_opts: Dict[str, str] = {} + story_start = time.time() + stories = self.stories_api.get_stories_for_task(task_gid, stories_opts) + + story_count = 0 + comment_count = 0 + + for story in stories: + story_count += 1 + if story["resource_subtype"] == "comment_added": + comment = self.stories_api.get_story( + story["gid"], opts={"opt_fields": "text,created_by,created_at"} + ) + commenter = self.get_user(comment["created_by"]["gid"])["name"] + text += f"Comment by {commenter}: {comment['text']}\n\n" + comment_count += 1 + + story_duration = time.time() - story_start + logger.debug( + f"Processed {story_count} stories (including {comment_count} comments) in {story_duration:.2f} seconds" + ) + + return text + + def get_user(self, user_gid: str) -> Dict: + if self._user is not None: + return self._user + self._user = self.users_api.get_user(user_gid, {"opt_fields": "name,email"}) + + if not self._user: + logger.warning(f"Unable to fetch user information for user_gid: {user_gid}") + return {"name": "Unknown"} + return self._user + + def format_date(self, date_str: str) -> str: + date = datetime.fromisoformat(date_str) + return time.strftime("%Y-%m-%d", date.timetuple()) + + def get_time(self) -> str: + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) diff --git a/backend/danswer/connectors/asana/connector.py b/backend/danswer/connectors/asana/connector.py new file mode 100755 index 00000000000..3e2c9a8aaf6 --- /dev/null +++ b/backend/danswer/connectors/asana/connector.py @@ -0,0 +1,120 @@ +import datetime +from typing import Any + +from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource +from danswer.connectors.asana import asana_api +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import Document +from danswer.connectors.models import Section +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +class AsanaConnector(LoadConnector, PollConnector): + def __init__( + self, + asana_workspace_id: str, + asana_project_ids: str | None = None, + asana_team_id: str | None = None, + batch_size: int = INDEX_BATCH_SIZE, + continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE, + ) -> None: + self.workspace_id = asana_workspace_id + self.project_ids_to_index: list[str] | None = ( + asana_project_ids.split(",") if asana_project_ids is not None else None + ) + self.asana_team_id = asana_team_id + self.batch_size = batch_size + self.continue_on_failure = continue_on_failure + 
logger.info( + f"AsanaConnector initialized with workspace_id: {asana_workspace_id}" + ) + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + self.api_token = credentials["asana_api_token_secret"] + self.asana_client = asana_api.AsanaAPI( + api_token=self.api_token, + workspace_gid=self.workspace_id, + team_gid=self.asana_team_id, + ) + logger.info("Asana credentials loaded and API client initialized") + return None + + def poll_source( + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch | None + ) -> GenerateDocumentsOutput: + start_time = datetime.datetime.fromtimestamp(start).isoformat() + logger.info(f"Starting Asana poll from {start_time}") + asana = asana_api.AsanaAPI( + api_token=self.api_token, + workspace_gid=self.workspace_id, + team_gid=self.asana_team_id, + ) + docs_batch: list[Document] = [] + tasks = asana.get_tasks(self.project_ids_to_index, start_time) + + for task in tasks: + doc = self._message_to_doc(task) + docs_batch.append(doc) + + if len(docs_batch) >= self.batch_size: + logger.info(f"Yielding batch of {len(docs_batch)} documents") + yield docs_batch + docs_batch = [] + + if docs_batch: + logger.info(f"Yielding final batch of {len(docs_batch)} documents") + yield docs_batch + + logger.info("Asana poll completed") + + def load_from_state(self) -> GenerateDocumentsOutput: + logger.notice("Starting full index of all Asana tasks") + return self.poll_source(start=0, end=None) + + def _message_to_doc(self, task: asana_api.AsanaTask) -> Document: + logger.debug(f"Converting Asana task {task.id} to Document") + return Document( + id=task.id, + sections=[Section(link=task.link, text=task.text)], + doc_updated_at=task.last_modified, + source=DocumentSource.ASANA, + semantic_identifier=task.title, + metadata={ + "group": task.project_gid, + "project": task.project_name, + }, + ) + + +if __name__ == "__main__": + import time + import os + + logger.notice("Starting Asana connector test") + connector = AsanaConnector( + os.environ["WORKSPACE_ID"], + os.environ["PROJECT_IDS"], + os.environ["TEAM_ID"], + ) + connector.load_credentials( + { + "asana_api_token_secret": os.environ["API_TOKEN"], + } + ) + logger.info("Loading all documents from Asana") + all_docs = connector.load_from_state() + current = time.time() + one_day_ago = current - 24 * 60 * 60 # 1 day + logger.info("Polling for documents updated in the last 24 hours") + latest_docs = connector.poll_source(one_day_ago, current) + for docs in latest_docs: + for doc in docs: + print(doc.id) + logger.notice("Asana connector test completed") diff --git a/backend/danswer/connectors/axero/connector.py b/backend/danswer/connectors/axero/connector.py index a4d5162b6ce..000151209de 100644 --- a/backend/danswer/connectors/axero/connector.py +++ b/backend/danswer/connectors/axero/connector.py @@ -15,7 +15,6 @@ from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) -from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch @@ -24,6 +23,7 @@ from danswer.connectors.models import Section from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder logger = setup_logger() diff --git a/backend/danswer/connectors/blob/connector.py 
b/backend/danswer/connectors/blob/connector.py index a664a3d764a..20434c23075 100644 --- a/backend/danswer/connectors/blob/connector.py +++ b/backend/danswer/connectors/blob/connector.py @@ -5,9 +5,9 @@ from typing import Any from typing import Optional -import boto3 -from botocore.client import Config -from mypy_boto3_s3 import S3Client +import boto3 # type: ignore +from botocore.client import Config # type: ignore +from mypy_boto3_s3 import S3Client # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import BlobType @@ -194,8 +194,8 @@ def _yield_blob_objects( try: text = extract_file_text( - name, BytesIO(downloaded_file), + file_name=name, break_on_unprocessable=False, ) batch.append( diff --git a/backend/danswer/connectors/bookstack/connector.py b/backend/danswer/connectors/bookstack/connector.py index f2e692d2c5f..9255bc3b8e5 100644 --- a/backend/danswer/connectors/bookstack/connector.py +++ b/backend/danswer/connectors/bookstack/connector.py @@ -44,8 +44,6 @@ def _get_doc_batch( start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, ) -> tuple[list[Document], int]: - doc_batch: list[Document] = [] - params = { "count": str(batch_size), "offset": str(start_ind), @@ -63,8 +61,7 @@ def _get_doc_batch( ) batch = bookstack_client.get(endpoint, params=params).get("data", []) - for item in batch: - doc_batch.append(transformer(bookstack_client, item)) + doc_batch = [transformer(bookstack_client, item) for item in batch] return doc_batch, len(batch) diff --git a/backend/danswer/connectors/clickup/connector.py b/backend/danswer/connectors/clickup/connector.py index 78d572af413..2ccc5ef4f65 100644 --- a/backend/danswer/connectors/clickup/connector.py +++ b/backend/danswer/connectors/clickup/connector.py @@ -10,7 +10,6 @@ from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) -from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector @@ -19,6 +18,7 @@ from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.utils.retry_wrapper import retry_builder CLICKUP_API_BASE_URL = "https://api.clickup.com/api/v2" @@ -210,6 +210,7 @@ def poll_source( "clickup_team_id": os.environ["clickup_team_id"], } ) + latest_docs = clickup_connector.load_from_state() for doc in latest_docs: diff --git a/backend/danswer/connectors/confluence/connector.py b/backend/danswer/connectors/confluence/connector.py index 78efce4ab98..e30c85922ce 100644 --- a/backend/danswer/connectors/confluence/connector.py +++ b/backend/danswer/connectors/confluence/connector.py @@ -1,40 +1,29 @@ -import io -import os -from collections.abc import Callable -from collections.abc import Collection from datetime import datetime from datetime import timezone -from functools import lru_cache from typing import Any -from typing import cast +from urllib.parse import quote -import bs4 -from atlassian import Confluence # type:ignore -from requests import HTTPError - -from danswer.configs.app_configs import ( - CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD, -) -from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD -from danswer.configs.app_configs import 
CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP -from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.confluence.rate_limit_handler import ( - make_confluence_call_handle_rate_limit, -) +from danswer.connectors.confluence.onyx_confluence import build_confluence_client +from danswer.connectors.confluence.onyx_confluence import OnyxConfluence +from danswer.connectors.confluence.utils import attachment_to_content +from danswer.connectors.confluence.utils import build_confluence_document_id +from danswer.connectors.confluence.utils import datetime_from_string +from danswer.connectors.confluence.utils import extract_text_from_confluence_html from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section -from danswer.file_processing.extract_file_text import extract_file_text -from danswer.file_processing.html_utils import format_document_soup +from danswer.connectors.models import SlimDocument from danswer.utils.logger import setup_logger logger = setup_logger() @@ -43,266 +32,37 @@ # 1. Include attachments, etc # 2. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost - -NO_PERMISSIONS_TO_VIEW_ATTACHMENTS_ERROR_STR = ( - "User not permitted to view attachments on content" -) -NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = ( - "No parent or not permitted to view content with id" -) - - -@lru_cache() -def _get_user(user_id: str, confluence_client: Confluence) -> str: - """Get Confluence Display Name based on the account-id or userkey value - - Args: - user_id (str): The user id (i.e: the account-id or userkey) - confluence_client (Confluence): The Confluence Client - - Returns: - str: The User Display Name. 
'Unknown User' if the user is deactivated or not found - """ - user_not_found = "Unknown User" - - get_user_details_by_accountid = make_confluence_call_handle_rate_limit( - confluence_client.get_user_details_by_accountid - ) - try: - return get_user_details_by_accountid(user_id).get("displayName", user_not_found) - except Exception as e: - logger.warning( - f"Unable to get the User Display Name with the id: '{user_id}' - {e}" - ) - return user_not_found - - -def parse_html_page(text: str, confluence_client: Confluence) -> str: - """Parse a Confluence html page and replace the 'user Id' by the real - User Display Name - - Args: - text (str): The page content - confluence_client (Confluence): Confluence client - - Returns: - str: loaded and formated Confluence page - """ - soup = bs4.BeautifulSoup(text, "html.parser") - for user in soup.findAll("ri:user"): - user_id = ( - user.attrs["ri:account-id"] - if "ri:account-id" in user.attrs - else user.get("ri:userkey") - ) - if not user_id: - logger.warning( - "ri:userkey not found in ri:user element. " f"Found attrs: {user.attrs}" - ) - continue - # Include @ sign for tagging, more clear for LLM - user.replaceWith("@" + _get_user(user_id, confluence_client)) - return format_document_soup(soup) - - -def get_used_attachments(text: str, confluence_client: Confluence) -> list[str]: - """Parse a Confluence html page to generate a list of current - attachment in used - - Args: - text (str): The page content - confluence_client (Confluence): Confluence client - - Returns: - list[str]: List of filename currently in used - """ - files_in_used = [] - soup = bs4.BeautifulSoup(text, "html.parser") - for attachment in soup.findAll("ri:attachment"): - files_in_used.append(attachment.attrs["ri:filename"]) - return files_in_used - - -def _comment_dfs( - comments_str: str, - comment_pages: Collection[dict[str, Any]], - confluence_client: Confluence, -) -> str: - get_page_child_by_type = make_confluence_call_handle_rate_limit( - confluence_client.get_page_child_by_type - ) - - for comment_page in comment_pages: - comment_html = comment_page["body"]["storage"]["value"] - comments_str += "\nComment:\n" + parse_html_page( - comment_html, confluence_client - ) - try: - child_comment_pages = get_page_child_by_type( - comment_page["id"], - type="comment", - start=None, - limit=None, - expand="body.storage.value", - ) - comments_str = _comment_dfs( - comments_str, child_comment_pages, confluence_client - ) - except HTTPError as e: - # not the cleanest, but I'm not aware of a nicer way to check the error - if NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR not in str(e): - raise - - return comments_str - - -def _datetime_from_string(datetime_string: str) -> datetime: - datetime_object = datetime.fromisoformat(datetime_string) - - if datetime_object.tzinfo is None: - # If no timezone info, assume it is UTC - datetime_object = datetime_object.replace(tzinfo=timezone.utc) - else: - # If not in UTC, translate it - datetime_object = datetime_object.astimezone(timezone.utc) - - return datetime_object - - -class RecursiveIndexer: - def __init__( - self, - batch_size: int, - confluence_client: Confluence, - index_recursively: bool, - origin_page_id: str, - ) -> None: - self.batch_size = 1 - # batch_size - self.confluence_client = confluence_client - self.index_recursively = index_recursively - self.origin_page_id = origin_page_id - self.pages = self.recurse_children_pages(0, self.origin_page_id) - - def get_origin_page(self) -> list[dict[str, Any]]: - return [self._fetch_origin_page()] - - 
def get_pages(self, ind: int, size: int) -> list[dict]: - if ind * size > len(self.pages): - return [] - return self.pages[ind * size : (ind + 1) * size] - - def _fetch_origin_page( - self, - ) -> dict[str, Any]: - get_page_by_id = make_confluence_call_handle_rate_limit( - self.confluence_client.get_page_by_id - ) - try: - origin_page = get_page_by_id( - self.origin_page_id, expand="body.storage.value,version" - ) - return origin_page - except Exception as e: - logger.warning( - f"Appending orgin page with id {self.origin_page_id} failed: {e}" - ) - return {} - - def recurse_children_pages( - self, - start_ind: int, - page_id: str, - ) -> list[dict[str, Any]]: - pages: list[dict[str, Any]] = [] - current_level_pages: list[dict[str, Any]] = [] - next_level_pages: list[dict[str, Any]] = [] - - # Initial fetch of first level children - index = start_ind - while batch := self._fetch_single_depth_child_pages( - index, self.batch_size, page_id - ): - current_level_pages.extend(batch) - index += len(batch) - - pages.extend(current_level_pages) - - # Recursively index children and children's children, etc. - while current_level_pages: - for child in current_level_pages: - child_index = 0 - while child_batch := self._fetch_single_depth_child_pages( - child_index, self.batch_size, child["id"] - ): - next_level_pages.extend(child_batch) - child_index += len(child_batch) - - pages.extend(next_level_pages) - current_level_pages = next_level_pages - next_level_pages = [] - - try: - origin_page = self._fetch_origin_page() - pages.append(origin_page) - except Exception as e: - logger.warning(f"Appending origin page with id {page_id} failed: {e}") - - return pages - - def _fetch_single_depth_child_pages( - self, start_ind: int, batch_size: int, page_id: str - ) -> list[dict[str, Any]]: - child_pages: list[dict[str, Any]] = [] - - get_page_child_by_type = make_confluence_call_handle_rate_limit( - self.confluence_client.get_page_child_by_type - ) - - try: - child_page = get_page_child_by_type( - page_id, - type="page", - start=start_ind, - limit=batch_size, - expand="body.storage.value,version", - ) - - child_pages.extend(child_page) - return child_pages - - except Exception: - logger.warning( - f"Batch failed with page {page_id} at offset {start_ind} " - f"with size {batch_size}, processing pages individually..." 
- ) - - for i in range(batch_size): - ind = start_ind + i - try: - child_page = get_page_child_by_type( - page_id, - type="page", - start=ind, - limit=1, - expand="body.storage.value,version", - ) - child_pages.extend(child_page) - except Exception as e: - logger.warning(f"Page {page_id} at offset {ind} failed: {e}") - raise e - - return child_pages - - -class ConfluenceConnector(LoadConnector, PollConnector): +_COMMENT_EXPANSION_FIELDS = ["body.storage.value"] +_PAGE_EXPANSION_FIELDS = [ + "body.storage.value", + "version", + "space", + "metadata.labels", +] +_ATTACHMENT_EXPANSION_FIELDS = [ + "version", + "space", + "metadata.labels", +] + +_RESTRICTIONS_EXPANSION_FIELDS = [ + "space", + "restrictions.read.restrictions.user", + "restrictions.read.restrictions.group", +] + +_SLIM_DOC_BATCH_SIZE = 5000 + + +class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, wiki_base: str, - space: str, is_cloud: bool, + space: str = "", page_id: str = "", index_recursively: bool = True, + cql_query: str | None = None, batch_size: int = INDEX_BATCH_SIZE, continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE, # if a page has one of the labels specified in this list, we will just @@ -312,500 +72,237 @@ def __init__( ) -> None: self.batch_size = batch_size self.continue_on_failure = continue_on_failure - self.labels_to_skip = set(labels_to_skip) - self.recursive_indexer: RecursiveIndexer | None = None - self.index_recursively = index_recursively + self._confluence_client: OnyxConfluence | None = None + self.is_cloud = is_cloud # Remove trailing slash from wiki_base if present self.wiki_base = wiki_base.rstrip("/") - self.space = space - self.page_id = page_id - - self.is_cloud = is_cloud - self.space_level_scan = False - self.confluence_client: Confluence | None = None - - if self.page_id is None or self.page_id == "": - self.space_level_scan = True + # if nothing is provided, we will fetch all pages + cql_page_query = "type=page" + if cql_query: + # if a cql_query is provided, we will use it to fetch the pages + cql_page_query = cql_query + elif page_id: + # if a cql_query is not provided, we will use the page_id to fetch the page + if index_recursively: + cql_page_query += f" and ancestor='{page_id}'" + else: + cql_page_query += f" and id='{page_id}'" + elif space: + # if no cql_query or page_id is provided, we will use the space to fetch the pages + cql_page_query += f" and space='{quote(space)}'" + + self.cql_page_query = cql_page_query + self.cql_time_filter = "" + + self.cql_label_filter = "" + if labels_to_skip: + labels_to_skip = list(set(labels_to_skip)) + comma_separated_labels = ",".join( + f"'{quote(label)}'" for label in labels_to_skip + ) + self.cql_label_filter = f" and label not in ({comma_separated_labels})" - logger.info( - f"wiki_base: {self.wiki_base}, space: {self.space}, page_id: {self.page_id}," - + f" space_level_scan: {self.space_level_scan}, index_recursively: {self.index_recursively}" - ) + @property + def confluence_client(self) -> OnyxConfluence: + if self._confluence_client is None: + raise ConnectorMissingCredentialError("Confluence") + return self._confluence_client def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - username = credentials["confluence_username"] - access_token = credentials["confluence_access_token"] - self.confluence_client = Confluence( - url=self.wiki_base, - # passing in username causes issues for Confluence data center - username=username if self.is_cloud else None, - 
password=access_token if self.is_cloud else None, - token=access_token if not self.is_cloud else None, + # see https://github.com/atlassian-api/atlassian-python-api/blob/master/atlassian/rest_client.py + # for a list of other hidden constructor args + self._confluence_client = build_confluence_client( + credentials=credentials, + is_cloud=self.is_cloud, + wiki_base=self.wiki_base, ) return None - def _fetch_pages( - self, - confluence_client: Confluence, - start_ind: int, - ) -> list[dict[str, Any]]: - def _fetch_space(start_ind: int, batch_size: int) -> list[dict[str, Any]]: - get_all_pages_from_space = make_confluence_call_handle_rate_limit( - confluence_client.get_all_pages_from_space - ) - try: - return get_all_pages_from_space( - self.space, - start=start_ind, - limit=batch_size, - status=( - None if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES else "current" - ), - expand="body.storage.value,version", - ) - except Exception: - logger.warning( - f"Batch failed with space {self.space} at offset {start_ind} " - f"with size {batch_size}, processing pages individually..." - ) - - view_pages: list[dict[str, Any]] = [] - for i in range(self.batch_size): - try: - # Could be that one of the pages here failed due to this bug: - # https://jira.atlassian.com/browse/CONFCLOUD-76433 - view_pages.extend( - get_all_pages_from_space( - self.space, - start=start_ind + i, - limit=1, - status=( - None - if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES - else "current" - ), - expand="body.storage.value,version", - ) - ) - except HTTPError as e: - logger.warning( - f"Page failed with space {self.space} at offset {start_ind + i}, " - f"trying alternative expand option: {e}" - ) - # Use view instead, which captures most info but is less complete - view_pages.extend( - get_all_pages_from_space( - self.space, - start=start_ind + i, - limit=1, - expand="body.view.value,version", - ) - ) - - return view_pages - - def _fetch_page(start_ind: int, batch_size: int) -> list[dict[str, Any]]: - if self.recursive_indexer is None: - self.recursive_indexer = RecursiveIndexer( - origin_page_id=self.page_id, - batch_size=self.batch_size, - confluence_client=self.confluence_client, - index_recursively=self.index_recursively, - ) - - if self.index_recursively: - return self.recursive_indexer.get_pages(start_ind, batch_size) - else: - return self.recursive_indexer.get_origin_page() + def _get_comment_string_for_page_id(self, page_id: str) -> str: + comment_string = "" - pages: list[dict[str, Any]] = [] + comment_cql = f"type=comment and container='{page_id}'" + comment_cql += self.cql_label_filter - try: - pages = ( - _fetch_space(start_ind, self.batch_size) - if self.space_level_scan - else _fetch_page(start_ind, self.batch_size) + expand = ",".join(_COMMENT_EXPANSION_FIELDS) + for comment in self.confluence_client.paginated_cql_retrieval( + cql=comment_cql, + expand=expand, + ): + comment_string += "\nComment:\n" + comment_string += extract_text_from_confluence_html( + confluence_client=self.confluence_client, + confluence_object=comment, + fetched_titles=set(), ) - return pages - - except Exception as e: - if not self.continue_on_failure: - raise e - - # error checking phase, only reachable if `self.continue_on_failure=True` - for i in range(self.batch_size): - try: - pages = ( - _fetch_space(start_ind, self.batch_size) - if self.space_level_scan - else _fetch_page(start_ind, self.batch_size) - ) - return pages - - except Exception: - logger.exception( - "Ran into exception when fetching pages from Confluence" - ) - return pages 
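For reference, the reworked connector drives its fetching off a single CQL string: `__init__` assembles the page/space scoping plus an optional label exclusion, and `poll_source` appends a `lastmodified` window. A rough sketch of how those fragments compose, with made-up example values (the real code keeps them as separate attributes and joins them per query; the `sorted()` here is only to make the example output deterministic):

from urllib.parse import quote

# Hypothetical inputs for illustration only.
space = "ENG"
labels_to_skip = ["archived", "draft"]
start_str, end_str = "2024-01-01 00:00", "2024-01-02 00:00"

cql_page_query = "type=page" + f" and space='{quote(space)}'"
comma_separated_labels = ",".join(
    f"'{quote(label)}'" for label in sorted(set(labels_to_skip))
)
cql_label_filter = f" and label not in ({comma_separated_labels})"
cql_time_filter = (
    f" and lastmodified >= '{start_str}'" f" and lastmodified <= '{end_str}'"
)

# Mirrors: page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
page_query = cql_page_query + cql_label_filter + cql_time_filter
print(page_query)
# type=page and space='ENG' and label not in ('archived','draft')
#   and lastmodified >= '2024-01-01 00:00' and lastmodified <= '2024-01-02 00:00'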
- - def _fetch_comments(self, confluence_client: Confluence, page_id: str) -> str: - get_page_child_by_type = make_confluence_call_handle_rate_limit( - confluence_client.get_page_child_by_type + return comment_string + + def _convert_object_to_document( + self, confluence_object: dict[str, Any] + ) -> Document | None: + """ + Takes in a confluence object, extracts all metadata, and converts it into a document. + If its a page, it extracts the text, adds the comments for the document text. + If its an attachment, it just downloads the attachment and converts that into a document. + """ + # The url and the id are the same + object_url = build_confluence_document_id( + self.wiki_base, confluence_object["_links"]["webui"], self.is_cloud ) - try: - comment_pages = cast( - Collection[dict[str, Any]], - get_page_child_by_type( - page_id, - type="comment", - start=None, - limit=None, - expand="body.storage.value", - ), + object_text = None + # Extract text from page + if confluence_object["type"] == "page": + object_text = extract_text_from_confluence_html( + confluence_client=self.confluence_client, + confluence_object=confluence_object, + fetched_titles={confluence_object.get("title", "")}, ) - return _comment_dfs("", comment_pages, confluence_client) - except Exception as e: - if not self.continue_on_failure: - raise e - - logger.exception( - "Ran into exception when fetching comments from Confluence" + # Add comments to text + object_text += self._get_comment_string_for_page_id(confluence_object["id"]) + elif confluence_object["type"] == "attachment": + object_text = attachment_to_content( + confluence_client=self.confluence_client, attachment=confluence_object ) - return "" - - def _fetch_labels(self, confluence_client: Confluence, page_id: str) -> list[str]: - get_page_labels = make_confluence_call_handle_rate_limit( - confluence_client.get_page_labels - ) - try: - labels_response = get_page_labels(page_id) - return [label["name"] for label in labels_response["results"]] - except Exception as e: - if not self.continue_on_failure: - raise e - - logger.exception("Ran into exception when fetching labels from Confluence") - return [] - - @classmethod - def _attachment_to_download_link( - cls, confluence_client: Confluence, attachment: dict[str, Any] - ) -> str: - return confluence_client.url + attachment["_links"]["download"] - - @classmethod - def _attachment_to_content( - cls, - confluence_client: Confluence, - attachment: dict[str, Any], - ) -> str | None: - """If it returns None, assume that we should skip this attachment.""" - if attachment["metadata"]["mediaType"] in [ - "image/jpeg", - "image/png", - "image/gif", - "image/svg+xml", - "video/mp4", - "video/quicktime", - ]: - return None - - download_link = cls._attachment_to_download_link(confluence_client, attachment) - attachment_size = attachment["extensions"]["fileSize"] - if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD: - logger.warning( - f"Skipping {download_link} due to size. 
" - f"size={attachment_size} " - f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}" - ) + if object_text is None: + # This only happens for attachments that are not parseable return None - response = confluence_client._session.get(download_link) - if response.status_code != 200: - logger.warning( - f"Failed to fetch {download_link} with invalid status code {response.status_code}" - ) - return None - - extracted_text = extract_file_text( - attachment["title"], io.BytesIO(response.content), False - ) - if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD: - logger.warning( - f"Skipping {download_link} due to char count. " - f"char count={len(extracted_text)} " - f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD}" - ) - return None - - return extracted_text - - def _fetch_attachments( - self, confluence_client: Confluence, page_id: str, files_in_used: list[str] - ) -> tuple[str, list[dict[str, Any]]]: - unused_attachments: list = [] + # Get space name + doc_metadata: dict[str, str | list[str]] = { + "Wiki Space Name": confluence_object["space"]["name"] + } - get_attachments_from_content = make_confluence_call_handle_rate_limit( - confluence_client.get_attachments_from_content + # Get labels + label_dicts = confluence_object["metadata"]["labels"]["results"] + page_labels = [label["name"] for label in label_dicts] + if page_labels: + doc_metadata["labels"] = page_labels + + # Get last modified and author email + last_modified = datetime_from_string(confluence_object["version"]["when"]) + author_email = confluence_object["version"].get("by", {}).get("email") + + return Document( + id=object_url, + sections=[Section(link=object_url, text=object_text)], + source=DocumentSource.CONFLUENCE, + semantic_identifier=confluence_object["title"], + doc_updated_at=last_modified, + primary_owners=( + [BasicExpertInfo(email=author_email)] if author_email else None + ), + metadata=doc_metadata, ) - files_attachment_content: list = [] - - try: - expand = "history.lastUpdated,metadata.labels" - attachments_container = get_attachments_from_content( - page_id, start=0, limit=500, expand=expand - ) - for attachment in attachments_container["results"]: - if attachment["title"] not in files_in_used: - unused_attachments.append(attachment) - continue - attachment_content = self._attachment_to_content( - confluence_client, attachment - ) - if attachment_content: - files_attachment_content.append(attachment_content) - - except Exception as e: - if isinstance( - e, HTTPError - ) and NO_PERMISSIONS_TO_VIEW_ATTACHMENTS_ERROR_STR in str(e): - logger.warning( - f"User does not have access to attachments on page '{page_id}'" - ) - return "", [] - - if not self.continue_on_failure: - raise e - logger.exception( - f"Ran into exception when fetching attachments from Confluence: {e}" - ) - - return "\n".join(files_attachment_content), unused_attachments - - def _get_doc_batch( - self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None - ) -> tuple[list[Document], list[dict[str, Any]], int]: + def _fetch_document_batches(self) -> GenerateDocumentsOutput: doc_batch: list[Document] = [] - unused_attachments: list[dict[str, Any]] = [] - - if self.confluence_client is None: - raise ConnectorMissingCredentialError("Confluence") - batch = self._fetch_pages(self.confluence_client, start_ind) - - for page in batch: - last_modified = _datetime_from_string(page["version"]["when"]) - author = cast(str | None, page["version"].get("by", {}).get("email")) - - if time_filter 
and not time_filter(last_modified): - continue - - page_id = page["id"] - - if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING: - page_labels = self._fetch_labels(self.confluence_client, page_id) - - # check disallowed labels - if self.labels_to_skip: - label_intersection = self.labels_to_skip.intersection(page_labels) - if label_intersection: - logger.info( - f"Page with ID '{page_id}' has a label which has been " - f"designated as disallowed: {label_intersection}. Skipping." - ) + confluence_page_ids: list[str] = [] + + page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter + # Fetch pages as Documents + for page in self.confluence_client.paginated_cql_retrieval( + cql=page_query, + expand=",".join(_PAGE_EXPANSION_FIELDS), + limit=self.batch_size, + ): + confluence_page_ids.append(page["id"]) + doc = self._convert_object_to_document(page) + if doc is not None: + doc_batch.append(doc) + if len(doc_batch) >= self.batch_size: + yield doc_batch + doc_batch = [] + + # Fetch attachments as Documents + for confluence_page_id in confluence_page_ids: + attachment_cql = f"type=attachment and container='{confluence_page_id}'" + attachment_cql += self.cql_label_filter + # TODO: maybe should add time filter as well? + for attachment in self.confluence_client.paginated_cql_retrieval( + cql=attachment_cql, + expand=",".join(_ATTACHMENT_EXPANSION_FIELDS), + ): + doc = self._convert_object_to_document(attachment) + if doc is not None: + doc_batch.append(doc) + if len(doc_batch) >= self.batch_size: + yield doc_batch + doc_batch = [] + + if doc_batch: + yield doc_batch - continue + def load_from_state(self) -> GenerateDocumentsOutput: + return self._fetch_document_batches() - page_html = ( - page["body"].get("storage", page["body"].get("view", {})).get("value") - ) - page_url = self.wiki_base + page["_links"]["webui"] - if not page_html: - logger.debug("Page is empty, skipping: %s", page_url) - continue - page_text = parse_html_page(page_html, self.confluence_client) - - files_in_used = get_used_attachments(page_html, self.confluence_client) - attachment_text, unused_page_attachments = self._fetch_attachments( - self.confluence_client, page_id, files_in_used - ) - unused_attachments.extend(unused_page_attachments) - - page_text += attachment_text - comments_text = self._fetch_comments(self.confluence_client, page_id) - page_text += comments_text - doc_metadata: dict[str, str | list[str]] = {"Wiki Space Name": self.space} - if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels: - doc_metadata["labels"] = page_labels - - doc_batch.append( - Document( - id=page_url, - sections=[Section(link=page_url, text=page_text)], - source=DocumentSource.CONFLUENCE, - semantic_identifier=page["title"], - doc_updated_at=last_modified, - primary_owners=( - [BasicExpertInfo(email=author)] if author else None - ), - metadata=doc_metadata, - ) - ) - return ( - doc_batch, - unused_attachments, - len(batch), + def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput: + # Add time filters + formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime( + "%Y-%m-%d %H:%M" + ) + formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime( + "%Y-%m-%d %H:%M" ) + self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'" + self.cql_time_filter += f" and lastmodified <= '{formatted_end_time}'" + return self._fetch_document_batches() - def _get_attachment_batch( + def retrieve_all_slim_documents( self, - start_ind: int, 
- attachments: list[dict[str, Any]], - time_filter: Callable[[datetime], bool] | None = None, - ) -> tuple[list[Document], int]: - doc_batch: list[Document] = [] - - if self.confluence_client is None: - raise ConnectorMissingCredentialError("Confluence") - - end_ind = min(start_ind + self.batch_size, len(attachments)) - - for attachment in attachments[start_ind:end_ind]: - last_updated = _datetime_from_string( - attachment["history"]["lastUpdated"]["when"] - ) - - if time_filter and not time_filter(last_updated): - continue - - attachment_url = self._attachment_to_download_link( - self.confluence_client, attachment - ) - attachment_content = self._attachment_to_content( - self.confluence_client, attachment - ) - if attachment_content is None: - continue - - creator_email = attachment["history"]["createdBy"].get("email") - - comment = attachment["metadata"].get("comment", "") - doc_metadata: dict[str, str | list[str]] = {"comment": comment} - - attachment_labels: list[str] = [] - if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING: - for label in attachment["metadata"]["labels"]["results"]: - attachment_labels.append(label["name"]) - - doc_metadata["labels"] = attachment_labels - - doc_batch.append( - Document( - id=attachment_url, - sections=[Section(link=attachment_url, text=attachment_content)], - source=DocumentSource.CONFLUENCE, - semantic_identifier=attachment["title"], - doc_updated_at=last_updated, - primary_owners=( - [BasicExpertInfo(email=creator_email)] - if creator_email - else None + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + doc_metadata_list: list[SlimDocument] = [] + + restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS) + + page_query = self.cql_page_query + self.cql_label_filter + for page in self.confluence_client.cql_paginate_all_expansions( + cql=page_query, + expand=restrictions_expand, + limit=_SLIM_DOC_BATCH_SIZE, + ): + # If the page has restrictions, add them to the perm_sync_data + # These will be used by doc_sync.py to sync permissions + perm_sync_data = { + "restrictions": page.get("restrictions", {}), + "space_key": page.get("space", {}).get("key"), + } + + doc_metadata_list.append( + SlimDocument( + id=build_confluence_document_id( + self.wiki_base, + page["_links"]["webui"], + self.is_cloud, ), - metadata=doc_metadata, + perm_sync_data=perm_sync_data, ) ) + attachment_cql = f"type=attachment and container='{page['id']}'" + attachment_cql += self.cql_label_filter + for attachment in self.confluence_client.cql_paginate_all_expansions( + cql=attachment_cql, + expand=restrictions_expand, + limit=_SLIM_DOC_BATCH_SIZE, + ): + doc_metadata_list.append( + SlimDocument( + id=build_confluence_document_id( + self.wiki_base, + attachment["_links"]["webui"], + self.is_cloud, + ), + perm_sync_data=perm_sync_data, + ) + ) + if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE: + yield doc_metadata_list[:_SLIM_DOC_BATCH_SIZE] + doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:] - return doc_batch, end_ind - start_ind - - def load_from_state(self) -> GenerateDocumentsOutput: - unused_attachments = [] - - if self.confluence_client is None: - raise ConnectorMissingCredentialError("Confluence") - - start_ind = 0 - while True: - doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch( - start_ind - ) - unused_attachments.extend(unused_attachments_batch) - start_ind += num_pages - if doc_batch: - yield doc_batch - - if num_pages < self.batch_size: - break - - start_ind = 0 - 
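The slim-document path in `retrieve_all_slim_documents` above accumulates `SlimDocument`s and flushes them in `_SLIM_DOC_BATCH_SIZE` chunks, carrying any overflow into the next batch. A stripped-down sketch of that batching pattern, assuming a hypothetical `yield_in_batches` helper and generic string ids instead of `SlimDocument`s:

from collections.abc import Iterable, Iterator

def yield_in_batches(items: Iterable[str], batch_size: int) -> Iterator[list[str]]:
    batch: list[str] = []
    for item in items:
        batch.append(item)
        if len(batch) > batch_size:
            # Flush a full batch and carry the overflow forward, mirroring the
            # slicing done in retrieve_all_slim_documents.
            yield batch[:batch_size]
            batch = batch[batch_size:]
    if batch:
        yield batch

# e.g. 12 ids with batch_size=5 -> chunks of 5, 5, 2
for chunk in yield_in_batches([f"doc-{i}" for i in range(12)], batch_size=5):
    print(len(chunk))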
while True: - attachment_batch, num_attachments = self._get_attachment_batch( - start_ind, unused_attachments - ) - start_ind += num_attachments - if attachment_batch: - yield attachment_batch - - if num_attachments < self.batch_size: - break - - def poll_source( - self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch - ) -> GenerateDocumentsOutput: - unused_attachments = [] - - if self.confluence_client is None: - raise ConnectorMissingCredentialError("Confluence") - - start_time = datetime.fromtimestamp(start, tz=timezone.utc) - end_time = datetime.fromtimestamp(end, tz=timezone.utc) - - start_ind = 0 - while True: - doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch( - start_ind, time_filter=lambda t: start_time <= t <= end_time - ) - unused_attachments.extend(unused_attachments_batch) - - start_ind += num_pages - if doc_batch: - yield doc_batch - - if num_pages < self.batch_size: - break - - start_ind = 0 - while True: - attachment_batch, num_attachments = self._get_attachment_batch( - start_ind, - unused_attachments, - time_filter=lambda t: start_time <= t <= end_time, - ) - start_ind += num_attachments - if attachment_batch: - yield attachment_batch - - if num_attachments < self.batch_size: - break - - -if __name__ == "__main__": - connector = ConfluenceConnector( - wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"], - space=os.environ["CONFLUENCE_TEST_SPACE"], - is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true", - page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""), - index_recursively=True, - ) - connector.load_credentials( - { - "confluence_username": os.environ["CONFLUENCE_USER_NAME"], - "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"], - } - ) - document_batches = connector.load_from_state() - print(next(document_batches)) + yield doc_metadata_list diff --git a/backend/danswer/connectors/confluence/onyx_confluence.py b/backend/danswer/connectors/confluence/onyx_confluence.py new file mode 100644 index 00000000000..e1542109c42 --- /dev/null +++ b/backend/danswer/connectors/confluence/onyx_confluence.py @@ -0,0 +1,338 @@ +import math +import time +from collections.abc import Callable +from collections.abc import Iterator +from typing import Any +from typing import cast +from typing import TypeVar +from urllib.parse import quote + +from atlassian import Confluence # type:ignore +from requests import HTTPError + +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +F = TypeVar("F", bound=Callable[..., Any]) + + +RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower() + +# https://jira.atlassian.com/browse/CONFCLOUD-76433 +_PROBLEMATIC_EXPANSIONS = "body.storage.value" +_REPLACEMENT_EXPANSIONS = "body.view.value" + + +class ConfluenceRateLimitError(Exception): + pass + + +def _handle_http_error(e: HTTPError, attempt: int) -> int: + MIN_DELAY = 2 + MAX_DELAY = 60 + STARTING_DELAY = 5 + BACKOFF = 2 + + # Check if the response or headers are None to avoid potential AttributeError + if e.response is None or e.response.headers is None: + logger.warning("HTTPError with `None` as response or as headers") + raise e + + if ( + e.response.status_code != 429 + and RATE_LIMIT_MESSAGE_LOWERCASE not in e.response.text.lower() + ): + raise e + + retry_after = None + + retry_after_header = e.response.headers.get("Retry-After") + if retry_after_header is not None: + try: + retry_after = int(retry_after_header) + if retry_after > MAX_DELAY: + logger.warning( + f"Clamping retry_after from {retry_after} to 
{MAX_DELAY} seconds..." + ) + retry_after = MAX_DELAY + if retry_after < MIN_DELAY: + retry_after = MIN_DELAY + except ValueError: + pass + + if retry_after is not None: + logger.warning( + f"Rate limiting with retry header. Retrying after {retry_after} seconds..." + ) + delay = retry_after + else: + logger.warning( + "Rate limiting without retry header. Retrying with exponential backoff..." + ) + delay = min(STARTING_DELAY * (BACKOFF**attempt), MAX_DELAY) + + delay_until = math.ceil(time.monotonic() + delay) + return delay_until + + +# https://developer.atlassian.com/cloud/confluence/rate-limiting/ +# this uses the native rate limiting option provided by the +# confluence client and otherwise applies a simpler set of error handling +def handle_confluence_rate_limit(confluence_call: F) -> F: + def wrapped_call(*args: list[Any], **kwargs: Any) -> Any: + MAX_RETRIES = 5 + + TIMEOUT = 600 + timeout_at = time.monotonic() + TIMEOUT + + for attempt in range(MAX_RETRIES): + if time.monotonic() > timeout_at: + raise TimeoutError( + f"Confluence call attempts took longer than {TIMEOUT} seconds." + ) + + try: + # we're relying more on the client to rate limit itself + # and applying our own retries in a more specific set of circumstances + return confluence_call(*args, **kwargs) + except HTTPError as e: + delay_until = _handle_http_error(e, attempt) + logger.warning( + f"HTTPError in confluence call. " + f"Retrying in {delay_until} seconds..." + ) + while time.monotonic() < delay_until: + # in the future, check a signal here to exit + time.sleep(1) + except AttributeError as e: + # Some error within the Confluence library, unclear why it fails. + # Users reported it to be intermittent, so just retry + if attempt == MAX_RETRIES - 1: + raise e + + logger.exception( + "Confluence Client raised an AttributeError. Retrying..." + ) + time.sleep(5) + + return cast(F, wrapped_call) + + +_DEFAULT_PAGINATION_LIMIT = 1000 + + +class OnyxConfluence(Confluence): + """ + This is a custom Confluence class that overrides the default Confluence class to add a custom CQL method. + This is necessary because the default Confluence class does not properly support cql expansions. + All methods are automatically wrapped with handle_confluence_rate_limit. + """ + + def __init__(self, url: str, *args: Any, **kwargs: Any) -> None: + super(OnyxConfluence, self).__init__(url, *args, **kwargs) + self._wrap_methods() + + def _wrap_methods(self) -> None: + """ + For each attribute that is callable (i.e., a method) and doesn't start with an underscore, + wrap it with handle_confluence_rate_limit. + """ + for attr_name in dir(self): + if callable(getattr(self, attr_name)) and not attr_name.startswith("_"): + setattr( + self, + attr_name, + handle_confluence_rate_limit(getattr(self, attr_name)), + ) + + def _paginate_url( + self, url_suffix: str, limit: int | None = None + ) -> Iterator[dict[str, Any]]: + """ + This will paginate through the top level query. + """ + if not limit: + limit = _DEFAULT_PAGINATION_LIMIT + + connection_char = "&" if "?" in url_suffix else "?" 
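# Editor's note: a minimal, self-contained sketch of the delay policy used by
# _handle_http_error above, shown here for clarity. It assumes the same
# constants (MIN_DELAY=2, MAX_DELAY=60, STARTING_DELAY=5, BACKOFF=2); the
# function name and the header values below are made up for illustration and
# are not part of this PR.
def _example_delay(attempt: int, retry_after_header: str | None) -> int:
    MIN_DELAY, MAX_DELAY, STARTING_DELAY, BACKOFF = 2, 60, 5, 2
    if retry_after_header is not None:
        try:
            # Honor Retry-After, clamped into [MIN_DELAY, MAX_DELAY].
            return min(max(int(retry_after_header), MIN_DELAY), MAX_DELAY)
        except ValueError:
            pass
    # Otherwise fall back to capped exponential backoff: 5, 10, 20, 40, 60, ...
    return min(STARTING_DELAY * (BACKOFF**attempt), MAX_DELAY)

# _example_delay(0, None) == 5, _example_delay(3, None) == 40, _example_delay(0, "300") == 60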
+ url_suffix += f"{connection_char}limit={limit}" + + while url_suffix: + try: + logger.debug(f"Making confluence call to {url_suffix}") + next_response = self.get(url_suffix) + except Exception as e: + logger.warning(f"Error in confluence call to {url_suffix}") + + # If the problematic expansion is in the url, replace it + # with the replacement expansion and try again + # If that fails, raise the error + if _PROBLEMATIC_EXPANSIONS not in url_suffix: + logger.exception(f"Error in confluence call to {url_suffix}") + raise e + logger.warning( + f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}" + " and trying again." + ) + url_suffix = url_suffix.replace( + _PROBLEMATIC_EXPANSIONS, + _REPLACEMENT_EXPANSIONS, + ) + continue + + # yield the results individually + yield from next_response.get("results", []) + + url_suffix = next_response.get("_links", {}).get("next") + + def paginated_cql_retrieval( + self, + cql: str, + expand: str | None = None, + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + The content/search endpoint can be used to fetch pages, attachments, and comments. + """ + expand_string = f"&expand={expand}" if expand else "" + yield from self._paginate_url( + f"rest/api/content/search?cql={cql}{expand_string}", limit + ) + + def cql_paginate_all_expansions( + self, + cql: str, + expand: str | None = None, + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + This function will paginate through the top level query first, then + paginate through all of the expansions. + The limit only applies to the top level query. + All expansion paginations use default pagination limit (defined by Atlassian). + """ + + def _traverse_and_update(data: dict | list) -> None: + if isinstance(data, dict): + next_url = data.get("_links", {}).get("next") + if next_url and "results" in data: + data["results"].extend(self._paginate_url(next_url)) + + for value in data.values(): + _traverse_and_update(value) + elif isinstance(data, list): + for item in data: + _traverse_and_update(item) + + for confluence_object in self.paginated_cql_retrieval(cql, expand, limit): + _traverse_and_update(confluence_object) + yield confluence_object + + def paginated_cql_user_retrieval( + self, + expand: str | None = None, + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + The search/user endpoint can be used to fetch users. + It's a seperate endpoint from the content/search endpoint used only for users. + Otherwise it's very similar to the content/search endpoint. + """ + cql = "type=user" + url = "rest/api/search/user" if self.cloud else "rest/api/search" + expand_string = f"&expand={expand}" if expand else "" + url += f"?cql={cql}{expand_string}" + yield from self._paginate_url(url, limit) + + def paginated_groups_by_user_retrieval( + self, + user: dict[str, Any], + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + This is not an SQL like query. + It's a confluence specific endpoint that can be used to fetch groups. + """ + user_field = "accountId" if self.cloud else "key" + user_value = user["accountId"] if self.cloud else user["userKey"] + # Server uses userKey (but calls it key during the API call), Cloud uses accountId + user_query = f"{user_field}={quote(user_value)}" + + url = f"rest/api/user/memberof?{user_query}" + yield from self._paginate_url(url, limit) + + def paginated_groups_retrieval( + self, + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + This is not an SQL like query. 
+ It's a confluence specific endpoint that can be used to fetch groups. + """ + yield from self._paginate_url("rest/api/group", limit) + + def paginated_group_members_retrieval( + self, + group_name: str, + limit: int | None = None, + ) -> Iterator[dict[str, Any]]: + """ + This is not an SQL like query. + It's a confluence specific endpoint that can be used to fetch the members of a group. + THIS DOESN'T WORK FOR SERVER because it breaks when there is a slash in the group name. + E.g. neither "test/group" nor "test%2Fgroup" works for confluence. + """ + group_name = quote(group_name) + yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit) + + +def _validate_connector_configuration( + credentials: dict[str, Any], + is_cloud: bool, + wiki_base: str, +) -> None: + # test connection with direct client, no retries + confluence_client_with_minimal_retries = Confluence( + api_version="cloud" if is_cloud else "latest", + url=wiki_base.rstrip("/"), + username=credentials["confluence_username"] if is_cloud else None, + password=credentials["confluence_access_token"] if is_cloud else None, + token=credentials["confluence_access_token"] if not is_cloud else None, + backoff_and_retry=True, + max_backoff_retries=6, + max_backoff_seconds=10, + ) + spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1) + + if not spaces: + raise RuntimeError( + f"No spaces found at {wiki_base}! " + "Check your credentials and wiki_base and make sure " + "is_cloud is set correctly." + ) + + +def build_confluence_client( + credentials: dict[str, Any], + is_cloud: bool, + wiki_base: str, +) -> OnyxConfluence: + _validate_connector_configuration( + credentials=credentials, + is_cloud=is_cloud, + wiki_base=wiki_base, + ) + return OnyxConfluence( + api_version="cloud" if is_cloud else "latest", + # Remove trailing slash from wiki_base if present + url=wiki_base.rstrip("/"), + # passing in username causes issues for Confluence data center + username=credentials["confluence_username"] if is_cloud else None, + password=credentials["confluence_access_token"] if is_cloud else None, + token=credentials["confluence_access_token"] if not is_cloud else None, + backoff_and_retry=True, + max_backoff_retries=10, + max_backoff_seconds=60, + ) diff --git a/backend/danswer/connectors/confluence/rate_limit_handler.py b/backend/danswer/connectors/confluence/rate_limit_handler.py deleted file mode 100644 index 822badb9b99..00000000000 --- a/backend/danswer/connectors/confluence/rate_limit_handler.py +++ /dev/null @@ -1,76 +0,0 @@ -import time -from collections.abc import Callable -from typing import Any -from typing import cast -from typing import TypeVar - -from requests import HTTPError - -from danswer.utils.logger import setup_logger - -logger = setup_logger() - - -F = TypeVar("F", bound=Callable[..., Any]) - - -RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower() - - -class ConfluenceRateLimitError(Exception): - pass - - -def make_confluence_call_handle_rate_limit(confluence_call: F) -> F: - def wrapped_call(*args: list[Any], **kwargs: Any) -> Any: - max_retries = 5 - starting_delay = 5 - backoff = 2 - max_delay = 600 - - for attempt in range(max_retries): - try: - return confluence_call(*args, **kwargs) - except HTTPError as e: - # Check if the response or headers are None to avoid potential AttributeError - if e.response is None or e.response.headers is None: - logger.warning("HTTPError with `None` as response or as headers") - raise e - - retry_after_header = 
e.response.headers.get("Retry-After") - if ( - e.response.status_code == 429 - or RATE_LIMIT_MESSAGE_LOWERCASE in e.response.text.lower() - ): - retry_after = None - if retry_after_header is not None: - try: - retry_after = int(retry_after_header) - except ValueError: - pass - - if retry_after is not None: - logger.warning( - f"Rate limit hit. Retrying after {retry_after} seconds..." - ) - time.sleep(retry_after) - else: - logger.warning( - "Rate limit hit. Retrying with exponential backoff..." - ) - delay = min(starting_delay * (backoff**attempt), max_delay) - time.sleep(delay) - else: - # re-raise, let caller handle - raise - except AttributeError as e: - # Some error within the Confluence library, unclear why it fails. - # Users reported it to be intermittent, so just retry - logger.warning(f"Confluence Internal Error, retrying... {e}") - delay = min(starting_delay * (backoff**attempt), max_delay) - time.sleep(delay) - - if attempt == max_retries - 1: - raise e - - return cast(F, wrapped_call) diff --git a/backend/danswer/connectors/confluence/utils.py b/backend/danswer/connectors/confluence/utils.py new file mode 100644 index 00000000000..e6ac0308a3a --- /dev/null +++ b/backend/danswer/connectors/confluence/utils.py @@ -0,0 +1,271 @@ +import io +from datetime import datetime +from datetime import timezone +from typing import Any +from urllib.parse import quote + +import bs4 + +from danswer.configs.app_configs import ( + CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD, +) +from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD +from danswer.connectors.confluence.onyx_confluence import ( + OnyxConfluence, +) +from danswer.file_processing.extract_file_text import extract_file_text +from danswer.file_processing.html_utils import format_document_soup +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +_USER_EMAIL_CACHE: dict[str, str | None] = {} + + +def get_user_email_from_username__server( + confluence_client: OnyxConfluence, user_name: str +) -> str | None: + global _USER_EMAIL_CACHE + if _USER_EMAIL_CACHE.get(user_name) is None: + try: + response = confluence_client.get_mobile_parameters(user_name) + email = response.get("email") + except Exception: + email = None + _USER_EMAIL_CACHE[user_name] = email + return _USER_EMAIL_CACHE[user_name] + + +_USER_NOT_FOUND = "Unknown Confluence User" +_USER_ID_TO_DISPLAY_NAME_CACHE: dict[str, str | None] = {} + + +def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str: + """Get Confluence Display Name based on the account-id or userkey value + + Args: + user_id (str): The user id (i.e: the account-id or userkey) + confluence_client (Confluence): The Confluence Client + + Returns: + str: The User Display Name. 
'Unknown User' if the user is deactivated or not found + """ + global _USER_ID_TO_DISPLAY_NAME_CACHE + if _USER_ID_TO_DISPLAY_NAME_CACHE.get(user_id) is None: + try: + result = confluence_client.get_user_details_by_userkey(user_id) + found_display_name = result.get("displayName") + except Exception: + found_display_name = None + + if not found_display_name: + try: + result = confluence_client.get_user_details_by_accountid(user_id) + found_display_name = result.get("displayName") + except Exception: + found_display_name = None + + _USER_ID_TO_DISPLAY_NAME_CACHE[user_id] = found_display_name + + return _USER_ID_TO_DISPLAY_NAME_CACHE.get(user_id) or _USER_NOT_FOUND + + +def extract_text_from_confluence_html( + confluence_client: OnyxConfluence, + confluence_object: dict[str, Any], + fetched_titles: set[str], +) -> str: + """Parse a Confluence html page and replace the 'user Id' by the real + User Display Name + + Args: + confluence_object (dict): The confluence object as a dict + confluence_client (Confluence): Confluence client + fetched_titles (set[str]): The titles of the pages that have already been fetched + Returns: + str: loaded and formated Confluence page + """ + body = confluence_object["body"] + object_html = body.get("storage", body.get("view", {})).get("value") + + soup = bs4.BeautifulSoup(object_html, "html.parser") + for user in soup.findAll("ri:user"): + user_id = ( + user.attrs["ri:account-id"] + if "ri:account-id" in user.attrs + else user.get("ri:userkey") + ) + if not user_id: + logger.warning( + "ri:userkey not found in ri:user element. " f"Found attrs: {user.attrs}" + ) + continue + # Include @ sign for tagging, more clear for LLM + user.replaceWith("@" + _get_user(confluence_client, user_id)) + + for html_page_reference in soup.findAll("ac:structured-macro"): + # Here, we only want to process page within page macros + if html_page_reference.attrs.get("ac:name") != "include": + continue + + page_data = html_page_reference.find("ri:page") + if not page_data: + logger.warning( + f"Skipping retrieval of {html_page_reference} because because page data is missing" + ) + continue + + page_title = page_data.attrs.get("ri:content-title") + if not page_title: + # only fetch pages that have a title + logger.warning( + f"Skipping retrieval of {html_page_reference} because it has no title" + ) + continue + + if page_title in fetched_titles: + # prevent recursive fetching of pages + logger.debug(f"Skipping {page_title} because it has already been fetched") + continue + + fetched_titles.add(page_title) + + # Wrap this in a try-except because there are some pages that might not exist + try: + page_query = f"type=page and title='{quote(page_title)}'" + + page_contents: dict[str, Any] | None = None + # Confluence enforces title uniqueness, so we should only get one result here + for page in confluence_client.paginated_cql_retrieval( + cql=page_query, + expand="body.storage.value", + limit=1, + ): + page_contents = page + break + except Exception as e: + logger.warning( + f"Error getting page contents for object {confluence_object}: {e}" + ) + continue + + if not page_contents: + continue + + text_from_page = extract_text_from_confluence_html( + confluence_client=confluence_client, + confluence_object=page_contents, + fetched_titles=fetched_titles, + ) + + html_page_reference.replaceWith(text_from_page) + + for html_link_body in soup.findAll("ac:link-body"): + # This extracts the text from inline links in the page so they can be + # represented in the document text as plain text + try: + 
text_from_link = html_link_body.text + html_link_body.replaceWith(f"(LINK TEXT: {text_from_link})") + except Exception as e: + logger.warning(f"Error processing ac:link-body: {e}") + + return format_document_soup(soup) + + +def attachment_to_content( + confluence_client: OnyxConfluence, + attachment: dict[str, Any], +) -> str | None: + """If it returns None, assume that we should skip this attachment.""" + if attachment["metadata"]["mediaType"] in [ + "image/jpeg", + "image/png", + "image/gif", + "image/svg+xml", + "video/mp4", + "video/quicktime", + ]: + return None + + download_link = confluence_client.url + attachment["_links"]["download"] + + attachment_size = attachment["extensions"]["fileSize"] + if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD: + logger.warning( + f"Skipping {download_link} due to size. " + f"size={attachment_size} " + f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}" + ) + return None + + logger.info(f"_attachment_to_content - _session.get: link={download_link}") + response = confluence_client._session.get(download_link) + if response.status_code != 200: + logger.warning( + f"Failed to fetch {download_link} with invalid status code {response.status_code}" + ) + return None + + extracted_text = extract_file_text( + io.BytesIO(response.content), + file_name=attachment["title"], + break_on_unprocessable=False, + ) + if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD: + logger.warning( + f"Skipping {download_link} due to char count. " + f"char count={len(extracted_text)} " + f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD}" + ) + return None + + return extracted_text + + +def build_confluence_document_id( + base_url: str, content_url: str, is_cloud: bool +) -> str: + """For confluence, the document id is the page url for a page based document + or the attachment download url for an attachment based document + + Args: + base_url (str): The base url of the Confluence instance + content_url (str): The url of the page or attachment download url + + Returns: + str: The document id + """ + if is_cloud and not base_url.endswith("/wiki"): + base_url += "/wiki" + return f"{base_url}{content_url}" + + +def extract_referenced_attachment_names(page_text: str) -> list[str]: + """Parse a Confluence html page to generate a list of current + attachments in use + + Args: + text (str): The page content + + Returns: + list[str]: List of filenames currently in use by the page text + """ + referenced_attachment_filenames = [] + soup = bs4.BeautifulSoup(page_text, "html.parser") + for attachment in soup.findAll("ri:attachment"): + referenced_attachment_filenames.append(attachment.attrs["ri:filename"]) + return referenced_attachment_filenames + + +def datetime_from_string(datetime_string: str) -> datetime: + datetime_object = datetime.fromisoformat(datetime_string) + + if datetime_object.tzinfo is None: + # If no timezone info, assume it is UTC + datetime_object = datetime_object.replace(tzinfo=timezone.utc) + else: + # If not in UTC, translate it + datetime_object = datetime_object.astimezone(timezone.utc) + + return datetime_object diff --git a/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py index 897503dca99..8e8ea8d7d65 100644 --- a/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py +++ b/backend/danswer/connectors/cross_connector_utils/miscellaneous_utils.py @@ -11,6 +11,10 @@ from 
danswer.utils.text_processing import is_valid_email +T = TypeVar("T") +U = TypeVar("U") + + def datetime_to_utc(dt: datetime) -> datetime: if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None: dt = dt.replace(tzinfo=timezone.utc) @@ -19,7 +23,16 @@ def datetime_to_utc(dt: datetime) -> datetime: def time_str_to_utc(datetime_str: str) -> datetime: - dt = parse(datetime_str) + try: + dt = parse(datetime_str) + except ValueError: + # Handle malformed timezone by attempting to fix common format issues + if "0000" in datetime_str: + # Convert "0000" to "+0000" for proper timezone parsing + fixed_dt_str = datetime_str.replace(" 0000", " +0000") + dt = parse(fixed_dt_str) + else: + raise return datetime_to_utc(dt) @@ -49,10 +62,6 @@ def get_experts_stores_representations( return [owner for owner in reps if owner is not None] -T = TypeVar("T") -U = TypeVar("U") - - def process_in_batches( objects: list[T], process_function: Callable[[T], U], batch_size: int ) -> Iterator[list[U]]: diff --git a/backend/danswer/connectors/danswer_jira/connector.py b/backend/danswer/connectors/danswer_jira/connector.py index e3562f3a45c..1d20c608f89 100644 --- a/backend/danswer/connectors/danswer_jira/connector.py +++ b/backend/danswer/connectors/danswer_jira/connector.py @@ -1,198 +1,158 @@ import os +from collections.abc import Iterable from datetime import datetime from datetime import timezone from typing import Any -from urllib.parse import urlparse from jira import JIRA from jira.resources import Issue from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP +from danswer.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc +from danswer.connectors.danswer_jira.utils import best_effort_basic_expert_info +from danswer.connectors.danswer_jira.utils import best_effort_get_field_from_issue +from danswer.connectors.danswer_jira.utils import build_jira_client +from danswer.connectors.danswer_jira.utils import build_jira_url +from danswer.connectors.danswer_jira.utils import extract_jira_project +from danswer.connectors.danswer_jira.utils import extract_text_from_adf +from danswer.connectors.danswer_jira.utils import get_comment_strs from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch -from danswer.connectors.models import BasicExpertInfo +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from danswer.utils.logger import setup_logger logger = setup_logger() -PROJECT_URL_PAT = "projects" -JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2" - - -def extract_jira_project(url: str) -> tuple[str, str]: - parsed_url = urlparse(url) - jira_base = parsed_url.scheme + "://" + parsed_url.netloc - - # Split the path by '/' and find the position of 'projects' to get the project name - split_path = parsed_url.path.split("/") - if PROJECT_URL_PAT in split_path: - project_pos = split_path.index(PROJECT_URL_PAT) - if len(split_path) > 
project_pos + 1: - jira_project = split_path[project_pos + 1] - else: - raise ValueError("No project name found in the URL") - else: - raise ValueError("'projects' not found in the URL") - - return jira_base, jira_project - - -def extract_text_from_adf(adf: dict | None) -> str: - """Extracts plain text from Atlassian Document Format: - https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/ - - WARNING: This function is incomplete and will e.g. skip lists! - """ - texts = [] - if adf is not None and "content" in adf: - for block in adf["content"]: - if "content" in block: - for item in block["content"]: - if item["type"] == "text": - texts.append(item["text"]) - return " ".join(texts) - -def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any: - if hasattr(jira_issue.fields, field): - return getattr(jira_issue.fields, field) - - try: - return jira_issue.raw["fields"][field] - except Exception: - return None +JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2" +_JIRA_SLIM_PAGE_SIZE = 500 +_JIRA_FULL_PAGE_SIZE = 50 -def _get_comment_strs( - jira: Issue, comment_email_blacklist: tuple[str, ...] = () -) -> list[str]: - comment_strs = [] - for comment in jira.fields.comment.comments: - try: - body_text = ( - comment.body - if JIRA_API_VERSION == "2" - else extract_text_from_adf(comment.raw["body"]) - ) +def _paginate_jql_search( + jira_client: JIRA, + jql: str, + max_results: int, + fields: str | None = None, +) -> Iterable[Issue]: + start = 0 + while True: + logger.debug( + f"Fetching Jira issues with JQL: {jql}, " + f"starting at {start}, max results: {max_results}" + ) + issues = jira_client.search_issues( + jql_str=jql, + startAt=start, + maxResults=max_results, + fields=fields, + ) - if ( - hasattr(comment, "author") - and hasattr(comment.author, "emailAddress") - and comment.author.emailAddress in comment_email_blacklist - ): - continue # Skip adding comment if author's email is in blacklist + for issue in issues: + if isinstance(issue, Issue): + yield issue + else: + raise Exception(f"Found Jira object not of type Issue: {issue}") - comment_strs.append(body_text) - except Exception as e: - logger.error(f"Failed to process comment due to an error: {e}") - continue + if len(issues) < max_results: + break - return comment_strs + start += max_results def fetch_jira_issues_batch( - jql: str, - start_index: int, jira_client: JIRA, - batch_size: int = INDEX_BATCH_SIZE, + jql: str, + batch_size: int, comment_email_blacklist: tuple[str, ...] = (), labels_to_skip: set[str] | None = None, -) -> tuple[list[Document], int]: - doc_batch = [] - - batch = jira_client.search_issues( - jql, - startAt=start_index, - maxResults=batch_size, - ) - - for jira in batch: - if type(jira) != Issue: - logger.warning(f"Found Jira object not of type Issue {jira}") - continue - - if labels_to_skip and any( - label in jira.fields.labels for label in labels_to_skip - ): - logger.info( - f"Skipping {jira.key} because it has a label to skip. Found " - f"labels: {jira.fields.labels}. Labels to skip: {labels_to_skip}." - ) - continue +) -> Iterable[Document]: + for issue in _paginate_jql_search( + jira_client=jira_client, + jql=jql, + max_results=batch_size, + ): + if labels_to_skip: + if any(label in issue.fields.labels for label in labels_to_skip): + logger.info( + f"Skipping {issue.key} because it has a label to skip. Found " + f"labels: {issue.fields.labels}. Labels to skip: {labels_to_skip}." 
+ ) + continue description = ( - jira.fields.description + issue.fields.description if JIRA_API_VERSION == "2" - else extract_text_from_adf(jira.raw["fields"]["description"]) + else extract_text_from_adf(issue.raw["fields"]["description"]) + ) + comments = get_comment_strs( + issue=issue, + comment_email_blacklist=comment_email_blacklist, ) - comments = _get_comment_strs(jira, comment_email_blacklist) - semantic_rep = f"{description}\n" + "\n".join( + ticket_content = f"{description}\n" + "\n".join( [f"Comment: {comment}" for comment in comments if comment] ) - page_url = f"{jira_client.client_info()}/browse/{jira.key}" + # Check ticket size + if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE: + logger.info( + f"Skipping {issue.key} because it exceeds the maximum size of " + f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes." + ) + continue + + page_url = f"{jira_client.client_info()}/browse/{issue.key}" people = set() try: - people.add( - BasicExpertInfo( - display_name=jira.fields.creator.displayName, - email=jira.fields.creator.emailAddress, - ) - ) + creator = best_effort_get_field_from_issue(issue, "creator") + if basic_expert_info := best_effort_basic_expert_info(creator): + people.add(basic_expert_info) except Exception: # Author should exist but if not, doesn't matter pass try: - people.add( - BasicExpertInfo( - display_name=jira.fields.assignee.displayName, # type: ignore - email=jira.fields.assignee.emailAddress, # type: ignore - ) - ) + assignee = best_effort_get_field_from_issue(issue, "assignee") + if basic_expert_info := best_effort_basic_expert_info(assignee): + people.add(basic_expert_info) except Exception: # Author should exist but if not, doesn't matter pass metadata_dict = {} - priority = best_effort_get_field_from_issue(jira, "priority") - if priority: + if priority := best_effort_get_field_from_issue(issue, "priority"): metadata_dict["priority"] = priority.name - status = best_effort_get_field_from_issue(jira, "status") - if status: + if status := best_effort_get_field_from_issue(issue, "status"): metadata_dict["status"] = status.name - resolution = best_effort_get_field_from_issue(jira, "resolution") - if resolution: + if resolution := best_effort_get_field_from_issue(issue, "resolution"): metadata_dict["resolution"] = resolution.name - labels = best_effort_get_field_from_issue(jira, "labels") - if labels: + if labels := best_effort_get_field_from_issue(issue, "labels"): metadata_dict["label"] = labels - doc_batch.append( - Document( - id=page_url, - sections=[Section(link=page_url, text=semantic_rep)], - source=DocumentSource.JIRA, - semantic_identifier=jira.fields.summary, - doc_updated_at=time_str_to_utc(jira.fields.updated), - primary_owners=list(people) or None, - # TODO add secondary_owners (commenters) if needed - metadata=metadata_dict, - ) + yield Document( + id=page_url, + sections=[Section(link=page_url, text=ticket_content)], + source=DocumentSource.JIRA, + semantic_identifier=issue.fields.summary, + doc_updated_at=time_str_to_utc(issue.fields.updated), + primary_owners=list(people) or None, + # TODO add secondary_owners (commenters) if needed + metadata=metadata_dict, ) - return doc_batch, len(batch) -class JiraConnector(LoadConnector, PollConnector): +class JiraConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, jira_project_url: str, @@ -204,8 +164,8 @@ def __init__( labels_to_skip: list[str] = JIRA_CONNECTOR_LABELS_TO_SKIP, ) -> None: self.batch_size = batch_size - self.jira_base, self.jira_project = 
extract_jira_project(jira_project_url) - self.jira_client: JIRA | None = None + self.jira_base, self._jira_project = extract_jira_project(jira_project_url) + self._jira_client: JIRA | None = None self._comment_email_blacklist = comment_email_blacklist or [] self.labels_to_skip = set(labels_to_skip) @@ -214,52 +174,45 @@ def __init__( def comment_email_blacklist(self) -> tuple: return tuple(email.strip() for email in self._comment_email_blacklist) + @property + def jira_client(self) -> JIRA: + if self._jira_client is None: + raise ConnectorMissingCredentialError("Jira") + return self._jira_client + + @property + def quoted_jira_project(self) -> str: + # Quote the project name to handle reserved words + return f'"{self._jira_project}"' + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: - api_token = credentials["jira_api_token"] - # if user provide an email we assume it's cloud - if "jira_user_email" in credentials: - email = credentials["jira_user_email"] - self.jira_client = JIRA( - basic_auth=(email, api_token), - server=self.jira_base, - options={"rest_api_version": JIRA_API_VERSION}, - ) - else: - self.jira_client = JIRA( - token_auth=api_token, - server=self.jira_base, - options={"rest_api_version": JIRA_API_VERSION}, - ) + self._jira_client = build_jira_client( + credentials=credentials, + jira_base=self.jira_base, + ) return None def load_from_state(self) -> GenerateDocumentsOutput: - if self.jira_client is None: - raise ConnectorMissingCredentialError("Jira") - - start_ind = 0 - while True: - doc_batch, fetched_batch_size = fetch_jira_issues_batch( - jql=f"project = {self.jira_project}", - start_index=start_ind, - jira_client=self.jira_client, - batch_size=self.batch_size, - comment_email_blacklist=self.comment_email_blacklist, - labels_to_skip=self.labels_to_skip, - ) - - if doc_batch: - yield doc_batch + jql = f"project = {self.quoted_jira_project}" + + document_batch = [] + for doc in fetch_jira_issues_batch( + jira_client=self.jira_client, + jql=jql, + batch_size=_JIRA_FULL_PAGE_SIZE, + comment_email_blacklist=self.comment_email_blacklist, + labels_to_skip=self.labels_to_skip, + ): + document_batch.append(doc) + if len(document_batch) >= self.batch_size: + yield document_batch + document_batch = [] - start_ind += fetched_batch_size - if fetched_batch_size < self.batch_size: - break + yield document_batch def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - if self.jira_client is None: - raise ConnectorMissingCredentialError("Jira") - start_date_str = datetime.fromtimestamp(start, tz=timezone.utc).strftime( "%Y-%m-%d %H:%M" ) @@ -268,28 +221,53 @@ def poll_source( ) jql = ( - f"project = {self.jira_project} AND " + f"project = {self.quoted_jira_project} AND " f"updated >= '{start_date_str}' AND " f"updated <= '{end_date_str}'" ) - start_ind = 0 - while True: - doc_batch, fetched_batch_size = fetch_jira_issues_batch( - jql=jql, - start_index=start_ind, - jira_client=self.jira_client, - batch_size=self.batch_size, - comment_email_blacklist=self.comment_email_blacklist, - labels_to_skip=self.labels_to_skip, - ) + document_batch = [] + for doc in fetch_jira_issues_batch( + jira_client=self.jira_client, + jql=jql, + batch_size=_JIRA_FULL_PAGE_SIZE, + comment_email_blacklist=self.comment_email_blacklist, + labels_to_skip=self.labels_to_skip, + ): + document_batch.append(doc) + if len(document_batch) >= self.batch_size: + yield document_batch + document_batch = [] - if doc_batch: - yield 
doc_batch + yield document_batch + + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + jql = f"project = {self.quoted_jira_project}" + + slim_doc_batch = [] + for issue in _paginate_jql_search( + jira_client=self.jira_client, + jql=jql, + max_results=_JIRA_SLIM_PAGE_SIZE, + fields="key", + ): + issue_key = best_effort_get_field_from_issue(issue, "key") + id = build_jira_url(self.jira_client, issue_key) + slim_doc_batch.append( + SlimDocument( + id=id, + perm_sync_data=None, + ) + ) + if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE: + yield slim_doc_batch + slim_doc_batch = [] - start_ind += fetched_batch_size - if fetched_batch_size < self.batch_size: - break + yield slim_doc_batch if __name__ == "__main__": diff --git a/backend/danswer/connectors/danswer_jira/utils.py b/backend/danswer/connectors/danswer_jira/utils.py index 506f5eff75e..7fc3642b3a5 100644 --- a/backend/danswer/connectors/danswer_jira/utils.py +++ b/backend/danswer/connectors/danswer_jira/utils.py @@ -1,17 +1,136 @@ """Module with custom fields processing functions""" +import os from typing import Any from typing import List +from urllib.parse import urlparse from jira import JIRA from jira.resources import CustomFieldOption from jira.resources import Issue from jira.resources import User +from danswer.connectors.models import BasicExpertInfo from danswer.utils.logger import setup_logger logger = setup_logger() +PROJECT_URL_PAT = "projects" +JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2" + + +def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None: + display_name = None + email = None + if hasattr(obj, "display_name"): + display_name = obj.display_name + else: + display_name = obj.get("displayName") + + if hasattr(obj, "emailAddress"): + email = obj.emailAddress + else: + email = obj.get("emailAddress") + + if not email and not display_name: + return None + + return BasicExpertInfo(display_name=display_name, email=email) + + +def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any: + if hasattr(jira_issue.fields, field): + return getattr(jira_issue.fields, field) + + try: + return jira_issue.raw["fields"][field] + except Exception: + return None + + +def extract_text_from_adf(adf: dict | None) -> str: + """Extracts plain text from Atlassian Document Format: + https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/ + + WARNING: This function is incomplete and will e.g. skip lists! 
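As a standalone illustration of the batch-and-yield pattern that the reworked load_from_state and poll_source rely on: this is a minimal sketch, not code from the change, and generate_batches and the integer stream below are invented for the example.

from collections.abc import Iterable, Iterator
from typing import TypeVar

T = TypeVar("T")


def generate_batches(items: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # Accumulate items from a (possibly lazy) source into fixed-size batches,
    # yielding each full batch as soon as it is complete.
    batch: list[T] = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    # Yield whatever is left over at the end of the stream.
    if batch:
        yield batch


# Batches of 3 drawn from a stream of 7 items: sizes 3, 3, 1
for batch in generate_batches(range(7), batch_size=3):
    print(len(batch))

The connector itself yields the trailing batch even when it is empty; skipping an empty remainder, as above, is a minor variation.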
+ """ + texts = [] + if adf is not None and "content" in adf: + for block in adf["content"]: + if "content" in block: + for item in block["content"]: + if item["type"] == "text": + texts.append(item["text"]) + return " ".join(texts) + + +def build_jira_url(jira_client: JIRA, issue_key: str) -> str: + return f"{jira_client.client_info()}/browse/{issue_key}" + + +def build_jira_client(credentials: dict[str, Any], jira_base: str) -> JIRA: + api_token = credentials["jira_api_token"] + # if user provide an email we assume it's cloud + if "jira_user_email" in credentials: + email = credentials["jira_user_email"] + return JIRA( + basic_auth=(email, api_token), + server=jira_base, + options={"rest_api_version": JIRA_API_VERSION}, + ) + else: + return JIRA( + token_auth=api_token, + server=jira_base, + options={"rest_api_version": JIRA_API_VERSION}, + ) + + +def extract_jira_project(url: str) -> tuple[str, str]: + parsed_url = urlparse(url) + jira_base = parsed_url.scheme + "://" + parsed_url.netloc + + # Split the path by '/' and find the position of 'projects' to get the project name + split_path = parsed_url.path.split("/") + if PROJECT_URL_PAT in split_path: + project_pos = split_path.index(PROJECT_URL_PAT) + if len(split_path) > project_pos + 1: + jira_project = split_path[project_pos + 1] + else: + raise ValueError("No project name found in the URL") + else: + raise ValueError("'projects' not found in the URL") + + return jira_base, jira_project + + +def get_comment_strs( + issue: Issue, comment_email_blacklist: tuple[str, ...] = () +) -> list[str]: + comment_strs = [] + for comment in issue.fields.comment.comments: + try: + body_text = ( + comment.body + if JIRA_API_VERSION == "2" + else extract_text_from_adf(comment.raw["body"]) + ) + + if ( + hasattr(comment, "author") + and hasattr(comment.author, "emailAddress") + and comment.author.emailAddress in comment_email_blacklist + ): + continue # Skip adding comment if author's email is in blacklist + + comment_strs.append(body_text) + except Exception as e: + logger.error(f"Failed to process comment due to an error: {e}") + continue + + return comment_strs + + class CustomFieldExtractor: @staticmethod def _process_custom_field_value(value: Any) -> str: diff --git a/backend/danswer/connectors/discourse/connector.py b/backend/danswer/connectors/discourse/connector.py index d74aad0f276..d1b6395a189 100644 --- a/backend/danswer/connectors/discourse/connector.py +++ b/backend/danswer/connectors/discourse/connector.py @@ -14,7 +14,6 @@ from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) -from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch @@ -24,6 +23,7 @@ from danswer.connectors.models import Section from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder logger = setup_logger() diff --git a/backend/danswer/connectors/document360/connector.py b/backend/danswer/connectors/document360/connector.py index 6a9f4ba6a56..7ccf3c92e62 100644 --- a/backend/danswer/connectors/document360/connector.py +++ b/backend/danswer/connectors/document360/connector.py @@ -11,7 +11,6 @@ from danswer.connectors.cross_connector_utils.rate_limit_wrapper import ( rate_limit_builder, ) -from 
danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.document360.utils import flatten_child_categories from danswer.connectors.interfaces import GenerateDocumentsOutput from danswer.connectors.interfaces import LoadConnector @@ -22,6 +21,7 @@ from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.utils.retry_wrapper import retry_builder # Limitations and Potential Improvements # 1. The "Categories themselves contain potentially relevant information" but they're not pulled in diff --git a/backend/danswer/connectors/dropbox/connector.py b/backend/danswer/connectors/dropbox/connector.py index b36f0fbd122..7d2eb0166c7 100644 --- a/backend/danswer/connectors/dropbox/connector.py +++ b/backend/danswer/connectors/dropbox/connector.py @@ -97,8 +97,8 @@ def _yield_files_recursive( link = self._get_shared_link(entry.path_display) try: text = extract_file_text( - entry.name, BytesIO(downloaded_file), + file_name=entry.name, break_on_unprocessable=False, ) batch.append( diff --git a/backend/danswer/connectors/factory.py b/backend/danswer/connectors/factory.py index 8edfbc6a5d4..85b99953a50 100644 --- a/backend/danswer/connectors/factory.py +++ b/backend/danswer/connectors/factory.py @@ -1,8 +1,11 @@ -from typing import Any -from typing import Type +from typing import Any, Type -from danswer.configs.constants import DocumentSource +from danswer.configs.constants import ( + DocumentSource, + DocumentSourceRequiringTenantContext, +) from danswer.connectors.airtable.connector import AirtableConnector +from danswer.connectors.asana.connector import AsanaConnector from danswer.connectors.axero.connector import AxeroConnector from danswer.connectors.blob.connector import BlobStorageConnector from danswer.connectors.bookstack.connector import BookstackConnector @@ -13,6 +16,8 @@ from danswer.connectors.document360.connector import Document360Connector from danswer.connectors.dropbox.connector import DropboxConnector from danswer.connectors.file.connector import LocalFileConnector +from danswer.connectors.fireflies.connector import FirefliesConnector +from danswer.connectors.freshdesk.connector import FreshdeskConnector from danswer.connectors.github.connector import GithubConnector from danswer.connectors.gitlab.connector import GitlabConnector from danswer.connectors.gmail.connector import GmailConnector @@ -21,17 +26,18 @@ from danswer.connectors.google_site.connector import GoogleSitesConnector from danswer.connectors.guru.connector import GuruConnector from danswer.connectors.hubspot.connector import HubSpotConnector -from danswer.connectors.interfaces import BaseConnector -from danswer.connectors.interfaces import EventConnector -from danswer.connectors.interfaces import LoadConnector -from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import ( + BaseConnector, + EventConnector, + LoadConnector, + PollConnector, +) from danswer.connectors.linear.connector import LinearConnector from danswer.connectors.loopio.connector import LoopioConnector from danswer.connectors.mediawiki.wiki import MediaWikiConnector from danswer.connectors.models import InputType from danswer.connectors.notion.connector import NotionConnector from danswer.connectors.productboard.connector import ProductboardConnector -from danswer.connectors.requesttracker.connector import RequestTrackerConnector from 
danswer.connectors.salesforce.connector import SalesforceConnector from danswer.connectors.sharepoint.connector import SharepointConnector from danswer.connectors.slab.connector import SlabConnector @@ -40,6 +46,7 @@ from danswer.connectors.teams.connector import TeamsConnector from danswer.connectors.web.connector import WebConnector from danswer.connectors.wikipedia.connector import WikipediaConnector +from danswer.connectors.xenforo.connector import XenforoConnector from danswer.connectors.zendesk.connector import ZendeskConnector from danswer.connectors.zulip.connector import ZulipConnector from danswer.db.credentials import backend_update_credential_json @@ -62,6 +69,7 @@ def identify_connector_class( DocumentSource.SLACK: { InputType.LOAD_STATE: SlackLoadConnector, InputType.POLL: SlackPollConnector, + InputType.SLIM_RETRIEVAL: SlackPollConnector, }, DocumentSource.GITHUB: GithubConnector, DocumentSource.GMAIL: GmailConnector, @@ -74,7 +82,6 @@ def identify_connector_class( DocumentSource.SLAB: SlabConnector, DocumentSource.NOTION: NotionConnector, DocumentSource.ZULIP: ZulipConnector, - DocumentSource.REQUESTTRACKER: RequestTrackerConnector, DocumentSource.GURU: GuruConnector, DocumentSource.LINEAR: LinearConnector, DocumentSource.HUBSPOT: HubSpotConnector, @@ -92,10 +99,14 @@ def identify_connector_class( DocumentSource.CLICKUP: ClickupConnector, DocumentSource.MEDIAWIKI: MediaWikiConnector, DocumentSource.WIKIPEDIA: WikipediaConnector, + DocumentSource.ASANA: AsanaConnector, DocumentSource.S3: BlobStorageConnector, DocumentSource.R2: BlobStorageConnector, DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector, DocumentSource.OCI_STORAGE: BlobStorageConnector, + DocumentSource.XENFORO: XenforoConnector, + DocumentSource.FRESHDESK: FreshdeskConnector, + DocumentSource.FIREFLIES: FirefliesConnector, } connector_by_source = connector_map.get(source, {}) @@ -125,13 +136,18 @@ def identify_connector_class( def instantiate_connector( + db_session: Session, source: DocumentSource, input_type: InputType, connector_specific_config: dict[str, Any], credential: Credential, - db_session: Session, + tenant_id: str | None = None, ) -> BaseConnector: connector_class = identify_connector_class(source, input_type) + + if source in DocumentSourceRequiringTenantContext: + connector_specific_config["tenant_id"] = tenant_id + connector = connector_class(**connector_specific_config) new_credentials = connector.load_credentials(credential.credential_json) diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py index 83d0af2c12e..b263354822f 100644 --- a/backend/danswer/connectors/file/connector.py +++ b/backend/danswer/connectors/file/connector.py @@ -16,7 +16,7 @@ from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.file_processing.extract_file_text import check_file_ext_is_valid from danswer.file_processing.extract_file_text import detect_encoding from danswer.file_processing.extract_file_text import extract_file_text @@ -27,6 +27,8 @@ from danswer.file_processing.extract_file_text import read_text_file from danswer.file_store.file_store import get_default_file_store from danswer.utils.logger import setup_logger +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR 
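For illustration, the dispatch scheme used by the connector factory above, a registry keyed by DocumentSource whose values are either a connector class or a per-InputType mapping, can be reduced to a self-contained sketch. The enum members and classes below are stand-ins, not the real ones from the codebase.

from enum import Enum


class DocumentSource(str, Enum):  # stand-in for the real DocumentSource enum
    SLACK = "slack"
    GITHUB = "github"


class InputType(str, Enum):  # stand-in for the real InputType enum
    LOAD_STATE = "load_state"
    POLL = "poll"
    SLIM_RETRIEVAL = "slim_retrieval"


class SlackLoadConnector: ...
class SlackPollConnector: ...
class GithubConnector: ...


# A map entry is either a single connector class, or a dict keyed by InputType
# when the same source needs different classes per input type.
CONNECTOR_MAP: dict[DocumentSource, type | dict[InputType, type]] = {
    DocumentSource.SLACK: {
        InputType.LOAD_STATE: SlackLoadConnector,
        InputType.POLL: SlackPollConnector,
        InputType.SLIM_RETRIEVAL: SlackPollConnector,
    },
    DocumentSource.GITHUB: GithubConnector,
}


def identify(source: DocumentSource, input_type: InputType) -> type:
    # Resolve a connector class, honoring per-input-type overrides when present.
    entry = CONNECTOR_MAP[source]
    if isinstance(entry, dict):
        return entry[input_type]
    return entry


assert identify(DocumentSource.SLACK, InputType.POLL) is SlackPollConnector
assert identify(DocumentSource.GITHUB, InputType.LOAD_STATE) is GithubConnector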
logger = setup_logger() @@ -74,13 +76,14 @@ def _process_file( ) # Using the PDF reader function directly to pass in password cleanly - elif extension == ".pdf": + elif extension == ".pdf" and pdf_pass is not None: file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass) else: file_content_raw = extract_file_text( - file_name=file_name, file=file, + file_name=file_name, + break_on_unprocessable=True, ) all_metadata = {**metadata, **file_metadata} if metadata else file_metadata @@ -120,9 +123,13 @@ def _process_file( "filename", "file_display_name", "title", + "connector_type", ] } + source_type_str = all_metadata.get("connector_type") + source_type = DocumentSource(source_type_str) if source_type_str else None + p_owner_names = all_metadata.get("primary_owners") s_owner_names = all_metadata.get("secondary_owners") p_owners = ( @@ -142,7 +149,7 @@ def _process_file( sections=[ Section(link=all_metadata.get("link"), text=file_content_raw.strip()) ], - source=DocumentSource.FILE, + source=source_type or DocumentSource.FILE, semantic_identifier=file_display_name, title=title, doc_updated_at=final_time_updated, @@ -158,10 +165,12 @@ class LocalFileConnector(LoadConnector): def __init__( self, file_locations: list[Path | str], + tenant_id: str = POSTGRES_DEFAULT_SCHEMA, batch_size: int = INDEX_BATCH_SIZE, ) -> None: self.file_locations = [Path(file_location) for file_location in file_locations] self.batch_size = batch_size + self.tenant_id = tenant_id self.pdf_pass: str | None = None def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: @@ -170,7 +179,9 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None def load_from_state(self) -> GenerateDocumentsOutput: documents: list[Document] = [] - with Session(get_sqlalchemy_engine()) as db_session: + token = CURRENT_TENANT_ID_CONTEXTVAR.set(self.tenant_id) + + with get_session_with_tenant(self.tenant_id) as db_session: for file_path in self.file_locations: current_datetime = datetime.now(timezone.utc) files = _read_files_and_metadata( @@ -192,6 +203,8 @@ def load_from_state(self) -> GenerateDocumentsOutput: if documents: yield documents + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + if __name__ == "__main__": connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]]) diff --git a/backend/ee/danswer/connectors/__init__.py b/backend/danswer/connectors/fireflies/__init__.py similarity index 100% rename from backend/ee/danswer/connectors/__init__.py rename to backend/danswer/connectors/fireflies/__init__.py diff --git a/backend/danswer/connectors/fireflies/connector.py b/backend/danswer/connectors/fireflies/connector.py new file mode 100644 index 00000000000..16ed7b2ede1 --- /dev/null +++ b/backend/danswer/connectors/fireflies/connector.py @@ -0,0 +1,182 @@ +from collections.abc import Iterator +from datetime import datetime +from datetime import timezone +from typing import List + +import requests + +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import BasicExpertInfo +from danswer.connectors.models import ConnectorMissingCredentialError +from danswer.connectors.models import Document +from danswer.connectors.models import 
Section +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +_FIREFLIES_ID_PREFIX = "FIREFLIES_" + +_FIREFLIES_API_URL = "https://api.fireflies.ai/graphql" + +_FIREFLIES_TRANSCRIPT_QUERY_SIZE = 50 # Max page size is 50 + +_FIREFLIES_API_QUERY = """ + query Transcripts($fromDate: DateTime, $toDate: DateTime, $limit: Int!, $skip: Int!) { + transcripts(fromDate: $fromDate, toDate: $toDate, limit: $limit, skip: $skip) { + id + title + host_email + participants + date + transcript_url + sentences { + text + speaker_name + } + } + } +""" + + +def _create_doc_from_transcript(transcript: dict) -> Document | None: + meeting_text = "" + sentences = transcript.get("sentences", []) + if sentences: + for sentence in sentences: + meeting_text += sentence.get("speaker_name") or "Unknown Speaker" + meeting_text += ": " + sentence.get("text", "") + "\n\n" + else: + return None + + meeting_link = transcript["transcript_url"] + + fireflies_id = _FIREFLIES_ID_PREFIX + transcript["id"] + + meeting_title = transcript["title"] or "No Title" + + meeting_date_unix = transcript["date"] + meeting_date = datetime.fromtimestamp(meeting_date_unix / 1000, tz=timezone.utc) + + meeting_host_email = transcript["host_email"] + host_email_user_info = [BasicExpertInfo(email=meeting_host_email)] + + meeting_participants_email_list = [] + for participant in transcript.get("participants", []): + if participant != meeting_host_email and participant: + meeting_participants_email_list.append(BasicExpertInfo(email=participant)) + + return Document( + id=fireflies_id, + sections=[ + Section( + link=meeting_link, + text=meeting_text, + ) + ], + source=DocumentSource.FIREFLIES, + semantic_identifier=meeting_title, + metadata={}, + doc_updated_at=meeting_date, + primary_owners=host_email_user_info, + secondary_owners=meeting_participants_email_list, + ) + + +class FirefliesConnector(PollConnector, LoadConnector): + def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: + self.batch_size = batch_size + + def load_credentials(self, credentials: dict[str, str]) -> None: + api_key = credentials.get("fireflies_api_key") + + if not isinstance(api_key, str): + raise ConnectorMissingCredentialError( + "The Fireflies API key must be a string" + ) + + self.api_key = api_key + + return None + + def _fetch_transcripts( + self, start_datetime: str | None = None, end_datetime: str | None = None + ) -> Iterator[List[dict]]: + if self.api_key is None: + raise ConnectorMissingCredentialError("Missing API key") + + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer " + self.api_key, + } + + skip = 0 + variables: dict[str, int | str] = { + "limit": _FIREFLIES_TRANSCRIPT_QUERY_SIZE, + } + + if start_datetime: + variables["fromDate"] = start_datetime + if end_datetime: + variables["toDate"] = end_datetime + + while True: + variables["skip"] = skip + response = requests.post( + _FIREFLIES_API_URL, + headers=headers, + json={"query": _FIREFLIES_API_QUERY, "variables": variables}, + ) + + response.raise_for_status() + + if response.status_code == 204: + break + + recieved_transcripts = response.json() + parsed_transcripts = recieved_transcripts.get("data", {}).get( + "transcripts", [] + ) + + yield parsed_transcripts + + if len(parsed_transcripts) < _FIREFLIES_TRANSCRIPT_QUERY_SIZE: + break + + skip += _FIREFLIES_TRANSCRIPT_QUERY_SIZE + + def _process_transcripts( + self, start: str | None = None, end: str | None = None + ) -> GenerateDocumentsOutput: + doc_batch: List[Document] = [] + + for 
transcript_batch in self._fetch_transcripts(start, end): + for transcript in transcript_batch: + if doc := _create_doc_from_transcript(transcript): + doc_batch.append(doc) + + if len(doc_batch) >= self.batch_size: + yield doc_batch + doc_batch = [] + + if doc_batch: + yield doc_batch + + def load_from_state(self) -> GenerateDocumentsOutput: + return self._process_transcripts() + + def poll_source( + self, start_unixtime: SecondsSinceUnixEpoch, end_unixtime: SecondsSinceUnixEpoch + ) -> GenerateDocumentsOutput: + start_datetime = datetime.fromtimestamp( + start_unixtime, tz=timezone.utc + ).strftime("%Y-%m-%dT%H:%M:%S.000Z") + end_datetime = datetime.fromtimestamp(end_unixtime, tz=timezone.utc).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ) + + yield from self._process_transcripts(start_datetime, end_datetime) diff --git a/backend/danswer/search/postprocessing/reranker.py b/backend/danswer/connectors/freshdesk/__init__,py similarity index 100% rename from backend/danswer/search/postprocessing/reranker.py rename to backend/danswer/connectors/freshdesk/__init__,py diff --git a/backend/danswer/connectors/freshdesk/connector.py b/backend/danswer/connectors/freshdesk/connector.py new file mode 100644 index 00000000000..db2a293e95f --- /dev/null +++ b/backend/danswer/connectors/freshdesk/connector.py @@ -0,0 +1,239 @@ +import json +from collections.abc import Iterator +from datetime import datetime +from datetime import timezone +from typing import List + +import requests + +from danswer.configs.app_configs import INDEX_BATCH_SIZE +from danswer.configs.constants import DocumentSource +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.interfaces import PollConnector +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.models import ConnectorMissingCredentialError +from danswer.connectors.models import Document +from danswer.connectors.models import Section +from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +_FRESHDESK_ID_PREFIX = "FRESHDESK_" + + +_TICKET_FIELDS_TO_INCLUDE = { + "fr_escalated", + "spam", + "priority", + "source", + "status", + "type", + "is_escalated", + "tags", + "nr_due_by", + "nr_escalated", + "cc_emails", + "fwd_emails", + "reply_cc_emails", + "ticket_cc_emails", + "support_email", + "to_emails", +} + +_SOURCE_NUMBER_TYPE_MAP: dict[int, str] = { + 1: "Email", + 2: "Portal", + 3: "Phone", + 7: "Chat", + 9: "Feedback Widget", + 10: "Outbound Email", +} + +_PRIORITY_NUMBER_TYPE_MAP: dict[int, str] = { + 1: "low", + 2: "medium", + 3: "high", + 4: "urgent", +} + +_STATUS_NUMBER_TYPE_MAP: dict[int, str] = { + 2: "open", + 3: "pending", + 4: "resolved", + 5: "closed", +} + + +def _create_metadata_from_ticket(ticket: dict) -> dict: + metadata: dict[str, str | list[str]] = {} + # Combine all emails into a list so there are no repeated emails + email_data: set[str] = set() + + for key, value in ticket.items(): + # Skip fields that aren't useful for embedding + if key not in _TICKET_FIELDS_TO_INCLUDE: + continue + + # Skip empty fields + if not value or value == "[]": + continue + + # Convert strings or lists to strings + stringified_value: str | list[str] + if isinstance(value, list): + stringified_value = [str(item) for item in value] + else: + stringified_value = str(value) + + if "email" in key: + if isinstance(stringified_value, list): + 
email_data.update(stringified_value) + else: + email_data.add(stringified_value) + else: + metadata[key] = stringified_value + + if email_data: + metadata["emails"] = list(email_data) + + # Convert source numbers to human-parsable string + if source_number := ticket.get("source"): + metadata["source"] = _SOURCE_NUMBER_TYPE_MAP.get( + source_number, "Unknown Source Type" + ) + + # Convert priority numbers to human-parsable string + if priority_number := ticket.get("priority"): + metadata["priority"] = _PRIORITY_NUMBER_TYPE_MAP.get( + priority_number, "Unknown Priority" + ) + + # Convert status to human-parsable string + if status_number := ticket.get("status"): + metadata["status"] = _STATUS_NUMBER_TYPE_MAP.get( + status_number, "Unknown Status" + ) + + due_by = datetime.fromisoformat(ticket["due_by"].replace("Z", "+00:00")) + metadata["overdue"] = str(datetime.now(timezone.utc) > due_by) + + return metadata + + +def _create_doc_from_ticket(ticket: dict, domain: str) -> Document: + # Use the ticket description as the text + text = f"Ticket description: {parse_html_page_basic(ticket.get('description_text', ''))}" + metadata = _create_metadata_from_ticket(ticket) + + # This is also used in the ID because it is more unique than the just the ticket ID + link = f"https://{domain}.freshdesk.com/helpdesk/tickets/{ticket['id']}" + + return Document( + id=_FRESHDESK_ID_PREFIX + link, + sections=[ + Section( + link=link, + text=text, + ) + ], + source=DocumentSource.FRESHDESK, + semantic_identifier=ticket["subject"], + metadata=metadata, + doc_updated_at=datetime.fromisoformat( + ticket["updated_at"].replace("Z", "+00:00") + ), + ) + + +class FreshdeskConnector(PollConnector, LoadConnector): + def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: + self.batch_size = batch_size + + def load_credentials(self, credentials: dict[str, str | int]) -> None: + api_key = credentials.get("freshdesk_api_key") + domain = credentials.get("freshdesk_domain") + password = credentials.get("freshdesk_password") + + if not all(isinstance(cred, str) for cred in [domain, api_key, password]): + raise ConnectorMissingCredentialError( + "All Freshdesk credentials must be strings" + ) + + self.api_key = str(api_key) + self.domain = str(domain) + self.password = str(password) + + def _fetch_tickets( + self, start: datetime | None = None, end: datetime | None = None + ) -> Iterator[List[dict]]: + """ + 'end' is not currently used, so we may double fetch tickets created after the indexing + starts but before the actual call is made. 
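As a standalone illustration of the code-to-label mapping and the overdue check used above: the two maps mirror the ones in the diff, while summarize_ticket and the sample ticket are invented for the example.

from datetime import datetime, timezone

_PRIORITY_NUMBER_TYPE_MAP = {1: "low", 2: "medium", 3: "high", 4: "urgent"}
_STATUS_NUMBER_TYPE_MAP = {2: "open", 3: "pending", 4: "resolved", 5: "closed"}


def summarize_ticket(ticket: dict) -> dict:
    # Map Freshdesk's numeric codes to human-readable labels, falling back to
    # an "Unknown ..." label for codes that are not in the map.
    summary = {
        "priority": _PRIORITY_NUMBER_TYPE_MAP.get(ticket.get("priority"), "Unknown Priority"),
        "status": _STATUS_NUMBER_TYPE_MAP.get(ticket.get("status"), "Unknown Status"),
    }
    # Freshdesk returns ISO-8601 timestamps with a trailing "Z"; replacing it
    # with an explicit offset keeps fromisoformat happy on older Pythons.
    due_by = datetime.fromisoformat(ticket["due_by"].replace("Z", "+00:00"))
    summary["overdue"] = str(datetime.now(timezone.utc) > due_by)
    return summary


# Invented sample ticket
print(summarize_ticket({"priority": 4, "status": 2, "due_by": "2024-01-01T00:00:00Z"}))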
+ + To use 'end' would require us to use the search endpoint but it has limitations, + namely having to fetch all IDs and then individually fetch each ticket because there is no + 'include' field available for this endpoint: + https://developers.freshdesk.com/api/#filter_tickets + """ + if self.api_key is None or self.domain is None or self.password is None: + raise ConnectorMissingCredentialError("freshdesk") + + base_url = f"https://{self.domain}.freshdesk.com/api/v2/tickets" + params: dict[str, int | str] = { + "include": "description", + "per_page": 50, + "page": 1, + } + + if start: + params["updated_since"] = start.isoformat() + + while True: + response = requests.get( + base_url, auth=(self.api_key, self.password), params=params + ) + response.raise_for_status() + + if response.status_code == 204: + break + + tickets = json.loads(response.content) + logger.info( + f"Fetched {len(tickets)} tickets from Freshdesk API (Page {params['page']})" + ) + + yield tickets + + if len(tickets) < int(params["per_page"]): + break + + params["page"] = int(params["page"]) + 1 + + def _process_tickets( + self, start: datetime | None = None, end: datetime | None = None + ) -> GenerateDocumentsOutput: + doc_batch: List[Document] = [] + + for ticket_batch in self._fetch_tickets(start, end): + for ticket in ticket_batch: + doc_batch.append(_create_doc_from_ticket(ticket, self.domain)) + + if len(doc_batch) >= self.batch_size: + yield doc_batch + doc_batch = [] + + if doc_batch: + yield doc_batch + + def load_from_state(self) -> GenerateDocumentsOutput: + return self._process_tickets() + + def poll_source( + self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch + ) -> GenerateDocumentsOutput: + start_datetime = datetime.fromtimestamp(start, tz=timezone.utc) + end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) + + yield from self._process_tickets(start_datetime, end_datetime) diff --git a/backend/danswer/connectors/gitlab/connector.py b/backend/danswer/connectors/gitlab/connector.py index f07baf3e141..39ec443e709 100644 --- a/backend/danswer/connectors/gitlab/connector.py +++ b/backend/danswer/connectors/gitlab/connector.py @@ -24,6 +24,9 @@ from danswer.connectors.models import Section from danswer.utils.logger import setup_logger + +logger = setup_logger() + # List of directories/Files to exclude exclude_patterns = [ "logs", @@ -31,7 +34,6 @@ ".gitlab/", ".pre-commit-config.yaml", ] -logger = setup_logger() def _batch_gitlab_objects( diff --git a/backend/danswer/connectors/gmail/connector.py b/backend/danswer/connectors/gmail/connector.py index 42d2f305f73..170e1219e11 100644 --- a/backend/danswer/connectors/gmail/connector.py +++ b/backend/danswer/connectors/gmail/connector.py @@ -1,221 +1,361 @@ from base64 import urlsafe_b64decode from typing import Any -from typing import cast from typing import Dict from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore -from googleapiclient import discovery # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc -from danswer.connectors.gmail.connector_auth import ( - get_gmail_creds_for_authorized_user, -) -from danswer.connectors.gmail.connector_auth import ( - get_gmail_creds_for_service_account, -) -from danswer.connectors.gmail.constants import ( - 
DB_CREDENTIALS_DICT_DELEGATED_USER_KEY, -) -from danswer.connectors.gmail.constants import DB_CREDENTIALS_DICT_TOKEN_KEY -from danswer.connectors.gmail.constants import ( - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, +from danswer.connectors.google_utils.google_auth import get_google_creds +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.google_utils.resources import get_admin_service +from danswer.connectors.google_utils.resources import get_gmail_service +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, ) +from danswer.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR +from danswer.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS +from danswer.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE +from danswer.connectors.google_utils.shared_constants import USER_FIELDS from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.interfaces import SlimConnector +from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder + logger = setup_logger() +# This is for the initial list call to get the thread ids +THREAD_LIST_FIELDS = "nextPageToken, threads(id)" + +# These are the fields to retrieve using the ID from the initial list call +PARTS_FIELDS = "parts(body(data), mimeType)" +PAYLOAD_FIELDS = f"payload(headers, {PARTS_FIELDS})" +MESSAGES_FIELDS = f"messages(id, {PAYLOAD_FIELDS})" +THREADS_FIELDS = f"threads(id, {MESSAGES_FIELDS})" +THREAD_FIELDS = f"id, {MESSAGES_FIELDS}" + +EMAIL_FIELDS = [ + "cc", + "bcc", + "from", + "to", +] + +add_retries = retry_builder(tries=50, max_delay=30) + + +def _build_time_range_query( + time_range_start: SecondsSinceUnixEpoch | None = None, + time_range_end: SecondsSinceUnixEpoch | None = None, +) -> str | None: + query = "" + if time_range_start is not None and time_range_start != 0: + query += f"after:{int(time_range_start)}" + if time_range_end is not None and time_range_end != 0: + query += f" before:{int(time_range_end)}" + query = query.strip() + + if len(query) == 0: + return None + + return query + + +def _clean_email_and_extract_name(email: str) -> tuple[str, str | None]: + email = email.strip() + if "<" in email and ">" in email: + # Handle format: "Display Name " + display_name = email[: email.find("<")].strip() + email_address = email[email.find("<") + 1 : email.find(">")].strip() + return email_address, display_name if display_name else None + else: + # Handle plain email address + return email.strip(), None + + +def _get_owners_from_emails(emails: dict[str, str | None]) -> list[BasicExpertInfo]: + owners = [] + for email, names in emails.items(): + if names: + name_parts = names.split(" ") + first_name = " ".join(name_parts[:-1]) + last_name = name_parts[-1] + else: + first_name = None + last_name = None + owners.append( + BasicExpertInfo(email=email, first_name=first_name, last_name=last_name) + ) + return owners + + +def _get_message_body(payload: dict[str, Any]) -> 
str: + parts = payload.get("parts", []) + message_body = "" + for part in parts: + mime_type = part.get("mimeType") + body = part.get("body") + if mime_type == "text/plain" and body: + data = body.get("data", "") + text = urlsafe_b64decode(data).decode() + message_body += text + return message_body + + +def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]]: + link = f"https://mail.google.com/mail/u/0/#inbox/{message['id']}" + + payload = message.get("payload", {}) + headers = payload.get("headers", []) + metadata: dict[str, Any] = {} + for header in headers: + name = header.get("name").lower() + value = header.get("value") + if name in EMAIL_FIELDS: + metadata[name] = value + if name == "subject": + metadata["subject"] = value + if name == "date": + metadata["updated_at"] = value + + if labels := message.get("labelIds"): + metadata["labels"] = labels + + message_data = "" + for name, value in metadata.items(): + # updated at isnt super useful for the llm + if name != "updated_at": + message_data += f"{name}: {value}\n" + + message_body_text: str = _get_message_body(payload) + + return Section(link=link, text=message_body_text + message_data), metadata + + +def thread_to_document(full_thread: Dict[str, Any]) -> Document | None: + all_messages = full_thread.get("messages", []) + if not all_messages: + return None + + sections = [] + semantic_identifier = "" + updated_at = None + from_emails: dict[str, str | None] = {} + other_emails: dict[str, str | None] = {} + for message in all_messages: + section, message_metadata = message_to_section(message) + sections.append(section) + + for name, value in message_metadata.items(): + if name in EMAIL_FIELDS: + email, display_name = _clean_email_and_extract_name(value) + if name == "from": + from_emails[email] = ( + display_name if not from_emails.get(email) else None + ) + else: + other_emails[email] = ( + display_name if not other_emails.get(email) else None + ) + + # If we haven't set the semantic identifier yet, set it to the subject of the first message + if not semantic_identifier: + semantic_identifier = message_metadata.get("subject", "") -class GmailConnector(LoadConnector, PollConnector): + if message_metadata.get("updated_at"): + updated_at = message_metadata.get("updated_at") + + updated_at_datetime = None + if updated_at: + updated_at_datetime = time_str_to_utc(updated_at) + + id = full_thread.get("id") + if not id: + raise ValueError("Thread ID is required") + + primary_owners = _get_owners_from_emails(from_emails) + secondary_owners = _get_owners_from_emails(other_emails) + + return Document( + id=id, + semantic_identifier=semantic_identifier, + sections=sections, + source=DocumentSource.GMAIL, + # This is used to perform permission sync + primary_owners=primary_owners, + secondary_owners=secondary_owners, + doc_updated_at=updated_at_datetime, + # Not adding emails to metadata because it's already in the sections + metadata={}, + ) + + +class GmailConnector(LoadConnector, PollConnector, SlimConnector): def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None: self.batch_size = batch_size - self.creds: OAuthCredentials | ServiceAccountCredentials | None = None - def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None: - """Checks for two different types of credentials. - (1) A credential which holds a token acquired via a user going thorugh - the Google OAuth flow. 
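A small self-contained sketch of the two parsing steps shown above, splitting a "Display Name <address>" header value and decoding a text/plain part's URL-safe base64 body; the function names and the sample payload are invented for the example.

from base64 import urlsafe_b64decode, urlsafe_b64encode


def clean_email_and_extract_name(raw: str) -> tuple[str, str | None]:
    # "Jane Doe <jane@example.com>" -> ("jane@example.com", "Jane Doe")
    raw = raw.strip()
    if "<" in raw and ">" in raw:
        display_name = raw[: raw.find("<")].strip()
        address = raw[raw.find("<") + 1 : raw.find(">")].strip()
        return address, display_name or None
    return raw, None


def get_plain_text_body(payload: dict) -> str:
    # Gmail encodes part bodies as URL-safe base64; only text/plain parts are kept.
    text = ""
    for part in payload.get("parts", []):
        if part.get("mimeType") == "text/plain" and part.get("body"):
            text += urlsafe_b64decode(part["body"].get("data", "")).decode()
    return text


# Invented example payload
payload = {"parts": [{"mimeType": "text/plain",
                      "body": {"data": urlsafe_b64encode(b"hello world").decode()}}]}
print(clean_email_and_extract_name("Jane Doe <jane@example.com>"))
print(get_plain_text_body(payload))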
- (2) A credential which holds a service account key JSON file, which - can then be used to impersonate any user in the workspace. - """ - creds: OAuthCredentials | ServiceAccountCredentials | None = None - new_creds_dict = None - if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials: - access_token_json_str = cast( - str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY] - ) - creds = get_gmail_creds_for_authorized_user( - token_json_str=access_token_json_str - ) + self._creds: OAuthCredentials | ServiceAccountCredentials | None = None + self._primary_admin_email: str | None = None - # tell caller to update token stored in DB if it has changed - # (e.g. the token has been refreshed) - new_creds_json_str = creds.to_json() if creds else "" - if new_creds_json_str != access_token_json_str: - new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str} - - if GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY in credentials: - service_account_key_json_str = credentials[ - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY - ] - creds = get_gmail_creds_for_service_account( - service_account_key_json_str=service_account_key_json_str + @property + def primary_admin_email(self) -> str: + if self._primary_admin_email is None: + raise RuntimeError( + "Primary admin email missing, " + "should not call this property " + "before calling load_credentials" ) + return self._primary_admin_email - # "Impersonate" a user if one is specified - delegated_user_email = cast( - str | None, credentials.get(DB_CREDENTIALS_DICT_DELEGATED_USER_KEY) + @property + def google_domain(self) -> str: + if self._primary_admin_email is None: + raise RuntimeError( + "Primary admin email missing, " + "should not call this property " + "before calling load_credentials" ) - if delegated_user_email: - creds = creds.with_subject(delegated_user_email) if creds else None # type: ignore + return self._primary_admin_email.split("@")[-1] - if creds is None: - raise PermissionError( - "Unable to access Gmail - unknown credential structure." 
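The guard-property pattern introduced above, where a property raises until load_credentials has filled in the private attribute, shown in isolation; the class name and credential key below are placeholders, not the connector's.

class RequiresCredentials:
    def __init__(self) -> None:
        self._primary_admin_email: str | None = None

    @property
    def primary_admin_email(self) -> str:
        # Fail loudly if the property is read before credentials are loaded,
        # instead of silently handing back None.
        if self._primary_admin_email is None:
            raise RuntimeError("call load_credentials before using this property")
        return self._primary_admin_email

    @property
    def google_domain(self) -> str:
        # Derive the workspace domain from the admin email, e.g. "example.com".
        return self.primary_admin_email.split("@")[-1]

    def load_credentials(self, credentials: dict) -> None:
        self._primary_admin_email = credentials["primary_admin_email"]


c = RequiresCredentials()
c.load_credentials({"primary_admin_email": "admin@example.com"})
print(c.google_domain)  # -> example.com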
+ @property + def creds(self) -> OAuthCredentials | ServiceAccountCredentials: + if self._creds is None: + raise RuntimeError( + "Creds missing, " + "should not call this property " + "before calling load_credentials" ) + return self._creds - self.creds = creds - return new_creds_dict + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None: + primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] + self._primary_admin_email = primary_admin_email - def _get_email_body(self, payload: dict[str, Any]) -> str: - parts = payload.get("parts", []) - email_body = "" - for part in parts: - mime_type = part.get("mimeType") - body = part.get("body") - if mime_type == "text/plain": - data = body.get("data", "") - text = urlsafe_b64decode(data).decode() - email_body += text - return email_body - - def _email_to_document(self, full_email: Dict[str, Any]) -> Document: - email_id = full_email["id"] - payload = full_email["payload"] - headers = payload.get("headers") - labels = full_email.get("labelIds", []) - metadata = {} - if headers: - for header in headers: - name = header.get("name").lower() - value = header.get("value") - if name in ["from", "to", "subject", "date", "cc", "bcc"]: - metadata[name] = value - email_data = "" - for name, value in metadata.items(): - email_data += f"{name}: {value}\n" - metadata["labels"] = labels - logger.debug(f"{email_data}") - email_body_text: str = self._get_email_body(payload) - date_str = metadata.get("date") - email_updated_at = time_str_to_utc(date_str) if date_str else None - link = f"https://mail.google.com/mail/u/0/#inbox/{email_id}" - return Document( - id=email_id, - sections=[Section(link=link, text=email_data + email_body_text)], + self._creds, new_creds_dict = get_google_creds( + credentials=credentials, source=DocumentSource.GMAIL, - title=metadata.get("subject"), - semantic_identifier=metadata.get("subject", "Untitled Email"), - doc_updated_at=email_updated_at, - metadata=metadata, ) + return new_creds_dict - @staticmethod - def _build_time_range_query( - time_range_start: SecondsSinceUnixEpoch | None = None, - time_range_end: SecondsSinceUnixEpoch | None = None, - ) -> str | None: - query = "" - if time_range_start is not None and time_range_start != 0: - query += f"after:{int(time_range_start)}" - if time_range_end is not None and time_range_end != 0: - query += f" before:{int(time_range_end)}" - query = query.strip() - - if len(query) == 0: - return None - - return query + def _get_all_user_emails(self) -> list[str]: + admin_service = get_admin_service(self.creds, self.primary_admin_email) + emails = [] + for user in execute_paginated_retrieval( + retrieval_function=admin_service.users().list, + list_key="users", + fields=USER_FIELDS, + domain=self.google_domain, + ): + if email := user.get("primaryEmail"): + emails.append(email) + return emails - def _fetch_mails_from_gmail( + def _fetch_threads( self, time_range_start: SecondsSinceUnixEpoch | None = None, time_range_end: SecondsSinceUnixEpoch | None = None, ) -> GenerateDocumentsOutput: - if self.creds is None: - raise PermissionError("Not logged into Gmail") - page_token = "" - query = GmailConnector._build_time_range_query(time_range_start, time_range_end) - service = discovery.build("gmail", "v1", credentials=self.creds) - while page_token is not None: - result = ( - service.users() - .messages() - .list( - userId="me", - pageToken=page_token, - q=query, - maxResults=self.batch_size, - ) - .execute() - ) - page_token = result.get("nextPageToken") - 
messages = result.get("messages", []) - doc_batch = [] - for message in messages: - message_id = message["id"] - msg = ( - service.users() - .messages() - .get(userId="me", id=message_id, format="full") - .execute() + query = _build_time_range_query(time_range_start, time_range_end) + doc_batch = [] + for user_email in self._get_all_user_emails(): + gmail_service = get_gmail_service(self.creds, user_email) + for thread in execute_paginated_retrieval( + retrieval_function=gmail_service.users().threads().list, + list_key="threads", + userId=user_email, + fields=THREAD_LIST_FIELDS, + q=query, + ): + full_threads = execute_paginated_retrieval( + retrieval_function=gmail_service.users().threads().get, + list_key=None, + userId=user_email, + fields=THREAD_FIELDS, + id=thread["id"], ) - doc = self._email_to_document(msg) + # full_threads is an iterator containing a single thread + # so we need to convert it to a list and grab the first element + full_thread = list(full_threads)[0] + doc = thread_to_document(full_thread) + if doc is None: + continue doc_batch.append(doc) - if len(doc_batch) > 0: - yield doc_batch + if len(doc_batch) > self.batch_size: + yield doc_batch + doc_batch = [] + if doc_batch: + yield doc_batch + + def _fetch_slim_threads( + self, + time_range_start: SecondsSinceUnixEpoch | None = None, + time_range_end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + query = _build_time_range_query(time_range_start, time_range_end) + doc_batch = [] + for user_email in self._get_all_user_emails(): + logger.info(f"Fetching slim threads for user: {user_email}") + gmail_service = get_gmail_service(self.creds, user_email) + for thread in execute_paginated_retrieval( + retrieval_function=gmail_service.users().threads().list, + list_key="threads", + userId=user_email, + fields=THREAD_LIST_FIELDS, + q=query, + ): + doc_batch.append( + SlimDocument( + id=thread["id"], + perm_sync_data={"user_email": user_email}, + ) + ) + if len(doc_batch) > SLIM_BATCH_SIZE: + yield doc_batch + doc_batch = [] + if doc_batch: + yield doc_batch def load_from_state(self) -> GenerateDocumentsOutput: - yield from self._fetch_mails_from_gmail() + try: + yield from self._fetch_threads() + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - yield from self._fetch_mails_from_gmail(start, end) + try: + yield from self._fetch_threads(start, end) + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + try: + yield from self._fetch_slim_threads(start, end) + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e -if __name__ == "__main__": - import json - import os - service_account_json_path = os.environ.get("GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH") - if not service_account_json_path: - raise ValueError( - "Please set GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH environment variable" - ) - with open(service_account_json_path) as f: - creds = json.load(f) - - credentials_dict = { - DB_CREDENTIALS_DICT_TOKEN_KEY: json.dumps(creds), - } - delegated_user = os.environ.get("GMAIL_DELEGATED_USER") - if 
delegated_user: - credentials_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user - - connector = GmailConnector() - connector.load_credentials( - json.loads(credentials_dict[DB_CREDENTIALS_DICT_TOKEN_KEY]) - ) - document_batch_generator = connector.load_from_state() - for document_batch in document_batch_generator: - print(document_batch) - break +if __name__ == "__main__": + pass diff --git a/backend/danswer/connectors/gmail/connector_auth.py b/backend/danswer/connectors/gmail/connector_auth.py deleted file mode 100644 index ad80d1e1eb1..00000000000 --- a/backend/danswer/connectors/gmail/connector_auth.py +++ /dev/null @@ -1,199 +0,0 @@ -import json -from typing import cast -from urllib.parse import parse_qs -from urllib.parse import ParseResult -from urllib.parse import urlparse - -from google.auth.transport.requests import Request # type: ignore -from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore -from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore -from google_auth_oauthlib.flow import InstalledAppFlow # type: ignore -from sqlalchemy.orm import Session - -from danswer.configs.app_configs import WEB_DOMAIN -from danswer.configs.constants import DocumentSource -from danswer.configs.constants import KV_CRED_KEY -from danswer.configs.constants import KV_GMAIL_CRED_KEY -from danswer.configs.constants import KV_GMAIL_SERVICE_ACCOUNT_KEY -from danswer.connectors.gmail.constants import ( - DB_CREDENTIALS_DICT_DELEGATED_USER_KEY, -) -from danswer.connectors.gmail.constants import DB_CREDENTIALS_DICT_TOKEN_KEY -from danswer.connectors.gmail.constants import ( - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, -) -from danswer.connectors.gmail.constants import SCOPES -from danswer.db.credentials import update_credential_json -from danswer.db.models import User -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.server.documents.models import CredentialBase -from danswer.server.documents.models import GoogleAppCredentials -from danswer.server.documents.models import GoogleServiceAccountKey -from danswer.utils.logger import setup_logger - -logger = setup_logger() - - -def _build_frontend_gmail_redirect() -> str: - return f"{WEB_DOMAIN}/admin/connectors/gmail/auth/callback" - - -def get_gmail_creds_for_authorized_user( - token_json_str: str, -) -> OAuthCredentials | None: - creds_json = json.loads(token_json_str) - creds = OAuthCredentials.from_authorized_user_info(creds_json, SCOPES) - if creds.valid: - return creds - - if creds.expired and creds.refresh_token: - try: - creds.refresh(Request()) - if creds.valid: - logger.notice("Refreshed Gmail tokens.") - return creds - except Exception as e: - logger.exception(f"Failed to refresh gmail access token due to: {e}") - return None - - return None - - -def get_gmail_creds_for_service_account( - service_account_key_json_str: str, -) -> ServiceAccountCredentials | None: - service_account_key = json.loads(service_account_key_json_str) - creds = ServiceAccountCredentials.from_service_account_info( - service_account_key, scopes=SCOPES - ) - if not creds.valid or not creds.expired: - creds.refresh(Request()) - return creds if creds.valid else None - - -def verify_csrf(credential_id: int, state: str) -> None: - csrf = get_dynamic_config_store().load(KV_CRED_KEY.format(str(credential_id))) - if csrf != state: - raise PermissionError( - "State from Gmail Connector callback does not match expected" - ) - - -def 
get_gmail_auth_url(credential_id: int) -> str: - creds_str = str(get_dynamic_config_store().load(KV_GMAIL_CRED_KEY)) - credential_json = json.loads(creds_str) - flow = InstalledAppFlow.from_client_config( - credential_json, - scopes=SCOPES, - redirect_uri=_build_frontend_gmail_redirect(), - ) - auth_url, _ = flow.authorization_url(prompt="consent") - - parsed_url = cast(ParseResult, urlparse(auth_url)) - params = parse_qs(parsed_url.query) - - get_dynamic_config_store().store( - KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True - ) # type: ignore - return str(auth_url) - - -def get_auth_url(credential_id: int) -> str: - creds_str = str(get_dynamic_config_store().load(KV_GMAIL_CRED_KEY)) - credential_json = json.loads(creds_str) - flow = InstalledAppFlow.from_client_config( - credential_json, - scopes=SCOPES, - redirect_uri=_build_frontend_gmail_redirect(), - ) - auth_url, _ = flow.authorization_url(prompt="consent") - - parsed_url = cast(ParseResult, urlparse(auth_url)) - params = parse_qs(parsed_url.query) - - get_dynamic_config_store().store( - KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True - ) # type: ignore - return str(auth_url) - - -def update_gmail_credential_access_tokens( - auth_code: str, - credential_id: int, - user: User, - db_session: Session, -) -> OAuthCredentials | None: - app_credentials = get_google_app_gmail_cred() - flow = InstalledAppFlow.from_client_config( - app_credentials.model_dump(), - scopes=SCOPES, - redirect_uri=_build_frontend_gmail_redirect(), - ) - flow.fetch_token(code=auth_code) - creds = flow.credentials - token_json_str = creds.to_json() - new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str} - - if not update_credential_json(credential_id, new_creds_dict, user, db_session): - return None - return creds - - -def build_service_account_creds( - delegated_user_email: str | None = None, -) -> CredentialBase: - service_account_key = get_gmail_service_account_key() - - credential_dict = { - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key.json(), - } - if delegated_user_email: - credential_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user_email - - return CredentialBase( - source=DocumentSource.GMAIL, - credential_json=credential_dict, - admin_public=True, - ) - - -def get_google_app_gmail_cred() -> GoogleAppCredentials: - creds_str = str(get_dynamic_config_store().load(KV_GMAIL_CRED_KEY)) - return GoogleAppCredentials(**json.loads(creds_str)) - - -def upsert_google_app_gmail_cred(app_credentials: GoogleAppCredentials) -> None: - get_dynamic_config_store().store( - KV_GMAIL_CRED_KEY, app_credentials.json(), encrypt=True - ) - - -def delete_google_app_gmail_cred() -> None: - get_dynamic_config_store().delete(KV_GMAIL_CRED_KEY) - - -def get_gmail_service_account_key() -> GoogleServiceAccountKey: - creds_str = str(get_dynamic_config_store().load(KV_GMAIL_SERVICE_ACCOUNT_KEY)) - return GoogleServiceAccountKey(**json.loads(creds_str)) - - -def upsert_gmail_service_account_key( - service_account_key: GoogleServiceAccountKey, -) -> None: - get_dynamic_config_store().store( - KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True - ) - - -def upsert_service_account_key(service_account_key: GoogleServiceAccountKey) -> None: - get_dynamic_config_store().store( - KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True - ) - - -def delete_gmail_service_account_key() -> None: - 
get_dynamic_config_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY) - - -def delete_service_account_key() -> None: - get_dynamic_config_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY) diff --git a/backend/danswer/connectors/gmail/constants.py b/backend/danswer/connectors/gmail/constants.py deleted file mode 100644 index 36eff081818..00000000000 --- a/backend/danswer/connectors/gmail/constants.py +++ /dev/null @@ -1,4 +0,0 @@ -DB_CREDENTIALS_DICT_TOKEN_KEY = "gmail_tokens" -GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "gmail_service_account_key" -DB_CREDENTIALS_DICT_DELEGATED_USER_KEY = "gmail_delegated_user" -SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] diff --git a/backend/danswer/connectors/google_drive/connector.py b/backend/danswer/connectors/google_drive/connector.py index 80674b5a37d..ad929eb0905 100644 --- a/backend/danswer/connectors/google_drive/connector.py +++ b/backend/danswer/connectors/google_drive/connector.py @@ -1,561 +1,535 @@ -import io +from collections.abc import Callable from collections.abc import Iterator -from collections.abc import Sequence -from datetime import datetime -from datetime import timezone -from enum import Enum -from itertools import chain +from concurrent.futures import as_completed +from concurrent.futures import ThreadPoolExecutor +from functools import partial from typing import Any -from typing import cast from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore -from googleapiclient import discovery # type: ignore -from googleapiclient.errors import HttpError # type: ignore -from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE -from danswer.configs.app_configs import GOOGLE_DRIVE_FOLLOW_SHORTCUTS -from danswer.configs.app_configs import GOOGLE_DRIVE_INCLUDE_SHARED -from danswer.configs.app_configs import GOOGLE_DRIVE_ONLY_ORG_PUBLIC from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.configs.constants import IGNORE_FOR_QA -from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder -from danswer.connectors.google_drive.connector_auth import ( - get_google_drive_creds_for_authorized_user, +from danswer.connectors.google_drive.doc_conversion import build_slim_document +from danswer.connectors.google_drive.doc_conversion import ( + convert_drive_item_to_document, ) -from danswer.connectors.google_drive.connector_auth import ( - get_google_drive_creds_for_service_account, +from danswer.connectors.google_drive.file_retrieval import crawl_folders_for_files +from danswer.connectors.google_drive.file_retrieval import get_all_files_for_oauth +from danswer.connectors.google_drive.file_retrieval import get_all_files_in_my_drive +from danswer.connectors.google_drive.file_retrieval import get_files_in_shared_drive +from danswer.connectors.google_drive.models import GoogleDriveFileType +from danswer.connectors.google_utils.google_auth import get_google_creds +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.google_utils.resources import get_admin_service +from danswer.connectors.google_utils.resources import get_drive_service +from danswer.connectors.google_utils.resources import get_google_docs_service +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, ) -from danswer.connectors.google_drive.constants import 
( - DB_CREDENTIALS_DICT_DELEGATED_USER_KEY, -) -from danswer.connectors.google_drive.constants import ( - DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, -) -from danswer.connectors.google_drive.constants import DB_CREDENTIALS_DICT_TOKEN_KEY +from danswer.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR +from danswer.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS +from danswer.connectors.google_utils.shared_constants import SCOPE_DOC_URL +from danswer.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE +from danswer.connectors.google_utils.shared_constants import USER_FIELDS from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch -from danswer.connectors.models import Document -from danswer.connectors.models import Section -from danswer.file_processing.extract_file_text import docx_to_text -from danswer.file_processing.extract_file_text import pptx_to_text -from danswer.file_processing.extract_file_text import read_pdf_file -from danswer.utils.batching import batch_generator +from danswer.connectors.interfaces import SlimConnector from danswer.utils.logger import setup_logger logger = setup_logger() +# TODO: Improve this by using the batch utility: https://googleapis.github.io/google-api-python-client/docs/batch.html +# All file retrievals could be batched and made at once + -DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" -DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut" -UNSUPPORTED_FILE_TYPE_CONTENT = "" # keep empty for now +def _extract_str_list_from_comma_str(string: str | None) -> list[str]: + if not string: + return [] + return [s.strip() for s in string.split(",") if s.strip()] -class GDriveMimeType(str, Enum): - DOC = "application/vnd.google-apps.document" - SPREADSHEET = "application/vnd.google-apps.spreadsheet" - PDF = "application/pdf" - WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - PPT = "application/vnd.google-apps.presentation" - POWERPOINT = ( - "application/vnd.openxmlformats-officedocument.presentationml.presentation" +def _extract_ids_from_urls(urls: list[str]) -> list[str]: + return [url.split("/")[-1] for url in urls] + + +def _convert_single_file( + creds: Any, primary_admin_email: str, file: dict[str, Any] +) -> Any: + user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email + user_drive_service = get_drive_service(creds, user_email=user_email) + docs_service = get_google_docs_service(creds, user_email=user_email) + return convert_drive_item_to_document( + file=file, + drive_service=user_drive_service, + docs_service=docs_service, ) - PLAIN_TEXT = "text/plain" - MARKDOWN = "text/markdown" - - -GoogleDriveFileType = dict[str, Any] - -# Google Drive APIs are quite flakey and may 500 for an -# extended period of time. 
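A minimal sketch of the concurrent convert-then-batch pattern used by _process_files_batch above; the convert function and the inputs here are stand-ins for illustration only.

from collections.abc import Callable, Iterator
from concurrent.futures import ThreadPoolExecutor


def process_in_batches(
    items: list[str], convert: Callable[[str], str | None], batch_size: int
) -> Iterator[list[str]]:
    # Convert items concurrently (capped at 16 workers), drop failed conversions
    # (None results), and yield completed results in fixed-size batches.
    batch: list[str] = []
    with ThreadPoolExecutor(max_workers=min(16, len(items))) as executor:
        for result in executor.map(convert, items):
            if result is None:
                continue
            batch.append(result)
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch


# Stand-in convert function: uppercase the string, skipping empty inputs
for out in process_in_batches(["a", "", "b", "c"], lambda s: s.upper() or None, batch_size=2):
    print(out)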
Trying to combat here by adding a very -# long retry period (~20 minutes of trying every minute) -add_retries = retry_builder(tries=50, max_delay=30) - - -def _run_drive_file_query( - service: discovery.Resource, - query: str, - continue_on_failure: bool, - include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, - follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, - batch_size: int = INDEX_BATCH_SIZE, -) -> Iterator[GoogleDriveFileType]: - next_page_token = "" - while next_page_token is not None: - logger.debug(f"Running Google Drive fetch with query: {query}") - results = add_retries( - lambda: ( - service.files() - .list( - corpora="allDrives" - if include_shared - else "user", # needed to search through shared drives - pageSize=batch_size, - supportsAllDrives=include_shared, - includeItemsFromAllDrives=include_shared, - fields=( - "nextPageToken, files(mimeType, id, name, permissions, " - "modifiedTime, webViewLink, shortcutDetails)" - ), - pageToken=next_page_token, - q=query, - ) - .execute() + + +def _process_files_batch( + files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int +) -> GenerateDocumentsOutput: + doc_batch = [] + with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor: + for doc in executor.map(convert_func, files): + if doc: + doc_batch.append(doc) + if len(doc_batch) >= batch_size: + yield doc_batch + doc_batch = [] + if doc_batch: + yield doc_batch + + +def _clean_requested_drive_ids( + requested_drive_ids: set[str], + requested_folder_ids: set[str], + all_drive_ids_available: set[str], +) -> tuple[set[str], set[str]]: + invalid_requested_drive_ids = requested_drive_ids - all_drive_ids_available + filtered_folder_ids = requested_folder_ids - all_drive_ids_available + if invalid_requested_drive_ids: + logger.warning( + f"Some shared drive IDs were not found. IDs: {invalid_requested_drive_ids}" + ) + logger.warning("Checking for folder access instead...") + filtered_folder_ids.update(invalid_requested_drive_ids) + + valid_requested_drive_ids = requested_drive_ids - invalid_requested_drive_ids + return valid_requested_drive_ids, filtered_folder_ids + + +class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector): + def __init__( + self, + include_shared_drives: bool = False, + include_my_drives: bool = False, + include_files_shared_with_me: bool = False, + shared_drive_urls: str | None = None, + my_drive_emails: str | None = None, + shared_folder_urls: str | None = None, + batch_size: int = INDEX_BATCH_SIZE, + # OLD PARAMETERS + folder_paths: list[str] | None = None, + include_shared: bool | None = None, + follow_shortcuts: bool | None = None, + only_org_public: bool | None = None, + continue_on_failure: bool | None = None, + ) -> None: + # Check for old input parameters + if ( + folder_paths is not None + or include_shared is not None + or follow_shortcuts is not None + or only_org_public is not None + or continue_on_failure is not None + ): + logger.exception( + "Google Drive connector received old input parameters. 
" + "Please visit the docs for help with the new setup: " + f"{SCOPE_DOC_URL}" ) - )() - next_page_token = results.get("nextPageToken") - files = results["files"] - for file in files: - if follow_shortcuts and "shortcutDetails" in file: - try: - file_shortcut_points_to = add_retries( - lambda: ( - service.files() - .get( - fileId=file["shortcutDetails"]["targetId"], - supportsAllDrives=include_shared, - fields="mimeType, id, name, modifiedTime, webViewLink, permissions, shortcutDetails", - ) - .execute() - ) - )() - yield file_shortcut_points_to - except HttpError: - logger.error( - f"Failed to follow shortcut with details: {file['shortcutDetails']}" - ) - if continue_on_failure: - continue - raise - else: - yield file - - -def _get_folder_id( - service: discovery.Resource, - parent_id: str, - folder_name: str, - include_shared: bool, - follow_shortcuts: bool, -) -> str | None: - """ - Get the ID of a folder given its name and the ID of its parent folder. - """ - query = f"'{parent_id}' in parents and name='{folder_name}' and " - if follow_shortcuts: - query += f"(mimeType='{DRIVE_FOLDER_TYPE}' or mimeType='{DRIVE_SHORTCUT_TYPE}')" - else: - query += f"mimeType='{DRIVE_FOLDER_TYPE}'" - - # TODO: support specifying folder path in shared drive rather than just `My Drive` - results = add_retries( - lambda: ( - service.files() - .list( - q=query, - spaces="drive", - fields="nextPageToken, files(id, name, shortcutDetails)", - supportsAllDrives=include_shared, - includeItemsFromAllDrives=include_shared, + raise ValueError( + "Google Drive connector received old input parameters. " + "Please visit the docs for help with the new setup: " + f"{SCOPE_DOC_URL}" ) - .execute() + + if ( + not include_shared_drives + and not include_my_drives + and not include_files_shared_with_me + and not shared_folder_urls + and not my_drive_emails + and not shared_drive_urls + ): + raise ValueError( + "Nothing to index. 
Please specify at least one of the following: " + "include_shared_drives, include_my_drives, include_files_shared_with_me, " + "shared_folder_urls, or my_drive_emails" + ) + + self.batch_size = batch_size + + specific_requests_made = False + if bool(shared_drive_urls) or bool(my_drive_emails) or bool(shared_folder_urls): + specific_requests_made = True + + self.include_files_shared_with_me = ( + False if specific_requests_made else include_files_shared_with_me + ) + self.include_my_drives = False if specific_requests_made else include_my_drives + self.include_shared_drives = ( + False if specific_requests_made else include_shared_drives ) - )() - items = results.get("files", []) - - folder_id = None - if items: - if follow_shortcuts and "shortcutDetails" in items[0]: - folder_id = items[0]["shortcutDetails"]["targetId"] - else: - folder_id = items[0]["id"] - return folder_id - - -def _get_folders( - service: discovery.Resource, - continue_on_failure: bool, - folder_id: str | None = None, # if specified, only fetches files within this folder - include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, - follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, - batch_size: int = INDEX_BATCH_SIZE, -) -> Iterator[GoogleDriveFileType]: - query = f"mimeType = '{DRIVE_FOLDER_TYPE}' " - if follow_shortcuts: - query = "(" + query + f" or mimeType = '{DRIVE_SHORTCUT_TYPE}'" + ") " - - if folder_id: - query += f"and '{folder_id}' in parents " - query = query.rstrip() # remove the trailing space(s) - - for file in _run_drive_file_query( - service=service, - query=query, - continue_on_failure=continue_on_failure, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - batch_size=batch_size, - ): - # Need to check this since file may have been a target of a shortcut - # and not necessarily a folder - if file["mimeType"] == DRIVE_FOLDER_TYPE: - yield file - else: - pass - - -def _get_files( - service: discovery.Resource, - continue_on_failure: bool, - time_range_start: SecondsSinceUnixEpoch | None = None, - time_range_end: SecondsSinceUnixEpoch | None = None, - folder_id: str | None = None, # if specified, only fetches files within this folder - include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, - follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, - batch_size: int = INDEX_BATCH_SIZE, -) -> Iterator[GoogleDriveFileType]: - query = f"mimeType != '{DRIVE_FOLDER_TYPE}' " - if time_range_start is not None: - time_start = datetime.utcfromtimestamp(time_range_start).isoformat() + "Z" - query += f"and modifiedTime >= '{time_start}' " - if time_range_end is not None: - time_stop = datetime.utcfromtimestamp(time_range_end).isoformat() + "Z" - query += f"and modifiedTime <= '{time_stop}' " - if folder_id: - query += f"and '{folder_id}' in parents " - query = query.rstrip() # remove the trailing space(s) - - files = _run_drive_file_query( - service=service, - query=query, - continue_on_failure=continue_on_failure, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - batch_size=batch_size, - ) - return files - - -def get_all_files_batched( - service: discovery.Resource, - continue_on_failure: bool, - include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, - follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, - batch_size: int = INDEX_BATCH_SIZE, - time_range_start: SecondsSinceUnixEpoch | None = None, - time_range_end: SecondsSinceUnixEpoch | None = None, - folder_id: str | None = None, # if specified, only fetches files within this folder - # if True, will fetch files in 
sub-folders of the specified folder ID. - # Only applies if folder_id is specified. - traverse_subfolders: bool = True, - folder_ids_traversed: list[str] | None = None, -) -> Iterator[list[GoogleDriveFileType]]: - """Gets all files matching the criteria specified by the args from Google Drive - in batches of size `batch_size`. - """ - found_files = _get_files( - service=service, - continue_on_failure=continue_on_failure, - time_range_start=time_range_start, - time_range_end=time_range_end, - folder_id=folder_id, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - batch_size=batch_size, - ) - yield from batch_generator( - items=found_files, - batch_size=batch_size, - pre_batch_yield=lambda batch_files: logger.debug( - f"Parseable Documents in batch: {[file['name'] for file in batch_files]}" - ), - ) + shared_drive_url_list = _extract_str_list_from_comma_str(shared_drive_urls) + self._requested_shared_drive_ids = set( + _extract_ids_from_urls(shared_drive_url_list) + ) - if traverse_subfolders and folder_id is not None: - folder_ids_traversed = folder_ids_traversed or [] - subfolders = _get_folders( - service=service, - folder_id=folder_id, - continue_on_failure=continue_on_failure, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - batch_size=batch_size, + self._requested_my_drive_emails = set( + _extract_str_list_from_comma_str(my_drive_emails) ) - for subfolder in subfolders: - if subfolder["id"] not in folder_ids_traversed: - logger.info("Fetching all files in subfolder: " + subfolder["name"]) - folder_ids_traversed.append(subfolder["id"]) - yield from get_all_files_batched( - service=service, - continue_on_failure=continue_on_failure, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - batch_size=batch_size, - time_range_start=time_range_start, - time_range_end=time_range_end, - folder_id=subfolder["id"], - traverse_subfolders=traverse_subfolders, - folder_ids_traversed=folder_ids_traversed, - ) - else: - logger.debug( - "Skipping subfolder since already traversed: " + subfolder["name"] - ) + shared_folder_url_list = _extract_str_list_from_comma_str(shared_folder_urls) + self._requested_folder_ids = set(_extract_ids_from_urls(shared_folder_url_list)) + + self._primary_admin_email: str | None = None + + self._creds: OAuthCredentials | ServiceAccountCredentials | None = None -def extract_text(file: dict[str, str], service: discovery.Resource) -> str: - mime_type = file["mimeType"] + self._retrieved_ids: set[str] = set() - if mime_type not in set(item.value for item in GDriveMimeType): - # Unsupported file types can still have a title, finding this way is still useful - return UNSUPPORTED_FILE_TYPE_CONTENT + @property + def primary_admin_email(self) -> str: + if self._primary_admin_email is None: + raise RuntimeError( + "Primary admin email missing, " + "should not call this property " + "before calling load_credentials" + ) + return self._primary_admin_email + + @property + def google_domain(self) -> str: + if self._primary_admin_email is None: + raise RuntimeError( + "Primary admin email missing, " + "should not call this property " + "before calling load_credentials" + ) + return self._primary_admin_email.split("@")[-1] + + @property + def creds(self) -> OAuthCredentials | ServiceAccountCredentials: + if self._creds is None: + raise RuntimeError( + "Creds missing, " + "should not call this property " + "before calling load_credentials" + ) + return self._creds - if mime_type in [ - GDriveMimeType.DOC.value, - 
GDriveMimeType.PPT.value, - GDriveMimeType.SPREADSHEET.value, - ]: - export_mime_type = ( - "text/plain" - if mime_type != GDriveMimeType.SPREADSHEET.value - else "text/csv" + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None: + primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] + self._primary_admin_email = primary_admin_email + + self._creds, new_creds_dict = get_google_creds( + credentials=credentials, + source=DocumentSource.GOOGLE_DRIVE, ) - return ( - service.files() - .export(fileId=file["id"], mimeType=export_mime_type) - .execute() - .decode("utf-8") + return new_creds_dict + + def _update_traversed_parent_ids(self, folder_id: str) -> None: + self._retrieved_ids.add(folder_id) + + def _get_all_user_emails(self) -> list[str]: + # Start with primary admin email + user_emails = [self.primary_admin_email] + + # Only fetch additional users if using service account + if isinstance(self.creds, OAuthCredentials): + return user_emails + + admin_service = get_admin_service( + creds=self.creds, + user_email=self.primary_admin_email, ) - elif mime_type in [ - GDriveMimeType.PLAIN_TEXT.value, - GDriveMimeType.MARKDOWN.value, - ]: - return service.files().get_media(fileId=file["id"]).execute().decode("utf-8") - elif mime_type == GDriveMimeType.WORD_DOC.value: - response = service.files().get_media(fileId=file["id"]).execute() - return docx_to_text(file=io.BytesIO(response)) - elif mime_type == GDriveMimeType.PDF.value: - response = service.files().get_media(fileId=file["id"]).execute() - text, _ = read_pdf_file(file=io.BytesIO(response)) - return text - elif mime_type == GDriveMimeType.POWERPOINT.value: - response = service.files().get_media(fileId=file["id"]).execute() - return pptx_to_text(file=io.BytesIO(response)) - - return UNSUPPORTED_FILE_TYPE_CONTENT - - -class GoogleDriveConnector(LoadConnector, PollConnector): - def __init__( + + # Get admins first since they're more likely to have access to most files + for is_admin in [True, False]: + query = "isAdmin=true" if is_admin else "isAdmin=false" + for user in execute_paginated_retrieval( + retrieval_function=admin_service.users().list, + list_key="users", + fields=USER_FIELDS, + domain=self.google_domain, + query=query, + ): + if email := user.get("primaryEmail"): + if email not in user_emails: + user_emails.append(email) + return user_emails + + def _get_all_drive_ids(self) -> set[str]: + primary_drive_service = get_drive_service( + creds=self.creds, + user_email=self.primary_admin_email, + ) + is_service_account = isinstance(self.creds, ServiceAccountCredentials) + all_drive_ids = set() + for drive in execute_paginated_retrieval( + retrieval_function=primary_drive_service.drives().list, + list_key="drives", + useDomainAdminAccess=is_service_account, + fields="drives(id)", + ): + all_drive_ids.add(drive["id"]) + + if not all_drive_ids: + logger.warning( + "No drives found even though we are indexing shared drives was requested." + ) + + return all_drive_ids + + def _impersonate_user_for_retrieval( self, - # optional list of folder paths e.g. 
"[My Folder/My Subfolder]" - # if specified, will only index files in these folders - folder_paths: list[str] | None = None, - batch_size: int = INDEX_BATCH_SIZE, - include_shared: bool = GOOGLE_DRIVE_INCLUDE_SHARED, - follow_shortcuts: bool = GOOGLE_DRIVE_FOLLOW_SHORTCUTS, - only_org_public: bool = GOOGLE_DRIVE_ONLY_ORG_PUBLIC, - continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE, - ) -> None: - self.folder_paths = folder_paths or [] - self.batch_size = batch_size - self.include_shared = include_shared - self.follow_shortcuts = follow_shortcuts - self.only_org_public = only_org_public - self.continue_on_failure = continue_on_failure - self.creds: OAuthCredentials | ServiceAccountCredentials | None = None - - @staticmethod - def _process_folder_paths( - service: discovery.Resource, - folder_paths: list[str], - include_shared: bool, - follow_shortcuts: bool, - ) -> list[str]: - """['Folder/Sub Folder'] -> ['']""" - folder_ids: list[str] = [] - for path in folder_paths: - folder_names = path.split("/") - parent_id = "root" - for folder_name in folder_names: - found_parent_id = _get_folder_id( - service=service, - parent_id=parent_id, - folder_name=folder_name, - include_shared=include_shared, - follow_shortcuts=follow_shortcuts, - ) - if found_parent_id is None: - raise ValueError( - ( - f"Folder '{folder_name}' in path '{path}' " - "not found in Google Drive" - ) - ) - parent_id = found_parent_id - folder_ids.append(parent_id) - - return folder_ids + user_email: str, + is_slim: bool, + filtered_drive_ids: set[str], + filtered_folder_ids: set[str], + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> Iterator[GoogleDriveFileType]: + drive_service = get_drive_service(self.creds, user_email) + + # if we are including my drives, try to get the current user's my + # drive if any of the following are true: + # - include_my_drives is true + # - the current user's email is in the requested emails + if self.include_my_drives or user_email in self._requested_my_drive_emails: + yield from get_all_files_in_my_drive( + service=drive_service, + update_traversed_ids_func=self._update_traversed_parent_ids, + is_slim=is_slim, + start=start, + end=end, + ) - def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None: - """Checks for two different types of credentials. - (1) A credential which holds a token acquired via a user going thorough - the Google OAuth flow. - (2) A credential which holds a service account key JSON file, which - can then be used to impersonate any user in the workspace. 
- """ - creds: OAuthCredentials | ServiceAccountCredentials | None = None - new_creds_dict = None - if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials: - access_token_json_str = cast( - str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY] + remaining_drive_ids = filtered_drive_ids - self._retrieved_ids + for drive_id in remaining_drive_ids: + yield from get_files_in_shared_drive( + service=drive_service, + drive_id=drive_id, + is_slim=is_slim, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, ) - creds = get_google_drive_creds_for_authorized_user( - token_json_str=access_token_json_str + + remaining_folders = filtered_folder_ids - self._retrieved_ids + for folder_id in remaining_folders: + yield from crawl_folders_for_files( + service=drive_service, + parent_id=folder_id, + traversed_parent_ids=self._retrieved_ids, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, ) - # tell caller to update token stored in DB if it has changed - # (e.g. the token has been refreshed) - new_creds_json_str = creds.to_json() if creds else "" - if new_creds_json_str != access_token_json_str: - new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str} - - if DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY in credentials: - service_account_key_json_str = credentials[ - DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY - ] - creds = get_google_drive_creds_for_service_account( - service_account_key_json_str=service_account_key_json_str + def _manage_service_account_retrieval( + self, + is_slim: bool, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> Iterator[GoogleDriveFileType]: + all_org_emails: list[str] = self._get_all_user_emails() + + all_drive_ids: set[str] = self._get_all_drive_ids() + + drive_ids_to_retrieve: set[str] = set() + folder_ids_to_retrieve: set[str] = set() + if self._requested_shared_drive_ids or self._requested_folder_ids: + drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids( + requested_drive_ids=self._requested_shared_drive_ids, + requested_folder_ids=self._requested_folder_ids, + all_drive_ids_available=all_drive_ids, + ) + elif self.include_shared_drives: + drive_ids_to_retrieve = all_drive_ids + + # Process users in parallel using ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=10) as executor: + future_to_email = { + executor.submit( + self._impersonate_user_for_retrieval, + email, + is_slim, + drive_ids_to_retrieve, + folder_ids_to_retrieve, + start, + end, + ): email + for email in all_org_emails + } + + # Yield results as they complete + for future in as_completed(future_to_email): + yield from future.result() + + remaining_folders = ( + drive_ids_to_retrieve | folder_ids_to_retrieve + ) - self._retrieved_ids + if remaining_folders: + logger.warning( + f"Some folders/drives were not retrieved. 
IDs: {remaining_folders}" ) - # "Impersonate" a user if one is specified - delegated_user_email = cast( - str | None, credentials.get(DB_CREDENTIALS_DICT_DELEGATED_USER_KEY) + def _manage_oauth_retrieval( + self, + is_slim: bool, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> Iterator[GoogleDriveFileType]: + drive_service = get_drive_service(self.creds, self.primary_admin_email) + + if self.include_files_shared_with_me or self.include_my_drives: + yield from get_all_files_for_oauth( + service=drive_service, + include_files_shared_with_me=self.include_files_shared_with_me, + include_my_drives=self.include_my_drives, + include_shared_drives=self.include_shared_drives, + is_slim=is_slim, + start=start, + end=end, ) - if delegated_user_email: - creds = creds.with_subject(delegated_user_email) if creds else None # type: ignore - if creds is None: - raise PermissionError( - "Unable to access Google Drive - unknown credential structure." + all_requested = ( + self.include_files_shared_with_me + and self.include_my_drives + and self.include_shared_drives + ) + if all_requested: + # If all 3 are true, we already yielded from get_all_files_for_oauth + return + + all_drive_ids = self._get_all_drive_ids() + drive_ids_to_retrieve: set[str] = set() + folder_ids_to_retrieve: set[str] = set() + if self._requested_shared_drive_ids or self._requested_folder_ids: + drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids( + requested_drive_ids=self._requested_shared_drive_ids, + requested_folder_ids=self._requested_folder_ids, + all_drive_ids_available=all_drive_ids, + ) + elif self.include_shared_drives: + drive_ids_to_retrieve = all_drive_ids + + for drive_id in drive_ids_to_retrieve: + yield from get_files_in_shared_drive( + service=drive_service, + drive_id=drive_id, + is_slim=is_slim, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, ) - self.creds = creds - return new_creds_dict + # Even if no folders were requested, we still check if any drives were requested + # that could be folders. + remaining_folders = folder_ids_to_retrieve - self._retrieved_ids + for folder_id in remaining_folders: + yield from crawl_folders_for_files( + service=drive_service, + parent_id=folder_id, + traversed_parent_ids=self._retrieved_ids, + update_traversed_ids_func=self._update_traversed_parent_ids, + start=start, + end=end, + ) - def _fetch_docs_from_drive( + remaining_folders = ( + drive_ids_to_retrieve | folder_ids_to_retrieve + ) - self._retrieved_ids + if remaining_folders: + logger.warning( + f"Some folders/drives were not retrieved. 
IDs: {remaining_folders}" + ) + + def _fetch_drive_items( self, + is_slim: bool, start: SecondsSinceUnixEpoch | None = None, end: SecondsSinceUnixEpoch | None = None, - ) -> GenerateDocumentsOutput: - if self.creds is None: - raise PermissionError("Not logged into Google Drive") + ) -> Iterator[GoogleDriveFileType]: + retrieval_method = ( + self._manage_service_account_retrieval + if isinstance(self.creds, ServiceAccountCredentials) + else self._manage_oauth_retrieval + ) + return retrieval_method( + is_slim=is_slim, + start=start, + end=end, + ) - service = discovery.build("drive", "v3", credentials=self.creds) - folder_ids: Sequence[str | None] = self._process_folder_paths( - service, self.folder_paths, self.include_shared, self.follow_shortcuts + def _extract_docs_from_google_drive( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateDocumentsOutput: + # Create a larger process pool for file conversion + convert_func = partial( + _convert_single_file, self.creds, self.primary_admin_email ) - if not folder_ids: - folder_ids = [None] - - file_batches = chain( - *[ - get_all_files_batched( - service=service, - continue_on_failure=self.continue_on_failure, - include_shared=self.include_shared, - follow_shortcuts=self.follow_shortcuts, - batch_size=self.batch_size, - time_range_start=start, - time_range_end=end, - folder_id=folder_id, - traverse_subfolders=True, + + # Process files in larger batches + LARGE_BATCH_SIZE = self.batch_size * 4 + files_to_process = [] + # Gather the files into batches to be processed in parallel + for file in self._fetch_drive_items(is_slim=False, start=start, end=end): + files_to_process.append(file) + if len(files_to_process) >= LARGE_BATCH_SIZE: + yield from _process_files_batch( + files_to_process, convert_func, self.batch_size ) - for folder_id in folder_ids - ] - ) - for files_batch in file_batches: - doc_batch = [] - for file in files_batch: - try: - # Skip files that are shortcuts - if file.get("mimeType") == DRIVE_SHORTCUT_TYPE: - logger.info("Ignoring Drive Shortcut Filetype") - continue - - if self.only_org_public: - if "permissions" not in file: - continue - if not any( - permission["type"] == "domain" - for permission in file["permissions"] - ): - continue - - text_contents = extract_text(file, service) or "" - - doc_batch.append( - Document( - id=file["webViewLink"], - sections=[ - Section(link=file["webViewLink"], text=text_contents) - ], - source=DocumentSource.GOOGLE_DRIVE, - semantic_identifier=file["name"], - doc_updated_at=datetime.fromisoformat( - file["modifiedTime"] - ).astimezone(timezone.utc), - metadata={} if text_contents else {IGNORE_FOR_QA: "True"}, - ) - ) - except Exception as e: - if not self.continue_on_failure: - raise e - - logger.exception( - "Ran into exception when pulling a file from Google Drive" - ) - - yield doc_batch + files_to_process = [] + + # Process any remaining files + if files_to_process: + yield from _process_files_batch( + files_to_process, convert_func, self.batch_size + ) def load_from_state(self) -> GenerateDocumentsOutput: - yield from self._fetch_docs_from_drive() + try: + yield from self._extract_docs_from_google_drive() + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: - # need to subtract 10 minutes from start time to account for modifiedTime - # 
propogation if a document is modified, it takes some time for the API to - # reflect these changes if we do not have an offset, then we may "miss" the - # update when polling - yield from self._fetch_docs_from_drive(start, end) - - -if __name__ == "__main__": - import json - import os - - service_account_json_path = os.environ.get("GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH") - if not service_account_json_path: - raise ValueError( - "Please set GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH environment variable" - ) - with open(service_account_json_path) as f: - creds = json.load(f) - - credentials_dict = { - DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: json.dumps(creds), - } - delegated_user = os.environ.get("GOOGLE_DRIVE_DELEGATED_USER") - if delegated_user: - credentials_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user - - connector = GoogleDriveConnector(include_shared=True, follow_shortcuts=True) - connector.load_credentials(credentials_dict) - document_batch_generator = connector.load_from_state() - for document_batch in document_batch_generator: - print(document_batch) - break + try: + yield from self._extract_docs_from_google_drive(start, end) + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e + + def _extract_slim_docs_from_google_drive( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + slim_batch = [] + for file in self._fetch_drive_items( + is_slim=True, + start=start, + end=end, + ): + if doc := build_slim_document(file): + slim_batch.append(doc) + if len(slim_batch) >= SLIM_BATCH_SIZE: + yield slim_batch + slim_batch = [] + yield slim_batch + + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + try: + yield from self._extract_slim_docs_from_google_drive(start, end) + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e diff --git a/backend/danswer/connectors/google_drive/connector_auth.py b/backend/danswer/connectors/google_drive/connector_auth.py deleted file mode 100644 index 0f47727e6ee..00000000000 --- a/backend/danswer/connectors/google_drive/connector_auth.py +++ /dev/null @@ -1,171 +0,0 @@ -import json -from typing import cast -from urllib.parse import parse_qs -from urllib.parse import ParseResult -from urllib.parse import urlparse - -from google.auth.transport.requests import Request # type: ignore -from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore -from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore -from google_auth_oauthlib.flow import InstalledAppFlow # type: ignore -from sqlalchemy.orm import Session - -from danswer.configs.app_configs import WEB_DOMAIN -from danswer.configs.constants import DocumentSource -from danswer.configs.constants import KV_CRED_KEY -from danswer.configs.constants import KV_GOOGLE_DRIVE_CRED_KEY -from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY -from danswer.connectors.google_drive.constants import ( - DB_CREDENTIALS_DICT_DELEGATED_USER_KEY, -) -from danswer.connectors.google_drive.constants import ( - DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, -) -from danswer.connectors.google_drive.constants import DB_CREDENTIALS_DICT_TOKEN_KEY -from danswer.connectors.google_drive.constants import SCOPES 
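For orientation, a minimal sketch of how the refactored GoogleDriveConnector can be driven, mirroring the __main__ demo that this diff removes above; this is an illustrative aside, not part of the change itself, and the GOOGLE_PRIMARY_ADMIN_EMAIL environment variable is assumed here purely for the example (the service-account key path env var is the one the removed demo already used).

import json
import os

from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
)

# Load a service account key from disk (same env var as the removed demo)
with open(os.environ["GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH"]) as f:
    service_account_key = json.load(f)

# Index all shared drives plus each user's My Drive
connector = GoogleDriveConnector(include_shared_drives=True, include_my_drives=True)
connector.load_credentials(
    {
        DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: json.dumps(service_account_key),
        # assumed env var for illustration; the connector needs the primary admin email
        DB_CREDENTIALS_PRIMARY_ADMIN_KEY: os.environ["GOOGLE_PRIMARY_ADMIN_EMAIL"],
    }
)

# load_from_state yields batches of Document objects
for document_batch in connector.load_from_state():
    print(document_batch)
    break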
-from danswer.db.credentials import update_credential_json -from danswer.db.models import User -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.server.documents.models import CredentialBase -from danswer.server.documents.models import GoogleAppCredentials -from danswer.server.documents.models import GoogleServiceAccountKey -from danswer.utils.logger import setup_logger - -logger = setup_logger() - - -def _build_frontend_google_drive_redirect() -> str: - return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback" - - -def get_google_drive_creds_for_authorized_user( - token_json_str: str, -) -> OAuthCredentials | None: - creds_json = json.loads(token_json_str) - creds = OAuthCredentials.from_authorized_user_info(creds_json, SCOPES) - if creds.valid: - return creds - - if creds.expired and creds.refresh_token: - try: - creds.refresh(Request()) - if creds.valid: - logger.notice("Refreshed Google Drive tokens.") - return creds - except Exception as e: - logger.exception(f"Failed to refresh google drive access token due to: {e}") - return None - - return None - - -def get_google_drive_creds_for_service_account( - service_account_key_json_str: str, -) -> ServiceAccountCredentials | None: - service_account_key = json.loads(service_account_key_json_str) - creds = ServiceAccountCredentials.from_service_account_info( - service_account_key, scopes=SCOPES - ) - if not creds.valid or not creds.expired: - creds.refresh(Request()) - return creds if creds.valid else None - - -def verify_csrf(credential_id: int, state: str) -> None: - csrf = get_dynamic_config_store().load(KV_CRED_KEY.format(str(credential_id))) - if csrf != state: - raise PermissionError( - "State from Google Drive Connector callback does not match expected" - ) - - -def get_auth_url(credential_id: int) -> str: - creds_str = str(get_dynamic_config_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) - credential_json = json.loads(creds_str) - flow = InstalledAppFlow.from_client_config( - credential_json, - scopes=SCOPES, - redirect_uri=_build_frontend_google_drive_redirect(), - ) - auth_url, _ = flow.authorization_url(prompt="consent") - - parsed_url = cast(ParseResult, urlparse(auth_url)) - params = parse_qs(parsed_url.query) - - get_dynamic_config_store().store( - KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True - ) # type: ignore - return str(auth_url) - - -def update_credential_access_tokens( - auth_code: str, - credential_id: int, - user: User, - db_session: Session, -) -> OAuthCredentials | None: - app_credentials = get_google_app_cred() - flow = InstalledAppFlow.from_client_config( - app_credentials.model_dump(), - scopes=SCOPES, - redirect_uri=_build_frontend_google_drive_redirect(), - ) - flow.fetch_token(code=auth_code) - creds = flow.credentials - token_json_str = creds.to_json() - new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str} - - if not update_credential_json(credential_id, new_creds_dict, user, db_session): - return None - return creds - - -def build_service_account_creds( - source: DocumentSource, - delegated_user_email: str | None = None, -) -> CredentialBase: - service_account_key = get_service_account_key() - - credential_dict = { - DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key.json(), - } - if delegated_user_email: - credential_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user_email - - return CredentialBase( - credential_json=credential_dict, - admin_public=True, - source=DocumentSource.GOOGLE_DRIVE, - ) - - -def 
get_google_app_cred() -> GoogleAppCredentials: - creds_str = str(get_dynamic_config_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) - return GoogleAppCredentials(**json.loads(creds_str)) - - -def upsert_google_app_cred(app_credentials: GoogleAppCredentials) -> None: - get_dynamic_config_store().store( - KV_GOOGLE_DRIVE_CRED_KEY, app_credentials.json(), encrypt=True - ) - - -def delete_google_app_cred() -> None: - get_dynamic_config_store().delete(KV_GOOGLE_DRIVE_CRED_KEY) - - -def get_service_account_key() -> GoogleServiceAccountKey: - creds_str = str( - get_dynamic_config_store().load(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY) - ) - return GoogleServiceAccountKey(**json.loads(creds_str)) - - -def upsert_service_account_key(service_account_key: GoogleServiceAccountKey) -> None: - get_dynamic_config_store().store( - KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True - ) - - -def delete_service_account_key() -> None: - get_dynamic_config_store().delete(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY) diff --git a/backend/danswer/connectors/google_drive/constants.py b/backend/danswer/connectors/google_drive/constants.py index 214bfd5cb97..4fdfb23d57b 100644 --- a/backend/danswer/connectors/google_drive/constants.py +++ b/backend/danswer/connectors/google_drive/constants.py @@ -1,7 +1,4 @@ -DB_CREDENTIALS_DICT_TOKEN_KEY = "google_drive_tokens" -DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key" -DB_CREDENTIALS_DICT_DELEGATED_USER_KEY = "google_drive_delegated_user" -SCOPES = [ - "https://www.googleapis.com/auth/drive.readonly", - "https://www.googleapis.com/auth/drive.metadata.readonly", -] +UNSUPPORTED_FILE_TYPE_CONTENT = "" # keep empty for now +DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder" +DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut" +DRIVE_FILE_TYPE = "application/vnd.google-apps.file" diff --git a/backend/danswer/connectors/google_drive/doc_conversion.py b/backend/danswer/connectors/google_drive/doc_conversion.py new file mode 100644 index 00000000000..a3febd9d172 --- /dev/null +++ b/backend/danswer/connectors/google_drive/doc_conversion.py @@ -0,0 +1,260 @@ +import io +from datetime import datetime +from datetime import timezone + +from googleapiclient.discovery import build # type: ignore +from googleapiclient.errors import HttpError # type: ignore + +from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE +from danswer.configs.constants import DocumentSource +from danswer.configs.constants import IGNORE_FOR_QA +from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE +from danswer.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE +from danswer.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT +from danswer.connectors.google_drive.models import GDriveMimeType +from danswer.connectors.google_drive.models import GoogleDriveFileType +from danswer.connectors.google_drive.section_extraction import get_document_sections +from danswer.connectors.google_utils.resources import GoogleDocsService +from danswer.connectors.google_utils.resources import GoogleDriveService +from danswer.connectors.models import Document +from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument +from danswer.file_processing.extract_file_text import docx_to_text +from danswer.file_processing.extract_file_text import pptx_to_text +from danswer.file_processing.extract_file_text import read_pdf_file +from danswer.file_processing.unstructured import 
get_unstructured_api_key +from danswer.file_processing.unstructured import unstructured_to_text +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +# these errors don't represent a failure in the connector, but simply files +# that can't / shouldn't be indexed +ERRORS_TO_CONTINUE_ON = [ + "cannotExportFile", + "exportSizeLimitExceeded", + "cannotDownloadFile", +] + + +def _extract_sections_basic( + file: dict[str, str], service: GoogleDriveService +) -> list[Section]: + mime_type = file["mimeType"] + link = file["webViewLink"] + + if mime_type not in set(item.value for item in GDriveMimeType): + # Unsupported file types can still have a title, finding this way is still useful + return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] + + try: + if mime_type == GDriveMimeType.SPREADSHEET.value: + try: + sheets_service = build( + "sheets", "v4", credentials=service._http.credentials + ) + spreadsheet = ( + sheets_service.spreadsheets() + .get(spreadsheetId=file["id"]) + .execute() + ) + + sections = [] + for sheet in spreadsheet["sheets"]: + sheet_name = sheet["properties"]["title"] + sheet_id = sheet["properties"]["sheetId"] + + # Get sheet dimensions + grid_properties = sheet["properties"].get("gridProperties", {}) + row_count = grid_properties.get("rowCount", 1000) + column_count = grid_properties.get("columnCount", 26) + + # Convert column count to letter (e.g., 26 -> Z, 27 -> AA) + end_column = "" + while column_count: + column_count, remainder = divmod(column_count - 1, 26) + end_column = chr(65 + remainder) + end_column + + range_name = f"'{sheet_name}'!A1:{end_column}{row_count}" + + try: + result = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=file["id"], range=range_name) + .execute() + ) + values = result.get("values", []) + + if values: + text = f"Sheet: {sheet_name}\n" + for row in values: + text += "\t".join(str(cell) for cell in row) + "\n" + sections.append( + Section( + link=f"{link}#gid={sheet_id}", + text=text, + ) + ) + except HttpError as e: + logger.warning( + f"Error fetching data for sheet '{sheet_name}': {e}" + ) + continue + return sections + + except Exception as e: + logger.warning( + f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'." + " Falling back to basic extraction." 
+ ) + + if mime_type in [ + GDriveMimeType.DOC.value, + GDriveMimeType.PPT.value, + GDriveMimeType.SPREADSHEET.value, + ]: + export_mime_type = ( + "text/plain" + if mime_type != GDriveMimeType.SPREADSHEET.value + else "text/csv" + ) + text = ( + service.files() + .export(fileId=file["id"], mimeType=export_mime_type) + .execute() + .decode("utf-8") + ) + return [Section(link=link, text=text)] + + elif mime_type in [ + GDriveMimeType.PLAIN_TEXT.value, + GDriveMimeType.MARKDOWN.value, + ]: + return [ + Section( + link=link, + text=service.files() + .get_media(fileId=file["id"]) + .execute() + .decode("utf-8"), + ) + ] + if mime_type in [ + GDriveMimeType.WORD_DOC.value, + GDriveMimeType.POWERPOINT.value, + GDriveMimeType.PDF.value, + ]: + response = service.files().get_media(fileId=file["id"]).execute() + if get_unstructured_api_key(): + return [ + Section( + link=link, + text=unstructured_to_text( + file=io.BytesIO(response), + file_name=file.get("name", file["id"]), + ), + ) + ] + + if mime_type == GDriveMimeType.WORD_DOC.value: + return [ + Section(link=link, text=docx_to_text(file=io.BytesIO(response))) + ] + elif mime_type == GDriveMimeType.PDF.value: + text, _ = read_pdf_file(file=io.BytesIO(response)) + return [Section(link=link, text=text)] + elif mime_type == GDriveMimeType.POWERPOINT.value: + return [ + Section(link=link, text=pptx_to_text(file=io.BytesIO(response))) + ] + + return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] + + except Exception: + return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)] + + +def convert_drive_item_to_document( + file: GoogleDriveFileType, + drive_service: GoogleDriveService, + docs_service: GoogleDocsService, +) -> Document | None: + try: + # Skip files that are shortcuts + if file.get("mimeType") == DRIVE_SHORTCUT_TYPE: + logger.info("Ignoring Drive Shortcut Filetype") + return None + # Skip files that are folders + if file.get("mimeType") == DRIVE_FOLDER_TYPE: + logger.info("Ignoring Drive Folder Filetype") + return None + + sections: list[Section] = [] + + # Special handling for Google Docs to preserve structure, link + # to headers + if file.get("mimeType") == GDriveMimeType.DOC.value: + try: + sections = get_document_sections(docs_service, file["id"]) + except Exception as e: + logger.warning( + f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'." + " Falling back to basic extraction." + ) + # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc + if not sections: + try: + # For all other file types just extract the text + sections = _extract_sections_basic(file, drive_service) + + except HttpError as e: + reason = e.error_details[0]["reason"] if e.error_details else e.reason + message = e.error_details[0]["message"] if e.error_details else e.reason + if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON: + logger.warning( + f"Could not export file '{file['name']}' due to '{message}', skipping..." 
+ ) + return None + + raise + if not sections: + return None + + return Document( + id=file["webViewLink"], + sections=sections, + source=DocumentSource.GOOGLE_DRIVE, + semantic_identifier=file["name"], + doc_updated_at=datetime.fromisoformat(file["modifiedTime"]).astimezone( + timezone.utc + ), + metadata={} + if any(section.text for section in sections) + else {IGNORE_FOR_QA: "True"}, + additional_info=file.get("id"), + ) + except Exception as e: + if not CONTINUE_ON_CONNECTOR_FAILURE: + raise e + + logger.exception("Ran into exception when pulling a file from Google Drive") + return None + + +def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: + # Skip files that are folders or shortcuts + if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]: + return None + + return SlimDocument( + id=file["webViewLink"], + perm_sync_data={ + "doc_id": file.get("id"), + "permissions": file.get("permissions", []), + "permission_ids": file.get("permissionIds", []), + "name": file.get("name"), + "owner_email": file.get("owners", [{}])[0].get("emailAddress"), + }, + ) diff --git a/backend/danswer/connectors/google_drive/file_retrieval.py b/backend/danswer/connectors/google_drive/file_retrieval.py new file mode 100644 index 00000000000..962d531b076 --- /dev/null +++ b/backend/danswer/connectors/google_drive/file_retrieval.py @@ -0,0 +1,258 @@ +from collections.abc import Callable +from collections.abc import Iterator +from datetime import datetime +from typing import Any + +from googleapiclient.discovery import Resource # type: ignore + +from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE +from danswer.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE +from danswer.connectors.google_drive.models import GoogleDriveFileType +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +FILE_FIELDS = ( + "nextPageToken, files(mimeType, id, name, permissions, modifiedTime, webViewLink, " + "shortcutDetails, owners(emailAddress))" +) +SLIM_FILE_FIELDS = ( + "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), " + "permissionIds, webViewLink, owners(emailAddress))" +) +FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)" + + +def _generate_time_range_filter( + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, +) -> str: + time_range_filter = "" + if start is not None: + time_start = datetime.utcfromtimestamp(start).isoformat() + "Z" + time_range_filter += f" and modifiedTime >= '{time_start}'" + if end is not None: + time_stop = datetime.utcfromtimestamp(end).isoformat() + "Z" + time_range_filter += f" and modifiedTime <= '{time_stop}'" + return time_range_filter + + +def _get_folders_in_parent( + service: Resource, + parent_id: str | None = None, +) -> Iterator[GoogleDriveFileType]: + # Follow shortcuts to folders + query = f"(mimeType = '{DRIVE_FOLDER_TYPE}' or mimeType = '{DRIVE_SHORTCUT_TYPE}')" + query += " and trashed = false" + + if parent_id: + query += f" and '{parent_id}' in parents" + + for file in execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + continue_on_404_or_403=True, + corpora="allDrives", + supportsAllDrives=True, + includeItemsFromAllDrives=True, + fields=FOLDER_FIELDS, + q=query, + ): + yield file + + +def 
_get_files_in_parent( + service: Resource, + parent_id: str, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + is_slim: bool = False, +) -> Iterator[GoogleDriveFileType]: + query = f"mimeType != '{DRIVE_FOLDER_TYPE}' and '{parent_id}' in parents" + query += " and trashed = false" + query += _generate_time_range_filter(start, end) + + for file in execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + continue_on_404_or_403=True, + corpora="allDrives", + supportsAllDrives=True, + includeItemsFromAllDrives=True, + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=query, + ): + yield file + + +def crawl_folders_for_files( + service: Resource, + parent_id: str, + traversed_parent_ids: set[str], + update_traversed_ids_func: Callable[[str], None], + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, +) -> Iterator[GoogleDriveFileType]: + """ + This function starts crawling from any folder. It is slower though. + """ + if parent_id in traversed_parent_ids: + logger.info(f"Skipping subfolder since already traversed: {parent_id}") + return + + found_files = False + for file in _get_files_in_parent( + service=service, + start=start, + end=end, + parent_id=parent_id, + ): + found_files = True + yield file + + if found_files: + update_traversed_ids_func(parent_id) + + for subfolder in _get_folders_in_parent( + service=service, + parent_id=parent_id, + ): + logger.info("Fetching all files in subfolder: " + subfolder["name"]) + yield from crawl_folders_for_files( + service=service, + parent_id=subfolder["id"], + traversed_parent_ids=traversed_parent_ids, + update_traversed_ids_func=update_traversed_ids_func, + start=start, + end=end, + ) + + +def get_files_in_shared_drive( + service: Resource, + drive_id: str, + is_slim: bool = False, + update_traversed_ids_func: Callable[[str], None] = lambda _: None, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, +) -> Iterator[GoogleDriveFileType]: + # If we know we are going to folder crawl later, we can cache the folders here + # Get all folders being queried and add them to the traversed set + folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'" + folder_query += " and trashed = false" + found_folders = False + for file in execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + continue_on_404_or_403=True, + corpora="drive", + driveId=drive_id, + supportsAllDrives=True, + includeItemsFromAllDrives=True, + fields="nextPageToken, files(id)", + q=folder_query, + ): + update_traversed_ids_func(file["id"]) + found_folders = True + if found_folders: + update_traversed_ids_func(drive_id) + + # Get all files in the shared drive + file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'" + file_query += " and trashed = false" + file_query += _generate_time_range_filter(start, end) + yield from execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + continue_on_404_or_403=True, + corpora="drive", + driveId=drive_id, + supportsAllDrives=True, + includeItemsFromAllDrives=True, + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=file_query, + ) + + +def get_all_files_in_my_drive( + service: Any, + update_traversed_ids_func: Callable, + is_slim: bool = False, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, +) -> Iterator[GoogleDriveFileType]: + # If we know we are going to folder crawl later, we can 
cache the folders here + # Get all folders being queried and add them to the traversed set + folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'" + folder_query += " and trashed = false" + folder_query += " and 'me' in owners" + found_folders = False + for file in execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + corpora="user", + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=folder_query, + ): + update_traversed_ids_func(file["id"]) + found_folders = True + if found_folders: + update_traversed_ids_func(get_root_folder_id(service)) + + # Then get the files + file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'" + file_query += " and trashed = false" + file_query += " and 'me' in owners" + file_query += _generate_time_range_filter(start, end) + yield from execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + corpora="user", + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=file_query, + ) + + +def get_all_files_for_oauth( + service: Any, + include_files_shared_with_me: bool, + include_my_drives: bool, + # One of the above 2 should be true + include_shared_drives: bool, + is_slim: bool = False, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, +) -> Iterator[GoogleDriveFileType]: + should_get_all = ( + include_shared_drives and include_my_drives and include_files_shared_with_me + ) + corpora = "allDrives" if should_get_all else "user" + + file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'" + file_query += " and trashed = false" + file_query += _generate_time_range_filter(start, end) + + if not should_get_all: + if include_files_shared_with_me and not include_my_drives: + file_query += " and not 'me' in owners" + if not include_files_shared_with_me and include_my_drives: + file_query += " and 'me' in owners" + + yield from execute_paginated_retrieval( + retrieval_function=service.files().list, + list_key="files", + corpora=corpora, + includeItemsFromAllDrives=should_get_all, + supportsAllDrives=should_get_all, + fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS, + q=file_query, + ) + + +# Just in case we need to get the root folder id +def get_root_folder_id(service: Resource) -> str: + # we dont paginate here because there is only one root folder per user + # https://developers.google.com/drive/api/guides/v2-to-v3-reference + return service.files().get(fileId="root", fields="id").execute()["id"] diff --git a/backend/danswer/connectors/google_drive/models.py b/backend/danswer/connectors/google_drive/models.py new file mode 100644 index 00000000000..5bb06f3c206 --- /dev/null +++ b/backend/danswer/connectors/google_drive/models.py @@ -0,0 +1,18 @@ +from enum import Enum +from typing import Any + + +class GDriveMimeType(str, Enum): + DOC = "application/vnd.google-apps.document" + SPREADSHEET = "application/vnd.google-apps.spreadsheet" + PDF = "application/pdf" + WORD_DOC = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + PPT = "application/vnd.google-apps.presentation" + POWERPOINT = ( + "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + PLAIN_TEXT = "text/plain" + MARKDOWN = "text/markdown" + + +GoogleDriveFileType = dict[str, Any] diff --git a/backend/danswer/connectors/google_drive/section_extraction.py b/backend/danswer/connectors/google_drive/section_extraction.py new file mode 100644 index 00000000000..b5809204893 --- /dev/null +++ b/backend/danswer/connectors/google_drive/section_extraction.py @@ 
-0,0 +1,105 @@ +from typing import Any + +from pydantic import BaseModel + +from danswer.connectors.google_utils.resources import GoogleDocsService +from danswer.connectors.models import Section + + +class CurrentHeading(BaseModel): + id: str + text: str + + +def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str: + """Builds a Google Doc link that jumps to a specific heading""" + # NOTE: doesn't support docs with multiple tabs atm, if we need that ask + # @Chris + return ( + f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}" + ) + + +def _extract_id_from_heading(paragraph: dict[str, Any]) -> str: + """Extracts the id from a heading paragraph element""" + return paragraph["paragraphStyle"]["headingId"] + + +def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str: + """Extracts the text content from a paragraph element""" + text_elements = [] + for element in paragraph.get("elements", []): + if "textRun" in element: + text_elements.append(element["textRun"].get("content", "")) + return "".join(text_elements) + + +def get_document_sections( + docs_service: GoogleDocsService, + doc_id: str, +) -> list[Section]: + """Extracts sections from a Google Doc, including their headings and content""" + # Fetch the document structure + doc = docs_service.documents().get(documentId=doc_id).execute() + + # Get the content + content = doc.get("body", {}).get("content", []) + + sections: list[Section] = [] + current_section: list[str] = [] + current_heading: CurrentHeading | None = None + + for element in content: + if "paragraph" not in element: + continue + + paragraph = element["paragraph"] + + # Check if this is a heading + if ( + "paragraphStyle" in paragraph + and "namedStyleType" in paragraph["paragraphStyle"] + ): + style = paragraph["paragraphStyle"]["namedStyleType"] + is_heading = style.startswith("HEADING_") + is_title = style.startswith("TITLE") + + if is_heading or is_title: + # If we were building a previous section, add it to sections list + if current_heading is not None and current_section: + heading_text = current_heading.text + section_text = f"{heading_text}\n" + "\n".join(current_section) + sections.append( + Section( + text=section_text.strip(), + link=_build_gdoc_section_link(doc_id, current_heading.id), + ) + ) + current_section = [] + + # Start new heading + heading_id = _extract_id_from_heading(paragraph) + heading_text = _extract_text_from_paragraph(paragraph) + current_heading = CurrentHeading( + id=heading_id, + text=heading_text, + ) + continue + + # Add content to current section + if current_heading is not None: + text = _extract_text_from_paragraph(paragraph) + if text.strip(): + current_section.append(text) + + # Don't forget to add the last section + if current_heading is not None and current_section: + section_text = f"{current_heading.text}\n" + "\n".join(current_section) + sections.append( + Section( + text=section_text.strip(), + link=_build_gdoc_section_link(doc_id, current_heading.id), + ) + ) + + return sections diff --git a/backend/ee/danswer/connectors/confluence/__init__.py b/backend/danswer/connectors/google_utils/__init__.py similarity index 100% rename from backend/ee/danswer/connectors/confluence/__init__.py rename to backend/danswer/connectors/google_utils/__init__.py diff --git a/backend/danswer/connectors/google_utils/google_auth.py b/backend/danswer/connectors/google_utils/google_auth.py new file mode 100644 index 00000000000..8a8c59d6af3 --- /dev/null +++ 
b/backend/danswer/connectors/google_utils/google_auth.py @@ -0,0 +1,107 @@ +import json +from typing import cast + +from google.auth.transport.requests import Request # type: ignore +from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore +from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore + +from danswer.configs.constants import DocumentSource +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_TOKEN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + GOOGLE_SCOPES, +) +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def get_google_oauth_creds( + token_json_str: str, source: DocumentSource +) -> OAuthCredentials | None: + creds_json = json.loads(token_json_str) + creds = OAuthCredentials.from_authorized_user_info( + info=creds_json, + scopes=GOOGLE_SCOPES[source], + ) + if creds.valid: + return creds + + if creds.expired and creds.refresh_token: + try: + creds.refresh(Request()) + if creds.valid: + logger.notice("Refreshed Google Drive tokens.") + return creds + except Exception: + logger.exception("Failed to refresh Google Drive access token due to:") + return None + + return None + + +def get_google_creds( + credentials: dict[str, str], + source: DocumentSource, +) -> tuple[ServiceAccountCredentials | OAuthCredentials, dict[str, str] | None]: + """Checks for two different types of credentials. + (1) A credential which holds a token acquired via a user going through + the Google OAuth flow. + (2) A credential which holds a service account key JSON file, which + can then be used to impersonate any user in the workspace. + """ + oauth_creds = None + service_creds = None + new_creds_dict = None + if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials: + # OAUTH + access_token_json_str = cast(str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY]) + oauth_creds = get_google_oauth_creds( + token_json_str=access_token_json_str, source=source + ) + + # tell caller to update token stored in DB if it has changed + # (e.g. the token has been refreshed) + new_creds_json_str = oauth_creds.to_json() if oauth_creds else "" + if new_creds_json_str != access_token_json_str: + new_creds_dict = { + DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: credentials[ + DB_CREDENTIALS_PRIMARY_ADMIN_KEY + ], + } + elif DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY in credentials: + # SERVICE ACCOUNT + service_account_key_json_str = credentials[ + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY + ] + service_account_key = json.loads(service_account_key_json_str) + + service_creds = ServiceAccountCredentials.from_service_account_info( + service_account_key, scopes=GOOGLE_SCOPES[source] + ) + + if not service_creds.valid or not service_creds.expired: + service_creds.refresh(Request()) + + if not service_creds.valid: + raise PermissionError( + f"Unable to access {source} - service account credentials are invalid." + ) + + creds: ServiceAccountCredentials | OAuthCredentials | None = ( + oauth_creds or service_creds + ) + if creds is None: + raise PermissionError( + f"Unable to access {source} - unknown credential structure."
+ ) + + return creds, new_creds_dict diff --git a/backend/danswer/connectors/google_utils/google_kv.py b/backend/danswer/connectors/google_utils/google_kv.py new file mode 100644 index 00000000000..7984681ed81 --- /dev/null +++ b/backend/danswer/connectors/google_utils/google_kv.py @@ -0,0 +1,237 @@ +import json +from typing import cast +from urllib.parse import parse_qs +from urllib.parse import ParseResult +from urllib.parse import urlparse + +from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore +from google_auth_oauthlib.flow import InstalledAppFlow # type: ignore +from sqlalchemy.orm import Session + +from danswer.configs.app_configs import WEB_DOMAIN +from danswer.configs.constants import DocumentSource +from danswer.configs.constants import KV_CRED_KEY +from danswer.configs.constants import KV_GMAIL_CRED_KEY +from danswer.configs.constants import KV_GMAIL_SERVICE_ACCOUNT_KEY +from danswer.configs.constants import KV_GOOGLE_DRIVE_CRED_KEY +from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY +from danswer.connectors.google_utils.resources import get_drive_service +from danswer.connectors.google_utils.resources import get_gmail_service +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_TOKEN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + GOOGLE_SCOPES, +) +from danswer.connectors.google_utils.shared_constants import ( + MISSING_SCOPES_ERROR_STR, +) +from danswer.connectors.google_utils.shared_constants import ( + ONYX_SCOPE_INSTRUCTIONS, +) +from danswer.db.credentials import update_credential_json +from danswer.db.models import User +from danswer.key_value_store.factory import get_kv_store +from danswer.server.documents.models import CredentialBase +from danswer.server.documents.models import GoogleAppCredentials +from danswer.server.documents.models import GoogleServiceAccountKey +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def _build_frontend_google_drive_redirect(source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback" + elif source == DocumentSource.GMAIL: + return f"{WEB_DOMAIN}/admin/connectors/gmail/auth/callback" + else: + raise ValueError(f"Unsupported source: {source}") + + +def _get_current_oauth_user(creds: OAuthCredentials, source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + drive_service = get_drive_service(creds) + user_info = ( + drive_service.about() + .get( + fields="user(emailAddress)", + ) + .execute() + ) + email = user_info.get("user", {}).get("emailAddress") + elif source == DocumentSource.GMAIL: + gmail_service = get_gmail_service(creds) + user_info = ( + gmail_service.users() + .getProfile( + userId="me", + fields="emailAddress", + ) + .execute() + ) + email = user_info.get("emailAddress") + else: + raise ValueError(f"Unsupported source: {source}") + return email + + +def verify_csrf(credential_id: int, state: str) -> None: + csrf = get_kv_store().load(KV_CRED_KEY.format(str(credential_id))) + if csrf != state: + raise PermissionError( + "State from Google Drive Connector callback does not match expected" + ) + + +def update_credential_access_tokens( + auth_code: str, + credential_id: int, + 
user: User, + db_session: Session, + source: DocumentSource, +) -> OAuthCredentials | None: + app_credentials = get_google_app_cred(source) + flow = InstalledAppFlow.from_client_config( + app_credentials.model_dump(), + scopes=GOOGLE_SCOPES[source], + redirect_uri=_build_frontend_google_drive_redirect(source), + ) + flow.fetch_token(code=auth_code) + creds = flow.credentials + token_json_str = creds.to_json() + + # Get user email from Google API so we know who + # the primary admin is for this connector + try: + email = _get_current_oauth_user(creds, source) + except Exception as e: + if MISSING_SCOPES_ERROR_STR in str(e): + raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e + raise e + + new_creds_dict = { + DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email, + } + + if not update_credential_json(credential_id, new_creds_dict, user, db_session): + return None + return creds + + +def build_service_account_creds( + source: DocumentSource, + primary_admin_email: str | None = None, +) -> CredentialBase: + service_account_key = get_service_account_key(source=source) + + credential_dict = { + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key.json(), + } + if primary_admin_email: + credential_dict[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] = primary_admin_email + + return CredentialBase( + credential_json=credential_dict, + admin_public=True, + source=source, + ) + + +def get_auth_url(credential_id: int, source: DocumentSource) -> str: + if source == DocumentSource.GOOGLE_DRIVE: + creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) + elif source == DocumentSource.GMAIL: + creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY)) + else: + raise ValueError(f"Unsupported source: {source}") + credential_json = json.loads(creds_str) + flow = InstalledAppFlow.from_client_config( + credential_json, + scopes=GOOGLE_SCOPES[source], + redirect_uri=_build_frontend_google_drive_redirect(source), + ) + auth_url, _ = flow.authorization_url(prompt="consent") + + parsed_url = cast(ParseResult, urlparse(auth_url)) + params = parse_qs(parsed_url.query) + + get_kv_store().store( + KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True + ) # type: ignore + return str(auth_url) + + +def get_google_app_cred(source: DocumentSource) -> GoogleAppCredentials: + if source == DocumentSource.GOOGLE_DRIVE: + creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY)) + elif source == DocumentSource.GMAIL: + creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY)) + else: + raise ValueError(f"Unsupported source: {source}") + return GoogleAppCredentials(**json.loads(creds_str)) + + +def upsert_google_app_cred( + app_credentials: GoogleAppCredentials, source: DocumentSource +) -> None: + if source == DocumentSource.GOOGLE_DRIVE: + get_kv_store().store( + KV_GOOGLE_DRIVE_CRED_KEY, app_credentials.json(), encrypt=True + ) + elif source == DocumentSource.GMAIL: + get_kv_store().store(KV_GMAIL_CRED_KEY, app_credentials.json(), encrypt=True) + else: + raise ValueError(f"Unsupported source: {source}") + + +def delete_google_app_cred(source: DocumentSource) -> None: + if source == DocumentSource.GOOGLE_DRIVE: + get_kv_store().delete(KV_GOOGLE_DRIVE_CRED_KEY) + elif source == DocumentSource.GMAIL: + get_kv_store().delete(KV_GMAIL_CRED_KEY) + else: + raise ValueError(f"Unsupported source: {source}") + + +def get_service_account_key(source: DocumentSource) -> GoogleServiceAccountKey: + if source == DocumentSource.GOOGLE_DRIVE: + creds_str = 
str(get_kv_store().load(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY)) + elif source == DocumentSource.GMAIL: + creds_str = str(get_kv_store().load(KV_GMAIL_SERVICE_ACCOUNT_KEY)) + else: + raise ValueError(f"Unsupported source: {source}") + return GoogleServiceAccountKey(**json.loads(creds_str)) + + +def upsert_service_account_key( + service_account_key: GoogleServiceAccountKey, source: DocumentSource +) -> None: + if source == DocumentSource.GOOGLE_DRIVE: + get_kv_store().store( + KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY, + service_account_key.json(), + encrypt=True, + ) + elif source == DocumentSource.GMAIL: + get_kv_store().store( + KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True + ) + else: + raise ValueError(f"Unsupported source: {source}") + + +def delete_service_account_key(source: DocumentSource) -> None: + if source == DocumentSource.GOOGLE_DRIVE: + get_kv_store().delete(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY) + elif source == DocumentSource.GMAIL: + get_kv_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY) + else: + raise ValueError(f"Unsupported source: {source}") diff --git a/backend/danswer/connectors/google_utils/google_utils.py b/backend/danswer/connectors/google_utils/google_utils.py new file mode 100644 index 00000000000..59a56c8db80 --- /dev/null +++ b/backend/danswer/connectors/google_utils/google_utils.py @@ -0,0 +1,125 @@ +import re +import time +from collections.abc import Callable +from collections.abc import Iterator +from datetime import datetime +from datetime import timezone +from typing import Any + +from googleapiclient.errors import HttpError # type: ignore + +from danswer.connectors.google_drive.models import GoogleDriveFileType +from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder + +logger = setup_logger() + + +# Google Drive APIs are quite flakey and may 500 for an +# extended period of time. Trying to combat here by adding a very +# long retry period (~20 minutes of trying every minute) +add_retries = retry_builder(tries=50, max_delay=30) + + +def _execute_with_retry(request: Any) -> Any: + max_attempts = 10 + attempt = 1 + + while attempt < max_attempts: + # Note for reasons unknown, the Google API will sometimes return a 429 + # and even after waiting the retry period, it will return another 429. + # It could be due to a few possibilities: + # 1. Other things are also requesting from the Gmail API with the same key + # 2. It's a rolling rate limit so the moment we get some amount of requests cleared, we hit it again very quickly + # 3. The retry-after has a maximum and we've already hit the limit for the day + # or it's something else... 
+ try: + return request.execute() + except HttpError as error: + attempt += 1 + + if error.resp.status == 429: + # Attempt to get 'Retry-After' from headers + retry_after = error.resp.get("Retry-After") + if retry_after: + sleep_time = int(retry_after) + else: + # Extract 'Retry after' timestamp from error message + match = re.search( + r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)", + str(error), + ) + if match: + retry_after_timestamp = match.group(1) + retry_after_dt = datetime.strptime( + retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" + ).replace(tzinfo=timezone.utc) + current_time = datetime.now(timezone.utc) + sleep_time = max( + int((retry_after_dt - current_time).total_seconds()), + 0, + ) + else: + logger.error( + f"No Retry-After header or timestamp found in error message: {error}" + ) + sleep_time = 60 + + sleep_time += 3 # Add a buffer to be safe + + logger.info( + f"Rate limit exceeded. Attempt {attempt}/{max_attempts}. Sleeping for {sleep_time} seconds." + ) + time.sleep(sleep_time) + + else: + raise + + # If we've exhausted all attempts + raise Exception(f"Failed to execute request after {max_attempts} attempts") + + +def execute_paginated_retrieval( + retrieval_function: Callable, + list_key: str | None = None, + continue_on_404_or_403: bool = False, + **kwargs: Any, +) -> Iterator[GoogleDriveFileType]: + """Execute a paginated retrieval from Google Drive API + Args: + retrieval_function: The specific list function to call (e.g., service.files().list) + **kwargs: Arguments to pass to the list function + """ + next_page_token = "" + while next_page_token is not None: + request_kwargs = kwargs.copy() + if next_page_token: + request_kwargs["pageToken"] = next_page_token + + try: + results = retrieval_function(**request_kwargs).execute() + except HttpError as e: + if e.resp.status >= 500: + results = add_retries( + lambda: retrieval_function(**request_kwargs).execute() + )() + elif e.resp.status == 404 or e.resp.status == 403: + if continue_on_404_or_403: + logger.debug(f"Error executing request: {e}") + results = {} + else: + raise e + elif e.resp.status == 429: + # _execute_with_retry expects the request object itself (it calls .execute() internally) + results = _execute_with_retry( + retrieval_function(**request_kwargs) + ) + else: + logger.exception("Error executing request:") + raise e + + next_page_token = results.get("nextPageToken") + if list_key: + for item in results.get(list_key, []): + yield item + else: + yield results diff --git a/backend/danswer/connectors/google_utils/resources.py b/backend/danswer/connectors/google_utils/resources.py new file mode 100644 index 00000000000..c93c5809755 --- /dev/null +++ b/backend/danswer/connectors/google_utils/resources.py @@ -0,0 +1,63 @@ +from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore +from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore +from googleapiclient.discovery import build # type: ignore +from googleapiclient.discovery import Resource # type: ignore + + +class GoogleDriveService(Resource): + pass + + +class GoogleDocsService(Resource): + pass + + +class AdminService(Resource): + pass + + +class GmailService(Resource): + pass + + +def _get_google_service( + service_name: str, + service_version: str, + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> GoogleDriveService | GoogleDocsService | AdminService | GmailService: + if isinstance(creds, ServiceAccountCredentials): + creds = creds.with_subject(user_email) + service = build(service_name,
service_version, credentials=creds) + elif isinstance(creds, OAuthCredentials): + service = build(service_name, service_version, credentials=creds) + + return service + + +def get_google_docs_service( + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> GoogleDocsService: + return _get_google_service("docs", "v1", creds, user_email) + + +def get_drive_service( + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> GoogleDriveService: + return _get_google_service("drive", "v3", creds, user_email) + + +def get_admin_service( + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> AdminService: + return _get_google_service("admin", "directory_v1", creds, user_email) + + +def get_gmail_service( + creds: ServiceAccountCredentials | OAuthCredentials, + user_email: str | None = None, +) -> GmailService: + return _get_google_service("gmail", "v1", creds, user_email) diff --git a/backend/danswer/connectors/google_utils/shared_constants.py b/backend/danswer/connectors/google_utils/shared_constants.py new file mode 100644 index 00000000000..ef3c0bb0302 --- /dev/null +++ b/backend/danswer/connectors/google_utils/shared_constants.py @@ -0,0 +1,40 @@ +from danswer.configs.constants import DocumentSource + +# NOTE: do not need https://www.googleapis.com/auth/documents.readonly +# this is counted under `/auth/drive.readonly` +GOOGLE_SCOPES = { + DocumentSource.GOOGLE_DRIVE: [ + "https://www.googleapis.com/auth/drive.readonly", + "https://www.googleapis.com/auth/drive.metadata.readonly", + "https://www.googleapis.com/auth/admin.directory.group.readonly", + "https://www.googleapis.com/auth/admin.directory.user.readonly", + ], + DocumentSource.GMAIL: [ + "https://www.googleapis.com/auth/gmail.readonly", + "https://www.googleapis.com/auth/admin.directory.user.readonly", + "https://www.googleapis.com/auth/admin.directory.group.readonly", + ], +} + +# This is the Oauth token +DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens" +# This is the service account key +DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "google_service_account_key" +# The email saved for both auth types +DB_CREDENTIALS_PRIMARY_ADMIN_KEY = "google_primary_admin" + +USER_FIELDS = "nextPageToken, users(primaryEmail)" + +# Error message substrings +MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requested" + +# Documentation and error messages +SCOPE_DOC_URL = "https://docs.danswer.dev/connectors/google_drive/overview" +ONYX_SCOPE_INSTRUCTIONS = ( + "You have upgraded Danswer without updating the Google Auth scopes. " + f"Please refer to the documentation to learn how to update the scopes: {SCOPE_DOC_URL}" +) + + +# This is the maximum number of threads that can be retrieved at once +SLIM_BATCH_SIZE = 500 diff --git a/backend/danswer/connectors/guru/connector.py b/backend/danswer/connectors/guru/connector.py index a27546425d3..510105b872f 100644 --- a/backend/danswer/connectors/guru/connector.py +++ b/backend/danswer/connectors/guru/connector.py @@ -19,13 +19,14 @@ from danswer.file_processing.html_utils import parse_html_page_basic from danswer.utils.logger import setup_logger + +logger = setup_logger() + # Potential Improvements # 1. 
Support fetching per collection via collection token (configured at connector creation) - GURU_API_BASE = "https://api.getguru.com/api/v1/" GURU_QUERY_ENDPOINT = GURU_API_BASE + "search/query" GURU_CARDS_URL = "https://app.getguru.com/card/" -logger = setup_logger() def unixtime_to_guru_time_str(unix_time: SecondsSinceUnixEpoch) -> str: diff --git a/backend/danswer/connectors/interfaces.py b/backend/danswer/connectors/interfaces.py index 3bd99792cce..c53b3de5f2f 100644 --- a/backend/danswer/connectors/interfaces.py +++ b/backend/danswer/connectors/interfaces.py @@ -3,14 +3,18 @@ from typing import Any from danswer.connectors.models import Document +from danswer.connectors.models import SlimDocument SecondsSinceUnixEpoch = float GenerateDocumentsOutput = Iterator[list[Document]] +GenerateSlimDocumentOutput = Iterator[list[SlimDocument]] class BaseConnector(abc.ABC): + REDIS_KEY_PREFIX = "da_connector_data:" + @abc.abstractmethod def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: raise NotImplementedError @@ -50,9 +54,13 @@ def poll_source( raise NotImplementedError -class IdConnector(BaseConnector): +class SlimConnector(BaseConnector): @abc.abstractmethod - def retrieve_all_source_ids(self) -> set[str]: + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: raise NotImplementedError diff --git a/backend/danswer/connectors/linear/connector.py b/backend/danswer/connectors/linear/connector.py index 8455b20f5ba..22b769562d1 100644 --- a/backend/danswer/connectors/linear/connector.py +++ b/backend/danswer/connectors/linear/connector.py @@ -18,6 +18,7 @@ from danswer.connectors.models import Section from danswer.utils.logger import setup_logger + logger = setup_logger() _NUM_RETRIES = 5 diff --git a/backend/danswer/connectors/loopio/connector.py b/backend/danswer/connectors/loopio/connector.py index e10bed87617..d3bdfe503f7 100644 --- a/backend/danswer/connectors/loopio/connector.py +++ b/backend/danswer/connectors/loopio/connector.py @@ -161,7 +161,7 @@ def _process_entries( ] doc_batch.append( Document( - id=entry["id"], + id=str(entry["id"]), sections=[Section(link=link, text=content_text)], source=DocumentSource.LOOPIO, semantic_identifier=questions[0], diff --git a/backend/danswer/connectors/mediawiki/family.py b/backend/danswer/connectors/mediawiki/family.py index 0d953066700..7554dd0f3cd 100644 --- a/backend/danswer/connectors/mediawiki/family.py +++ b/backend/danswer/connectors/mediawiki/family.py @@ -3,6 +3,7 @@ import builtins import functools import itertools +import tempfile from typing import Any from unittest import mock from urllib.parse import urlparse @@ -18,6 +19,8 @@ logger = setup_logger() +pywikibot.config.base_dir = tempfile.TemporaryDirectory().name + @mock.patch.object( builtins, "print", lambda *args: logger.info("\t".join(map(str, args))) @@ -45,8 +48,7 @@ def __init__( if any(x not in generate_family_file.NAME_CHARACTERS for x in name): raise ValueError( - 'ERROR: Name of family "{}" must be ASCII letters and digits [a-zA-Z0-9]', - name, + f'ERROR: Name of family "{name}" must be ASCII letters and digits [a-zA-Z0-9]', ) if isinstance(dointerwiki, bool): diff --git a/backend/danswer/connectors/mediawiki/wiki.py b/backend/danswer/connectors/mediawiki/wiki.py index f4ec1e02311..27edea871dc 100644 --- a/backend/danswer/connectors/mediawiki/wiki.py +++ b/backend/danswer/connectors/mediawiki/wiki.py @@ -2,7 +2,9 @@ import datetime 
import itertools +import tempfile from collections.abc import Generator +from collections.abc import Iterator from typing import Any from typing import ClassVar @@ -19,6 +21,12 @@ from danswer.connectors.mediawiki.family import family_class_dispatch from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.utils.logger import setup_logger + + +logger = setup_logger() + +pywikibot.config.base_dir = tempfile.TemporaryDirectory().name def pywikibot_timestamp_to_utc_datetime( @@ -74,7 +82,7 @@ def get_doc_from_page( sections=sections, semantic_identifier=page.title(), metadata={"categories": [category.title() for category in page.categories()]}, - id=page.pageid, + id=f"MEDIAWIKI_{page.pageid}_{page.full_url()}", ) @@ -116,14 +124,18 @@ def __init__( self.batch_size = batch_size # short names can only have ascii letters and digits - - self.family = family_class_dispatch(hostname, "Wikipedia Connector")() + self.family = family_class_dispatch(hostname, "WikipediaConnector")() self.site = pywikibot.Site(fam=self.family, code=language_code) self.categories = [ pywikibot.Category(self.site, f"Category:{category.replace(' ', '_')}") for category in categories ] - self.pages = [pywikibot.Page(self.site, page) for page in pages] + + self.pages = [] + for page in pages: + if not page: + continue + self.pages.append(pywikibot.Page(self.site, page)) def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: """Load credentials for a MediaWiki site. @@ -169,8 +181,13 @@ def _get_doc_batch( ] # Since we can specify both individual pages and categories, we need to iterate over all of them. - all_pages = itertools.chain(self.pages, *category_pages) + all_pages: Iterator[pywikibot.Page] = itertools.chain( + self.pages, *category_pages + ) for page in all_pages: + logger.info( + f"MediaWikiConnector: title='{page.title()}' url={page.full_url()}" + ) doc_batch.append( get_doc_from_page(page, self.site, self.document_source_type) ) @@ -216,5 +233,7 @@ def poll_source( print("All docs", all_docs) current = datetime.datetime.now().timestamp() one_day_ago = current - 30 * 24 * 60 * 60 # 30 days + latest_docs = list(test_connector.poll_source(one_day_ago, current)) + print("Latest docs", latest_docs) diff --git a/backend/danswer/connectors/models.py b/backend/danswer/connectors/models.py index 192aa1b206a..ba1368dc944 100644 --- a/backend/danswer/connectors/models.py +++ b/backend/danswer/connectors/models.py @@ -14,7 +14,7 @@ class InputType(str, Enum): LOAD_STATE = "load_state" # e.g. loading a current full state or a save state, such as from a file POLL = "poll" # e.g. calling an API to get all documents in the last hour EVENT = "event" # e.g. registered an endpoint as a listener, and processing connector events - PRUNE = "prune" + SLIM_RETRIEVAL = "slim_retrieval" class ConnectorMissingCredentialError(PermissionError): @@ -113,6 +113,9 @@ class DocumentBase(BaseModel): # The default title is semantic_identifier though unless otherwise specified title: str | None = None from_ingestion_api: bool = False + # Anything else that may be useful that is specific to this particular connector type that other + # parts of the code may need. 
If you're unsure, this can be left as None + additional_info: Any = None def get_title_for_document_index( self, @@ -166,6 +169,11 @@ def from_base(cls, base: DocumentBase) -> "Document": ) +class SlimDocument(BaseModel): + id: str + perm_sync_data: Any | None = None + + class DocumentErrorSummary(BaseModel): id: str semantic_id: str diff --git a/backend/danswer/connectors/notion/connector.py b/backend/danswer/connectors/notion/connector.py index 7878434da04..4680c3d044b 100644 --- a/backend/danswer/connectors/notion/connector.py +++ b/backend/danswer/connectors/notion/connector.py @@ -29,6 +29,9 @@ _NOTION_CALL_TIMEOUT = 30 # 30 seconds +# TODO: Tables need to be ingested, Pages need to have their metadata ingested + + @dataclass class NotionPage: """Represents a Notion Page object""" @@ -40,6 +43,8 @@ class NotionPage: properties: dict[str, Any] url: str + database_name: str | None # Only applicable to the database type page (wiki) + def __init__(self, **kwargs: dict[str, Any]) -> None: names = set([f.name for f in fields(self)]) for k, v in kwargs.items(): @@ -47,6 +52,17 @@ def __init__(self, **kwargs: dict[str, Any]) -> None: setattr(self, k, v) +@dataclass +class NotionBlock: + """Represents a Notion Block object""" + + id: str # Used for the URL + text: str + # In a plaintext representation of the page, how this block should be joined + # with the existing text up to this point, separated out from text for clarity + prefix: str + + @dataclass class NotionSearchResponse: """Represents the response from the Notion Search API""" @@ -62,7 +78,6 @@ def __init__(self, **kwargs: dict[str, Any]) -> None: setattr(self, k, v) -# TODO - Add the ability to optionally limit to specific Notion databases class NotionConnector(LoadConnector, PollConnector): """Notion Page connector that reads all Notion pages this integration has been granted access to. @@ -119,28 +134,59 @@ def _fetch_child_blocks( f"This is likely due to the block not being shared " f"with the Danswer integration. Exact exception:\n\n{e}" ) - return None - logger.exception(f"Error fetching blocks - {res.json()}") - raise e + else: + logger.exception( + f"Error fetching blocks with status code {res.status_code}: {res.json()}" + ) + + # This can occasionally happen, the reason is unknown and cannot be reproduced on our internal Notion + # Assuming this will not be a critical loss of data + return None return res.json() @retry(tries=3, delay=1, backoff=2) def _fetch_page(self, page_id: str) -> NotionPage: - """Fetch a page from it's ID via the Notion API.""" + """Fetch a page from its ID via the Notion API, retry with database if page fetch fails.""" logger.debug(f"Fetching page for ID '{page_id}'") - block_url = f"https://api.notion.com/v1/pages/{page_id}" + page_url = f"https://api.notion.com/v1/pages/{page_id}" res = rl_requests.get( - block_url, + page_url, headers=self.headers, timeout=_NOTION_CALL_TIMEOUT, ) try: res.raise_for_status() except Exception as e: - logger.exception(f"Error fetching page - {res.json()}") - raise e + logger.warning( + f"Failed to fetch page, trying database for ID '{page_id}'. 
Exception: {e}" + ) + # Try fetching as a database if page fetch fails, this happens if the page is set to a wiki + # it becomes a database from the notion perspective + return self._fetch_database_as_page(page_id) return NotionPage(**res.json()) + @retry(tries=3, delay=1, backoff=2) + def _fetch_database_as_page(self, database_id: str) -> NotionPage: + """Attempt to fetch a database as a page.""" + logger.debug(f"Fetching database for ID '{database_id}' as a page") + database_url = f"https://api.notion.com/v1/databases/{database_id}" + res = rl_requests.get( + database_url, + headers=self.headers, + timeout=_NOTION_CALL_TIMEOUT, + ) + try: + res.raise_for_status() + except Exception as e: + logger.exception(f"Error fetching database as page - {res.json()}") + raise e + database_name = res.json().get("title") + database_name = ( + database_name[0].get("text", {}).get("content") if database_name else None + ) + + return NotionPage(**res.json(), database_name=database_name) + @retry(tries=3, delay=1, backoff=2) def _fetch_database( self, database_id: str, cursor: str | None = None @@ -171,8 +217,86 @@ def _fetch_database( raise e return res.json() - def _read_pages_from_database(self, database_id: str) -> list[str]: - """Returns a list of all page IDs in the database""" + @staticmethod + def _properties_to_str(properties: dict[str, Any]) -> str: + """Converts Notion properties to a string""" + + def _recurse_properties(inner_dict: dict[str, Any]) -> str | None: + while "type" in inner_dict: + type_name = inner_dict["type"] + inner_dict = inner_dict[type_name] + + # If the innermost layer is None, the value is not set + if not inner_dict: + return None + + if isinstance(inner_dict, list): + list_properties = [ + _recurse_properties(item) for item in inner_dict if item + ] + return ( + ", ".join( + [ + list_property + for list_property in list_properties + if list_property + ] + ) + or None + ) + + # TODO there may be more types to handle here + if isinstance(inner_dict, str): + # For some objects the innermost value could just be a string, not sure what causes this + return inner_dict + + elif isinstance(inner_dict, dict): + if "name" in inner_dict: + return inner_dict["name"] + if "content" in inner_dict: + return inner_dict["content"] + start = inner_dict.get("start") + end = inner_dict.get("end") + if start is not None: + if end is not None: + return f"{start} - {end}" + return start + elif end is not None: + return f"Until {end}" + + if "id" in inner_dict: + # This is not useful to index, it's a reference to another Notion object + # and this ID value in plaintext is useless outside of the Notion context + logger.debug("Skipping Notion object id field property") + return None + + logger.debug(f"Unreadable property from innermost prop: {inner_dict}") + return None + + result = "" + for prop_name, prop in properties.items(): + if not prop: + continue + + try: + inner_value = _recurse_properties(prop) + except Exception as e: + # This is not a critical failure, these properties are not the actual contents of the page + # more similar to metadata + logger.warning(f"Error recursing properties for {prop_name}: {e}") + continue + # Not a perfect way to format Notion database tables but there's no perfect representation + # since this must be represented as plaintext + if inner_value: + result += f"{prop_name}: {inner_value}\t" + + return result + + def _read_pages_from_database( + self, database_id: str + ) -> tuple[list[NotionBlock], list[str]]: + """Returns a list of top level blocks and all 
page IDs in the database""" + result_blocks: list[NotionBlock] = [] result_pages: list[str] = [] cursor = None while True: @@ -181,29 +305,34 @@ def _read_pages_from_database(self, database_id: str) -> list[str]: for result in data["results"]: obj_id = result["id"] obj_type = result["object"] - if obj_type == "page": - logger.debug( - f"Found page with ID '{obj_id}' in database '{database_id}'" - ) - result_pages.append(result["id"]) - elif obj_type == "database": - logger.debug( - f"Found database with ID '{obj_id}' in database '{database_id}'" - ) - result_pages.extend(self._read_pages_from_database(obj_id)) + text = self._properties_to_str(result.get("properties", {})) + if text: + result_blocks.append(NotionBlock(id=obj_id, text=text, prefix="\n")) + + if self.recursive_index_enabled: + if obj_type == "page": + logger.debug( + f"Found page with ID '{obj_id}' in database '{database_id}'" + ) + result_pages.append(result["id"]) + elif obj_type == "database": + logger.debug( + f"Found database with ID '{obj_id}' in database '{database_id}'" + ) + # The inner contents are ignored at this level + _, child_pages = self._read_pages_from_database(obj_id) + result_pages.extend(child_pages) if data["next_cursor"] is None: break cursor = data["next_cursor"] - return result_pages + return result_blocks, result_pages - def _read_blocks( - self, base_block_id: str - ) -> tuple[list[tuple[str, str]], list[str]]: - """Reads all child blocks for the specified block""" - result_lines: list[tuple[str, str]] = [] + def _read_blocks(self, base_block_id: str) -> tuple[list[NotionBlock], list[str]]: + """Reads all child blocks for the specified block, returns a list of blocks and child page ids""" + result_blocks: list[NotionBlock] = [] child_pages: list[str] = [] cursor = None while True: @@ -211,7 +340,7 @@ def _read_blocks( # this happens when a block is not shared with the integration if data is None: - return result_lines, child_pages + return result_blocks, child_pages for result in data["results"]: logger.debug( @@ -255,46 +384,70 @@ def _read_blocks( if result["has_children"]: if result_type == "child_page": + # Child pages will not be included at this top level, it will be a separate document child_pages.append(result_block_id) else: logger.debug(f"Entering sub-block: {result_block_id}") - subblock_result_lines, subblock_child_pages = self._read_blocks( + subblocks, subblock_child_pages = self._read_blocks( result_block_id ) logger.debug(f"Finished sub-block: {result_block_id}") - result_lines.extend(subblock_result_lines) + result_blocks.extend(subblocks) child_pages.extend(subblock_child_pages) - if result_type == "child_database" and self.recursive_index_enabled: - child_pages.extend(self._read_pages_from_database(result_block_id)) - - cur_result_text = "\n".join(cur_result_text_arr) - if cur_result_text: - result_lines.append((cur_result_text, result_block_id)) + if result_type == "child_database": + inner_blocks, inner_child_pages = self._read_pages_from_database( + result_block_id + ) + # A database on a page often looks like a table, we need to include it for the contents + # of the page but the children (cells) should be processed as other Documents + result_blocks.extend(inner_blocks) + + if self.recursive_index_enabled: + child_pages.extend(inner_child_pages) + + if cur_result_text_arr: + new_block = NotionBlock( + id=result_block_id, + text="\n".join(cur_result_text_arr), + prefix="\n", + ) + result_blocks.append(new_block) if data["next_cursor"] is None: break cursor = 
data["next_cursor"] - return result_lines, child_pages + return result_blocks, child_pages - def _read_page_title(self, page: NotionPage) -> str: + def _read_page_title(self, page: NotionPage) -> str | None: """Extracts the title from a Notion page""" page_title = None + if hasattr(page, "database_name") and page.database_name: + return page.database_name for _, prop in page.properties.items(): if prop["type"] == "title" and len(prop["title"]) > 0: page_title = " ".join([t["plain_text"] for t in prop["title"]]).strip() break - if page_title is None: - page_title = f"Untitled Page [{page.id}]" + return page_title def _read_pages( self, pages: list[NotionPage], ) -> Generator[Document, None, None]: - """Reads pages for rich text content and generates Documents""" + """Reads pages for rich text content and generates Documents + + Note that a page which is turned into a "wiki" becomes a database but both top level pages and top level databases + do not seem to have any properties associated with them. + + Pages that are part of a database can have properties which are like the values of the row in the "database" table + in which they exist + + This is not clearly outlined in the Notion API docs but it is observable empirically. + https://developers.notion.com/docs/working-with-page-content + """ all_child_page_ids: list[str] = [] for page in pages: if page.id in self.indexed_pages: @@ -304,18 +457,23 @@ def _read_pages( logger.info(f"Reading page with ID '{page.id}', with url {page.url}") page_blocks, child_page_ids = self._read_blocks(page.id) all_child_page_ids.extend(child_page_ids) - page_title = self._read_page_title(page) + + if not page_blocks: + continue + + page_title = ( + self._read_page_title(page) or f"Untitled Page with ID {page.id}" + ) + yield ( Document( id=page.id, - # Will add title to the first section later in processing - sections=[Section(link=page.url, text="")] - + [ + sections=[ Section( - link=f"{page.url}#{block_id.replace('-', '')}", - text=block_text, + link=f"{page.url}#{block.id.replace('-', '')}", + text=block.prefix + block.text, ) - for block_text, block_id in page_blocks + for block in page_blocks ], source=DocumentSource.NOTION, semantic_identifier=page_title, diff --git a/backend/danswer/connectors/requesttracker/connector.py b/backend/danswer/connectors/requesttracker/connector.py index 9c4590fc2ef..b520d0d7acf 100644 --- a/backend/danswer/connectors/requesttracker/connector.py +++ b/backend/danswer/connectors/requesttracker/connector.py @@ -1,153 +1,124 @@ -from datetime import datetime -from datetime import timezone -from logging import DEBUG as LOG_LVL_DEBUG -from typing import Any -from typing import List -from typing import Optional - -from rt.rest1 import ALL_QUEUES -from rt.rest1 import Rt - -from danswer.configs.app_configs import INDEX_BATCH_SIZE -from danswer.configs.constants import DocumentSource -from danswer.connectors.interfaces import GenerateDocumentsOutput -from danswer.connectors.interfaces import PollConnector -from danswer.connectors.interfaces import SecondsSinceUnixEpoch -from danswer.connectors.models import ConnectorMissingCredentialError -from danswer.connectors.models import Document -from danswer.connectors.models import Section -from danswer.utils.logger import setup_logger - -logger = setup_logger() - - -class RequestTrackerError(Exception): - pass - - -class RequestTrackerConnector(PollConnector): - def __init__( - self, - batch_size: int = INDEX_BATCH_SIZE, - ) -> None: - self.batch_size = batch_size - - def txn_link(self, 
tid: int, txn: int) -> str: - return f"{self.rt_base_url}/Ticket/Display.html?id={tid}&txn={txn}" - - def build_doc_sections_from_txn( - self, connection: Rt, ticket_id: int - ) -> List[Section]: - Sections: List[Section] = [] - - get_history_resp = connection.get_history(ticket_id) - - if get_history_resp is None: - raise RequestTrackerError(f"Ticket {ticket_id} cannot be found") - - for tx in get_history_resp: - Sections.append( - Section( - link=self.txn_link(ticket_id, int(tx["id"])), - text="\n".join( - [ - f"{k}:\n{v}\n" if k != "Attachments" else "" - for (k, v) in tx.items() - ] - ), - ) - ) - return Sections - - def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]: - self.rt_username = credentials.get("requesttracker_username") - self.rt_password = credentials.get("requesttracker_password") - self.rt_base_url = credentials.get("requesttracker_base_url") - return None - - # This does not include RT file attachments yet. - def _process_tickets( - self, start: datetime, end: datetime - ) -> GenerateDocumentsOutput: - if any([self.rt_username, self.rt_password, self.rt_base_url]) is None: - raise ConnectorMissingCredentialError("requesttracker") - - Rt0 = Rt( - f"{self.rt_base_url}/REST/1.0/", - self.rt_username, - self.rt_password, - ) - - Rt0.login() - - d0 = start.strftime("%Y-%m-%d %H:%M:%S") - d1 = end.strftime("%Y-%m-%d %H:%M:%S") - - tickets = Rt0.search( - Queue=ALL_QUEUES, - raw_query=f"Updated > '{d0}' AND Updated < '{d1}'", - ) - - doc_batch: List[Document] = [] - - for ticket in tickets: - ticket_keys_to_omit = ["id", "Subject"] - tid: int = int(ticket["numerical_id"]) - ticketLink: str = f"{self.rt_base_url}/Ticket/Display.html?id={tid}" - logger.info(f"Processing ticket {tid}") - doc = Document( - id=ticket["id"], - # Will add title to the first section later in processing - sections=[Section(link=ticketLink, text="")] - + self.build_doc_sections_from_txn(Rt0, tid), - source=DocumentSource.REQUESTTRACKER, - semantic_identifier=ticket["Subject"], - metadata={ - key: value - for key, value in ticket.items() - if key not in ticket_keys_to_omit - }, - ) - - doc_batch.append(doc) - - if len(doc_batch) >= self.batch_size: - yield doc_batch - doc_batch = [] - - if doc_batch: - yield doc_batch - - def poll_source( - self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch - ) -> GenerateDocumentsOutput: - # Keep query short, only look behind 1 day at maximum - one_day_ago: float = end - (24 * 60 * 60) - _start: float = start if start > one_day_ago else one_day_ago - start_datetime = datetime.fromtimestamp(_start, tz=timezone.utc) - end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) - yield from self._process_tickets(start_datetime, end_datetime) - - -if __name__ == "__main__": - import time - import os - from dotenv import load_dotenv - - load_dotenv() - logger.setLevel(LOG_LVL_DEBUG) - rt_connector = RequestTrackerConnector() - rt_connector.load_credentials( - { - "requesttracker_username": os.getenv("RT_USERNAME"), - "requesttracker_password": os.getenv("RT_PASSWORD"), - "requesttracker_base_url": os.getenv("RT_BASE_URL"), - } - ) - - current = time.time() - one_day_ago = current - (24 * 60 * 60) # 1 days - latest_docs = rt_connector.poll_source(one_day_ago, current) - - for doc in latest_docs: - print(doc) +# from datetime import datetime +# from datetime import timezone +# from logging import DEBUG as LOG_LVL_DEBUG +# from typing import Any +# from typing import List +# from typing import Optional +# from rt.rest1 import 
ALL_QUEUES +# from rt.rest1 import Rt +# from danswer.configs.app_configs import INDEX_BATCH_SIZE +# from danswer.configs.constants import DocumentSource +# from danswer.connectors.interfaces import GenerateDocumentsOutput +# from danswer.connectors.interfaces import PollConnector +# from danswer.connectors.interfaces import SecondsSinceUnixEpoch +# from danswer.connectors.models import ConnectorMissingCredentialError +# from danswer.connectors.models import Document +# from danswer.connectors.models import Section +# from danswer.utils.logger import setup_logger +# logger = setup_logger() +# class RequestTrackerError(Exception): +# pass +# class RequestTrackerConnector(PollConnector): +# def __init__( +# self, +# batch_size: int = INDEX_BATCH_SIZE, +# ) -> None: +# self.batch_size = batch_size +# def txn_link(self, tid: int, txn: int) -> str: +# return f"{self.rt_base_url}/Ticket/Display.html?id={tid}&txn={txn}" +# def build_doc_sections_from_txn( +# self, connection: Rt, ticket_id: int +# ) -> List[Section]: +# Sections: List[Section] = [] +# get_history_resp = connection.get_history(ticket_id) +# if get_history_resp is None: +# raise RequestTrackerError(f"Ticket {ticket_id} cannot be found") +# for tx in get_history_resp: +# Sections.append( +# Section( +# link=self.txn_link(ticket_id, int(tx["id"])), +# text="\n".join( +# [ +# f"{k}:\n{v}\n" if k != "Attachments" else "" +# for (k, v) in tx.items() +# ] +# ), +# ) +# ) +# return Sections +# def load_credentials(self, credentials: dict[str, Any]) -> Optional[dict[str, Any]]: +# self.rt_username = credentials.get("requesttracker_username") +# self.rt_password = credentials.get("requesttracker_password") +# self.rt_base_url = credentials.get("requesttracker_base_url") +# return None +# # This does not include RT file attachments yet. 
+# def _process_tickets( +# self, start: datetime, end: datetime +# ) -> GenerateDocumentsOutput: +# if any([self.rt_username, self.rt_password, self.rt_base_url]) is None: +# raise ConnectorMissingCredentialError("requesttracker") +# Rt0 = Rt( +# f"{self.rt_base_url}/REST/1.0/", +# self.rt_username, +# self.rt_password, +# ) +# Rt0.login() +# d0 = start.strftime("%Y-%m-%d %H:%M:%S") +# d1 = end.strftime("%Y-%m-%d %H:%M:%S") +# tickets = Rt0.search( +# Queue=ALL_QUEUES, +# raw_query=f"Updated > '{d0}' AND Updated < '{d1}'", +# ) +# doc_batch: List[Document] = [] +# for ticket in tickets: +# ticket_keys_to_omit = ["id", "Subject"] +# tid: int = int(ticket["numerical_id"]) +# ticketLink: str = f"{self.rt_base_url}/Ticket/Display.html?id={tid}" +# logger.info(f"Processing ticket {tid}") +# doc = Document( +# id=ticket["id"], +# # Will add title to the first section later in processing +# sections=[Section(link=ticketLink, text="")] +# + self.build_doc_sections_from_txn(Rt0, tid), +# source=DocumentSource.REQUESTTRACKER, +# semantic_identifier=ticket["Subject"], +# metadata={ +# key: value +# for key, value in ticket.items() +# if key not in ticket_keys_to_omit +# }, +# ) +# doc_batch.append(doc) +# if len(doc_batch) >= self.batch_size: +# yield doc_batch +# doc_batch = [] +# if doc_batch: +# yield doc_batch +# def poll_source( +# self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch +# ) -> GenerateDocumentsOutput: +# # Keep query short, only look behind 1 day at maximum +# one_day_ago: float = end - (24 * 60 * 60) +# _start: float = start if start > one_day_ago else one_day_ago +# start_datetime = datetime.fromtimestamp(_start, tz=timezone.utc) +# end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) +# yield from self._process_tickets(start_datetime, end_datetime) +# if __name__ == "__main__": +# import time +# import os +# from dotenv import load_dotenv +# load_dotenv() +# logger.setLevel(LOG_LVL_DEBUG) +# rt_connector = RequestTrackerConnector() +# rt_connector.load_credentials( +# { +# "requesttracker_username": os.getenv("RT_USERNAME"), +# "requesttracker_password": os.getenv("RT_PASSWORD"), +# "requesttracker_base_url": os.getenv("RT_BASE_URL"), +# } +# ) +# current = time.time() +# one_day_ago = current - (24 * 60 * 60) # 1 days +# latest_docs = rt_connector.poll_source(one_day_ago, current) +# for doc in latest_docs: +# print(doc) diff --git a/backend/danswer/connectors/salesforce/connector.py b/backend/danswer/connectors/salesforce/connector.py index 03326df4efd..1e0fe9e1d3a 100644 --- a/backend/danswer/connectors/salesforce/connector.py +++ b/backend/danswer/connectors/salesforce/connector.py @@ -11,17 +11,25 @@ from danswer.configs.constants import DocumentSource from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc from danswer.connectors.interfaces import GenerateDocumentsOutput -from danswer.connectors.interfaces import IdConnector +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import LoadConnector from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from 
danswer.connectors.salesforce.utils import extract_dict_text from danswer.utils.logger import setup_logger + +# TODO: this connector does not work well at large scales +# the large query against a large Salesforce instance has been reported to take 1.5 hours. +# Additionally it seems to eat up more memory over time if the connection is long running (again a scale issue). + + DEFAULT_PARENT_OBJECT_TYPES = ["Account"] MAX_QUERY_LENGTH = 10000 # max query length is 20,000 characters ID_PREFIX = "SALESFORCE_" @@ -29,7 +37,7 @@ logger = setup_logger() -class SalesforceConnector(LoadConnector, PollConnector, IdConnector): +class SalesforceConnector(LoadConnector, PollConnector, SlimConnector): def __init__( self, batch_size: int = INDEX_BATCH_SIZE, @@ -243,19 +251,26 @@ def poll_source( end_datetime = datetime.utcfromtimestamp(end) return self._fetch_from_salesforce(start=start_datetime, end=end_datetime) - def retrieve_all_source_ids(self) -> set[str]: + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: if self.sf_client is None: raise ConnectorMissingCredentialError("Salesforce") - all_retrieved_ids: set[str] = set() + doc_metadata_list: list[SlimDocument] = [] for parent_object_type in self.parent_object_list: query = f"SELECT Id FROM {parent_object_type}" query_result = self.sf_client.query_all(query) - all_retrieved_ids.update( - f"{ID_PREFIX}{instance_dict.get('Id', '')}" + doc_metadata_list.extend( + SlimDocument( + id=f"{ID_PREFIX}{instance_dict.get('Id', '')}", + perm_sync_data={}, + ) for instance_dict in query_result["records"] ) - return all_retrieved_ids + yield doc_metadata_list if __name__ == "__main__": diff --git a/backend/danswer/connectors/sharepoint/connector.py b/backend/danswer/connectors/sharepoint/connector.py index e74dcbf7edd..8d99baffe33 100644 --- a/backend/danswer/connectors/sharepoint/connector.py +++ b/backend/danswer/connectors/sharepoint/connector.py @@ -25,6 +25,7 @@ from danswer.file_processing.extract_file_text import extract_file_text from danswer.utils.logger import setup_logger + logger = setup_logger() @@ -40,8 +41,8 @@ def _convert_driveitem_to_document( driveitem: DriveItem, ) -> Document: file_text = extract_file_text( - file_name=driveitem.name, file=io.BytesIO(driveitem.get_content().execute_query().value), + file_name=driveitem.name, break_on_unprocessable=False, ) diff --git a/backend/danswer/connectors/slab/connector.py b/backend/danswer/connectors/slab/connector.py index 80380ff7c29..ae76332838b 100644 --- a/backend/danswer/connectors/slab/connector.py +++ b/backend/danswer/connectors/slab/connector.py @@ -20,10 +20,13 @@ from danswer.connectors.models import Section from danswer.utils.logger import setup_logger + +logger = setup_logger() + + # Fairly generous retry because it's not understood why occasionally GraphQL requests fail even with timeout > 1 min SLAB_GRAPHQL_MAX_TRIES = 10 SLAB_API_URL = "https://api.slab.com/v1/graphql" -logger = setup_logger() def run_graphql_request( diff --git a/backend/danswer/connectors/slack/connector.py b/backend/danswer/connectors/slack/connector.py index 975653f5f61..22ace603bd4 100644 --- a/backend/danswer/connectors/slack/connector.py +++ b/backend/danswer/connectors/slack/connector.py @@ -8,24 +8,24 @@ from slack_sdk import WebClient from slack_sdk.errors import SlackApiError -from slack_sdk.web import SlackResponse from danswer.configs.app_configs import ENABLE_EXPENSIVE_EXPERT_CALLS from 
danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.constants import DocumentSource -from danswer.connectors.cross_connector_utils.retry_wrapper import retry_builder from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import GenerateSlimDocumentOutput from danswer.connectors.interfaces import PollConnector from danswer.connectors.interfaces import SecondsSinceUnixEpoch +from danswer.connectors.interfaces import SlimConnector from danswer.connectors.models import BasicExpertInfo from danswer.connectors.models import ConnectorMissingCredentialError from danswer.connectors.models import Document from danswer.connectors.models import Section +from danswer.connectors.models import SlimDocument from danswer.connectors.slack.utils import expert_info_from_slack_id from danswer.connectors.slack.utils import get_message_link -from danswer.connectors.slack.utils import make_slack_api_call_logged -from danswer.connectors.slack.utils import make_slack_api_call_paginated -from danswer.connectors.slack.utils import make_slack_api_rate_limited +from danswer.connectors.slack.utils import make_paginated_slack_api_call_w_retries +from danswer.connectors.slack.utils import make_slack_api_call_w_retries from danswer.connectors.slack.utils import SlackTextCleaner from danswer.utils.logger import setup_logger @@ -38,47 +38,18 @@ # list of messages in a thread ThreadType = list[MessageType] -basic_retry_wrapper = retry_builder() - -def _make_paginated_slack_api_call( - call: Callable[..., SlackResponse], **kwargs: Any -) -> Generator[dict[str, Any], None, None]: - return make_slack_api_call_paginated( - basic_retry_wrapper( - make_slack_api_rate_limited(make_slack_api_call_logged(call)) - ) - )(**kwargs) - - -def _make_slack_api_call( - call: Callable[..., SlackResponse], **kwargs: Any -) -> SlackResponse: - return basic_retry_wrapper( - make_slack_api_rate_limited(make_slack_api_call_logged(call)) - )(**kwargs) - - -def get_channel_info(client: WebClient, channel_id: str) -> ChannelType: - """Get information about a channel. 
Needed to convert channel ID to channel name""" - return _make_slack_api_call(client.conversations_info, channel=channel_id)[0][ - "channel" - ] - - -def _get_channels( +def _collect_paginated_channels( client: WebClient, exclude_archived: bool, - get_private: bool, + channel_types: list[str], ) -> list[ChannelType]: channels: list[dict[str, Any]] = [] - for result in _make_paginated_slack_api_call( + for result in make_paginated_slack_api_call_w_retries( client.conversations_list, exclude_archived=exclude_archived, # also get private channels the bot is added to - types=["public_channel", "private_channel"] - if get_private - else ["public_channel"], + types=channel_types, ): channels.extend(result["channels"]) @@ -88,19 +59,38 @@ def _get_channels( def get_channels( client: WebClient, exclude_archived: bool = True, + get_public: bool = True, + get_private: bool = True, ) -> list[ChannelType]: """Get all channels in the workspace""" + channels: list[dict[str, Any]] = [] + channel_types = [] + if get_public: + channel_types.append("public_channel") + if get_private: + channel_types.append("private_channel") # try getting private channels as well at first try: - return _get_channels( - client=client, exclude_archived=exclude_archived, get_private=True + channels = _collect_paginated_channels( + client=client, + exclude_archived=exclude_archived, + channel_types=channel_types, ) except SlackApiError as e: logger.info(f"Unable to fetch private channels due to - {e}") + logger.info("trying again without private channels") + if get_public: + channel_types = ["public_channel"] + else: + logger.warning("No channels to fetch") + return [] + channels = _collect_paginated_channels( + client=client, + exclude_archived=exclude_archived, + channel_types=channel_types, + ) - return _get_channels( - client=client, exclude_archived=exclude_archived, get_private=False - ) + return channels def get_channel_messages( @@ -112,14 +102,14 @@ def get_channel_messages( """Get all messages in a channel""" # join so that the bot can access messages if not channel["is_member"]: - _make_slack_api_call( + make_slack_api_call_w_retries( client.conversations_join, channel=channel["id"], is_private=channel["is_private"], ) logger.info(f"Successfully joined '{channel['name']}'") - for result in _make_paginated_slack_api_call( + for result in make_paginated_slack_api_call_w_retries( client.conversations_history, channel=channel["id"], oldest=oldest, @@ -131,7 +121,7 @@ def get_channel_messages( def get_thread(client: WebClient, channel_id: str, thread_id: str) -> ThreadType: """Get all messages in a thread""" threads: list[MessageType] = [] - for result in _make_paginated_slack_api_call( + for result in make_paginated_slack_api_call_w_retries( client.conversations_replies, channel=channel_id, ts=thread_id ): threads.extend(result["messages"]) @@ -217,12 +207,17 @@ def thread_to_doc( "group_leave", "group_archive", "group_unarchive", + "channel_leave", + "channel_name", + "channel_join", } -def _default_msg_filter(message: MessageType) -> bool: +def default_msg_filter(message: MessageType) -> bool: # Don't keep messages from bots if message.get("bot_id") or message.get("app_id"): + if message.get("bot_profile", {}).get("name") == "DanswerConnector": + return False return True # Uninformative @@ -266,14 +261,14 @@ def filter_channels( ] -def get_all_docs( +def _get_all_docs( client: WebClient, workspace: str, channels: list[str] | None = None, channel_name_regex_enabled: bool = False, oldest: str | None = None, latest: str | 
None = None, - msg_filter_func: Callable[[MessageType], bool] = _default_msg_filter, + msg_filter_func: Callable[[MessageType], bool] = default_msg_filter, ) -> Generator[Document, None, None]: """Get all documents in the workspace, channel by channel""" slack_cleaner = SlackTextCleaner(client=client) @@ -328,7 +323,54 @@ def get_all_docs( ) -class SlackPollConnector(PollConnector): +def _get_all_doc_ids( + client: WebClient, + channels: list[str] | None = None, + channel_name_regex_enabled: bool = False, + msg_filter_func: Callable[[MessageType], bool] = default_msg_filter, +) -> GenerateSlimDocumentOutput: + """ + Get all document ids in the workspace, channel by channel + This is pretty identical to get_all_docs, but it returns a set of ids instead of documents + This makes it an order of magnitude faster than get_all_docs + """ + + all_channels = get_channels(client) + filtered_channels = filter_channels( + all_channels, channels, channel_name_regex_enabled + ) + + for channel in filtered_channels: + channel_id = channel["id"] + channel_message_batches = get_channel_messages( + client=client, + channel=channel, + ) + + message_ts_set: set[str] = set() + for message_batch in channel_message_batches: + for message in message_batch: + if msg_filter_func(message): + continue + + # The document id is the channel id and the ts of the first message in the thread + # Since we already have the first message of the thread, we dont have to + # fetch the thread for id retrieval, saving time and API calls + message_ts_set.add(message["ts"]) + + channel_metadata_list: list[SlimDocument] = [] + for message_ts in message_ts_set: + channel_metadata_list.append( + SlimDocument( + id=f"{channel_id}__{message_ts}", + perm_sync_data={"channel_id": channel_id}, + ) + ) + + yield channel_metadata_list + + +class SlackPollConnector(PollConnector, SlimConnector): def __init__( self, workspace: str, @@ -349,6 +391,20 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None self.client = WebClient(token=bot_token) return None + def retrieve_all_slim_documents( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + ) -> GenerateSlimDocumentOutput: + if self.client is None: + raise ConnectorMissingCredentialError("Slack") + + return _get_all_doc_ids( + client=self.client, + channels=self.channels, + channel_name_regex_enabled=self.channel_regex_enabled, + ) + def poll_source( self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch ) -> GenerateDocumentsOutput: @@ -356,7 +412,7 @@ def poll_source( raise ConnectorMissingCredentialError("Slack") documents: list[Document] = [] - for document in get_all_docs( + for document in _get_all_docs( client=self.client, workspace=self.workspace, channels=self.channels, @@ -389,6 +445,7 @@ def poll_source( current = time.time() one_day_ago = current - 24 * 60 * 60 # 1 day + document_batches = connector.poll_source(one_day_ago, current) print(next(document_batches)) diff --git a/backend/danswer/connectors/slack/load_connector.py b/backend/danswer/connectors/slack/load_connector.py index ebcfce5b845..7350ac6284d 100644 --- a/backend/danswer/connectors/slack/load_connector.py +++ b/backend/danswer/connectors/slack/load_connector.py @@ -16,6 +16,7 @@ from danswer.connectors.slack.utils import get_message_link from danswer.utils.logger import setup_logger + logger = setup_logger() diff --git a/backend/danswer/connectors/slack/utils.py b/backend/danswer/connectors/slack/utils.py index 
8650ce9ddc9..78bc42a0926 100644 --- a/backend/danswer/connectors/slack/utils.py +++ b/backend/danswer/connectors/slack/utils.py @@ -12,9 +12,11 @@ from danswer.connectors.models import BasicExpertInfo from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder logger = setup_logger() +basic_retry_wrapper = retry_builder() # number of messages we request per page when fetching paginated slack messages _SLACK_LIMIT = 900 @@ -34,7 +36,7 @@ def get_message_link( ) -def make_slack_api_call_logged( +def _make_slack_api_call_logged( call: Callable[..., SlackResponse], ) -> Callable[..., SlackResponse]: @wraps(call) @@ -47,7 +49,7 @@ def logged_call(**kwargs: Any) -> SlackResponse: return logged_call -def make_slack_api_call_paginated( +def _make_slack_api_call_paginated( call: Callable[..., SlackResponse], ) -> Callable[..., Generator[dict[str, Any], None, None]]: """Wraps calls to slack API so that they automatically handle pagination""" @@ -116,6 +118,24 @@ def rate_limited_call(**kwargs: Any) -> SlackResponse: return rate_limited_call +def make_slack_api_call_w_retries( + call: Callable[..., SlackResponse], **kwargs: Any +) -> SlackResponse: + return basic_retry_wrapper( + make_slack_api_rate_limited(_make_slack_api_call_logged(call)) + )(**kwargs) + + +def make_paginated_slack_api_call_w_retries( + call: Callable[..., SlackResponse], **kwargs: Any +) -> Generator[dict[str, Any], None, None]: + return _make_slack_api_call_paginated( + basic_retry_wrapper( + make_slack_api_rate_limited(_make_slack_api_call_logged(call)) + ) + )(**kwargs) + + def expert_info_from_slack_id( user_id: str | None, client: WebClient, diff --git a/backend/danswer/connectors/web/connector.py b/backend/danswer/connectors/web/connector.py index bb1f64efdfe..9e406b71674 100644 --- a/backend/danswer/connectors/web/connector.py +++ b/backend/danswer/connectors/web/connector.py @@ -128,6 +128,9 @@ def get_internal_links( if not href: continue + # Account for malformed backslashes in URLs + href = href.replace("\\", "/") + if should_ignore_pound and "#" in href: href = href.split("#")[0] @@ -370,7 +373,7 @@ def load_from_state(self) -> GenerateDocumentsOutput: page.close() except Exception as e: last_error = f"Failed to fetch '{current_url}': {e}" - logger.error(last_error) + logger.exception(last_error) playwright.stop() restart_playwright = True continue diff --git a/backend/throttle.ctrl b/backend/danswer/connectors/xenforo/__init__.py similarity index 100% rename from backend/throttle.ctrl rename to backend/danswer/connectors/xenforo/__init__.py diff --git a/backend/danswer/connectors/xenforo/connector.py b/backend/danswer/connectors/xenforo/connector.py new file mode 100644 index 00000000000..7f5221543f1 --- /dev/null +++ b/backend/danswer/connectors/xenforo/connector.py @@ -0,0 +1,244 @@ +""" +This is the XenforoConnector class. It is used to connect to a Xenforo forum and load or update documents from the forum. + +To use this class, you need to provide the URL of the Xenforo forum board you want to connect to when creating an instance +of the class. The URL should be a string that starts with 'http://' or 'https://', followed by the domain name of the +forum, followed by the board name. For example: + + base_url = 'https://www.example.com/forum/boards/some-topic/' + +The `load_from_state` method is used to load documents from the forum. It takes an optional `state` parameter, which +can be used to specify a state from which to start loading documents. 
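Before the implementation itself, here is a minimal usage sketch of the connector this docstring introduces. It simply mirrors the `__main__` block at the bottom of the new file; the board URL is the docstring's own placeholder, not a real forum.

    from danswer.connectors.xenforo.connector import XenforoConnector

    connector = XenforoConnector(base_url="https://www.example.com/forum/boards/some-topic/")
    connector.load_credentials({})  # no credentials needed; a warning is logged only if some are supplied
    for doc_batch in connector.load_from_state():
        for doc in doc_batch:
            print(doc.id, doc.semantic_identifier)
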
+""" +import re +from datetime import datetime +from datetime import timedelta +from datetime import timezone +from typing import Any +from urllib.parse import urlparse + +import pytz +import requests +from bs4 import BeautifulSoup +from bs4 import Tag + +from danswer.configs.constants import DocumentSource +from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc +from danswer.connectors.interfaces import GenerateDocumentsOutput +from danswer.connectors.interfaces import LoadConnector +from danswer.connectors.models import BasicExpertInfo +from danswer.connectors.models import Document +from danswer.connectors.models import Section +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def get_title(soup: BeautifulSoup) -> str: + el = soup.find("h1", "p-title-value") + if not el: + return "" + title = el.text + for char in (";", ":", "!", "*", "/", "\\", "?", '"', "<", ">", "|"): + title = title.replace(char, "_") + return title + + +def get_pages(soup: BeautifulSoup, url: str) -> list[str]: + page_tags = soup.select("li.pageNav-page") + page_numbers = [] + for button in page_tags: + if re.match(r"^\d+$", button.text): + page_numbers.append(button.text) + + max_pages = int(max(page_numbers, key=int)) if page_numbers else 1 + + all_pages = [] + for x in range(1, int(max_pages) + 1): + all_pages.append(f"{url}page-{x}") + return all_pages + + +def parse_post_date(post_element: BeautifulSoup) -> datetime: + el = post_element.find("time") + if not isinstance(el, Tag) or "datetime" not in el.attrs: + return datetime.utcfromtimestamp(0).replace(tzinfo=timezone.utc) + + date_value = el["datetime"] + + # Ensure date_value is a string (if it's a list, take the first element) + if isinstance(date_value, list): + date_value = date_value[0] + + post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z") + return datetime_to_utc(post_date) + + +def scrape_page_posts( + soup: BeautifulSoup, + page_index: int, + url: str, + initial_run: bool, + start_time: datetime, +) -> list: + title = get_title(soup) + + documents = [] + for post in soup.find_all("div", class_="message-inner"): + post_date = parse_post_date(post) + if initial_run or post_date > start_time: + el = post.find("div", class_="bbWrapper") + if not el: + continue + post_text = el.get_text(strip=True) + "\n" + author_tag = post.find("a", class_="username") + if author_tag is None: + author_tag = post.find("span", class_="username") + author = author_tag.get_text(strip=True) if author_tag else "Deleted author" + formatted_time = post_date.strftime("%Y-%m-%d %H:%M:%S") + + # TODO: if a caller calls this for each page of a thread, it may see the + # same post multiple times if there is a sticky post + # that appears on each page of a thread. + # it's important to generate unique doc id's, so page index is part of the + # id. We may want to de-dupe this stuff inside the indexing service. 
+ document = Document( + id=f"{DocumentSource.XENFORO.value}_{title}_{page_index}_{formatted_time}", + sections=[Section(link=url, text=post_text)], + title=title, + source=DocumentSource.XENFORO, + semantic_identifier=title, + primary_owners=[BasicExpertInfo(display_name=author)], + metadata={ + "type": "post", + "author": author, + "time": formatted_time, + }, + doc_updated_at=post_date, + ) + + documents.append(document) + return documents + + +class XenforoConnector(LoadConnector): + # Class variable to track if the connector has been run before + has_been_run_before = False + + def __init__(self, base_url: str) -> None: + self.base_url = base_url + self.initial_run = not XenforoConnector.has_been_run_before + self.start = datetime.utcnow().replace(tzinfo=pytz.utc) - timedelta(days=1) + self.cookies: dict[str, str] = {} + # mimic user browser to avoid being blocked by the website (see: https://www.useragents.me/) + self.headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/121.0.0.0 Safari/537.36" + } + + def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: + if credentials: + logger.warning("Unexpected credentials provided for Xenforo Connector") + return None + + def load_from_state(self) -> GenerateDocumentsOutput: + # Standardize URL to always end in /. + if self.base_url[-1] != "/": + self.base_url += "/" + + # Remove all extra parameters from the end such as page, post. + matches = ("threads/", "boards/", "forums/") + for each in matches: + if each in self.base_url: + try: + self.base_url = self.base_url[ + 0 : self.base_url.index( + "/", self.base_url.index(each) + len(each) + ) + + 1 + ] + except ValueError: + pass + + doc_batch: list[Document] = [] + all_threads = [] + + # If the URL contains "boards/" or "forums/", find all threads. + if "boards/" in self.base_url or "forums/" in self.base_url: + pages = get_pages(self.requestsite(self.base_url), self.base_url) + + # Get all pages on thread_list_page + for pre_count, thread_list_page in enumerate(pages, start=1): + logger.info( + f"Getting pages from thread_list_page.. Current: {pre_count}/{len(pages)}\r" + ) + all_threads += self.get_threads(thread_list_page) + # If the URL contains "threads/", add the thread to the list. 
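For reference, the URL trimming performed in load_from_state above behaves like this standalone sketch (the same slicing logic lifted out of the class; the example URL is made up):

    def normalize_xenforo_url(base_url: str) -> str:
        # Mirror of the normalization above: force a trailing slash, then cut the URL
        # back to the board/thread root so page/post suffixes are dropped.
        if base_url[-1] != "/":
            base_url += "/"
        for marker in ("threads/", "boards/", "forums/"):
            if marker in base_url:
                try:
                    base_url = base_url[
                        0 : base_url.index("/", base_url.index(marker) + len(marker)) + 1
                    ]
                except ValueError:
                    pass
        return base_url


    # e.g. normalize_xenforo_url("https://forum.example.com/threads/some-topic.123/page-4")
    # returns "https://forum.example.com/threads/some-topic.123/"
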
+ elif "threads/" in self.base_url: + all_threads.append(self.base_url) + + # Process all threads + for thread_count, thread_url in enumerate(all_threads, start=1): + soup = self.requestsite(thread_url) + if soup is None: + logger.error(f"Failed to load page: {self.base_url}") + continue + pages = get_pages(soup, thread_url) + # Getting all pages for all threads + for page_index, page in enumerate(pages, start=1): + logger.info( + f"Progress: Page {page_index}/{len(pages)} - Thread {thread_count}/{len(all_threads)}\r" + ) + soup_page = self.requestsite(page) + doc_batch.extend( + scrape_page_posts( + soup_page, page_index, thread_url, self.initial_run, self.start + ) + ) + if doc_batch: + yield doc_batch + + # Mark the initial run finished after all threads and pages have been processed + XenforoConnector.has_been_run_before = True + + def get_threads(self, url: str) -> list[str]: + soup = self.requestsite(url) + thread_tags = soup.find_all(class_="structItem-title") + base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url)) + threads = [] + for x in thread_tags: + y = x.find_all(href=True) + for element in y: + link = element["href"] + if "threads/" in link: + stripped = link[0 : link.rfind("/") + 1] + if base_url + stripped not in threads: + threads.append(base_url + stripped) + return threads + + def requestsite(self, url: str) -> BeautifulSoup: + try: + response = requests.get( + url, cookies=self.cookies, headers=self.headers, timeout=10 + ) + if response.status_code != 200: + logger.error( + f"<{url}> Request Error: {response.status_code} - {response.reason}" + ) + return BeautifulSoup(response.text, "html.parser") + except TimeoutError: + logger.error("Timed out Error.") + except Exception as e: + logger.error(f"Error on {url}") + logger.exception(e) + return BeautifulSoup("", "html.parser") + + +if __name__ == "__main__": + connector = XenforoConnector( + # base_url="https://cassiopaea.org/forum/threads/how-to-change-your-emotional-state.41381/" + base_url="https://xenforo.com/community/threads/whats-new-with-enhanced-search-resource-manager-and-media-gallery-in-xenforo-2-3.220935/" + ) + document_batches = connector.load_from_state() + print(next(document_batches)) diff --git a/backend/danswer/connectors/zendesk/connector.py b/backend/danswer/connectors/zendesk/connector.py index f85f2efff57..170da788a76 100644 --- a/backend/danswer/connectors/zendesk/connector.py +++ b/backend/danswer/connectors/zendesk/connector.py @@ -1,10 +1,7 @@ +from collections.abc import Iterator from typing import Any import requests -from retry import retry -from zenpy import Zenpy # type: ignore -from zenpy.lib.api_objects import Ticket # type: ignore -from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore from danswer.configs.app_configs import INDEX_BATCH_SIZE from danswer.configs.app_configs import ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS @@ -20,43 +17,252 @@ from danswer.connectors.models import Document from danswer.connectors.models import Section from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.utils.retry_wrapper import retry_builder -def _article_to_document(article: Article, content_tags: dict[str, str]) -> Document: - author = BasicExpertInfo( - display_name=article.author.name, email=article.author.email +MAX_PAGE_SIZE = 30 # Zendesk API maximum + + +class ZendeskCredentialsNotSetUpError(PermissionError): + def __init__(self) -> None: + super().__init__( + "Zendesk Credentials are not set up, was load_credentials called?" 
+ ) + + +class ZendeskClient: + def __init__(self, subdomain: str, email: str, token: str): + self.base_url = f"https://{subdomain}.zendesk.com/api/v2" + self.auth = (f"{email}/token", token) + + @retry_builder() + def make_request(self, endpoint: str, params: dict[str, Any]) -> dict[str, Any]: + response = requests.get( + f"{self.base_url}/{endpoint}", auth=self.auth, params=params + ) + response.raise_for_status() + return response.json() + + +def _get_content_tag_mapping(client: ZendeskClient) -> dict[str, str]: + content_tags: dict[str, str] = {} + params = {"page[size]": MAX_PAGE_SIZE} + + try: + while True: + data = client.make_request("guide/content_tags", params) + + for tag in data.get("records", []): + content_tags[tag["id"]] = tag["name"] + + # Check if there are more pages + if data.get("meta", {}).get("has_more", False): + params["page[after]"] = data["meta"]["after_cursor"] + else: + break + + return content_tags + except Exception as e: + raise Exception(f"Error fetching content tags: {str(e)}") + + +def _get_articles( + client: ZendeskClient, start_time: int | None = None, page_size: int = MAX_PAGE_SIZE +) -> Iterator[dict[str, Any]]: + params = ( + {"start_time": start_time, "page[size]": page_size} + if start_time + else {"page[size]": page_size} ) - update_time = time_str_to_utc(article.updated_at) - # build metadata + while True: + data = client.make_request("help_center/articles", params) + for article in data["articles"]: + yield article + + if not data.get("meta", {}).get("has_more"): + break + params["page[after]"] = data["meta"]["after_cursor"] + + +def _get_tickets( + client: ZendeskClient, start_time: int | None = None +) -> Iterator[dict[str, Any]]: + params = {"start_time": start_time} if start_time else {"start_time": 0} + + while True: + data = client.make_request("incremental/tickets.json", params) + for ticket in data["tickets"]: + yield ticket + + if not data.get("end_of_stream", False): + params["start_time"] = data["end_time"] + else: + break + + +def _fetch_author(client: ZendeskClient, author_id: str) -> BasicExpertInfo | None: + # Skip fetching if author_id is invalid + if not author_id or author_id == "-1": + return None + + try: + author_data = client.make_request(f"users/{author_id}", {}) + user = author_data.get("user") + return ( + BasicExpertInfo(display_name=user.get("name"), email=user.get("email")) + if user and user.get("name") and user.get("email") + else None + ) + except requests.exceptions.HTTPError: + # Handle any API errors gracefully + return None + + +def _article_to_document( + article: dict[str, Any], + content_tags: dict[str, str], + author_map: dict[str, BasicExpertInfo], + client: ZendeskClient, +) -> tuple[dict[str, BasicExpertInfo] | None, Document]: + author_id = article.get("author_id") + if not author_id: + author = None + else: + author = ( + author_map.get(author_id) + if author_id in author_map + else _fetch_author(client, author_id) + ) + + new_author_mapping = {author_id: author} if author_id and author else None + + updated_at = article.get("updated_at") + update_time = time_str_to_utc(updated_at) if updated_at else None + + # Build metadata metadata: dict[str, str | list[str]] = { - "labels": [str(label) for label in article.label_names if label], + "labels": [str(label) for label in article.get("label_names", []) if label], "content_tags": [ content_tags[tag_id] - for tag_id in article.content_tag_ids + for tag_id in article.get("content_tag_ids", []) if tag_id in content_tags ], } - # remove empty values + # Remove 
empty values metadata = {k: v for k, v in metadata.items() if v} - return Document( - id=f"article:{article.id}", + return new_author_mapping, Document( + id=f"article:{article['id']}", sections=[ - Section(link=article.html_url, text=parse_html_page_basic(article.body)) + Section( + link=article.get("html_url"), + text=parse_html_page_basic(article["body"]), + ) ], source=DocumentSource.ZENDESK, - semantic_identifier=article.title, + semantic_identifier=article["title"], doc_updated_at=update_time, - primary_owners=[author], + primary_owners=[author] if author else None, metadata=metadata, ) -class ZendeskClientNotSetUpError(PermissionError): - def __init__(self) -> None: - super().__init__("Zendesk Client is not set up, was load_credentials called?") +def _get_comment_text( + comment: dict[str, Any], + author_map: dict[str, BasicExpertInfo], + client: ZendeskClient, +) -> tuple[dict[str, BasicExpertInfo] | None, str]: + author_id = comment.get("author_id") + if not author_id: + author = None + else: + author = ( + author_map.get(author_id) + if author_id in author_map + else _fetch_author(client, author_id) + ) + + new_author_mapping = {author_id: author} if author_id and author else None + + comment_text = f"Comment{' by ' + author.display_name if author and author.display_name else ''}" + comment_text += f"{' at ' + comment['created_at'] if comment.get('created_at') else ''}:\n{comment['body']}" + + return new_author_mapping, comment_text + + +def _ticket_to_document( + ticket: dict[str, Any], + author_map: dict[str, BasicExpertInfo], + client: ZendeskClient, + default_subdomain: str, +) -> tuple[dict[str, BasicExpertInfo] | None, Document]: + submitter_id = ticket.get("submitter") + if not submitter_id: + submitter = None + else: + submitter = ( + author_map.get(submitter_id) + if submitter_id in author_map + else _fetch_author(client, submitter_id) + ) + + new_author_mapping = ( + {submitter_id: submitter} if submitter_id and submitter else None + ) + + updated_at = ticket.get("updated_at") + update_time = time_str_to_utc(updated_at) if updated_at else None + + metadata: dict[str, str | list[str]] = {} + if status := ticket.get("status"): + metadata["status"] = status + if priority := ticket.get("priority"): + metadata["priority"] = priority + if tags := ticket.get("tags"): + metadata["tags"] = tags + if ticket_type := ticket.get("type"): + metadata["ticket_type"] = ticket_type + + # Fetch comments for the ticket + comments_data = client.make_request(f"tickets/{ticket.get('id')}/comments", {}) + comments = comments_data.get("comments", []) + + comment_texts = [] + for comment in comments: + new_author_mapping, comment_text = _get_comment_text( + comment, author_map, client + ) + if new_author_mapping: + author_map.update(new_author_mapping) + comment_texts.append(comment_text) + + comments_text = "\n\n".join(comment_texts) + + subject = ticket.get("subject") + full_text = f"Ticket Subject:\n{subject}\n\nComments:\n{comments_text}" + + ticket_url = ticket.get("url") + subdomain = ( + ticket_url.split("//")[1].split(".zendesk.com")[0] + if ticket_url + else default_subdomain + ) + + ticket_display_url = ( + f"https://{subdomain}.zendesk.com/agent/tickets/{ticket.get('id')}" + ) + + return new_author_mapping, Document( + id=f"zendesk_ticket_{ticket['id']}", + sections=[Section(link=ticket_display_url, text=full_text)], + source=DocumentSource.ZENDESK, + semantic_identifier=f"Ticket #{ticket['id']}: {subject or 'No Subject'}", + doc_updated_at=update_time, + primary_owners=[submitter] if 
submitter else None, + metadata=metadata, + ) class ZendeskConnector(LoadConnector, PollConnector): @@ -66,44 +272,10 @@ def __init__( content_type: str = "articles", ) -> None: self.batch_size = batch_size - self.zendesk_client: Zenpy | None = None - self.content_tags: dict[str, str] = {} self.content_type = content_type - - @retry(tries=3, delay=2, backoff=2) - def _set_content_tags( - self, subdomain: str, email: str, token: str, page_size: int = 30 - ) -> None: - # Construct the base URL - base_url = f"https://{subdomain}.zendesk.com/api/v2/guide/content_tags" - - # Set up authentication - auth = (f"{email}/token", token) - - # Set up pagination parameters - params = {"page[size]": page_size} - - try: - while True: - # Make the GET request - response = requests.get(base_url, auth=auth, params=params) - - # Check if the request was successful - if response.status_code == 200: - data = response.json() - content_tag_list = data.get("records", []) - for tag in content_tag_list: - self.content_tags[tag["id"]] = tag["name"] - - # Check if there are more pages - if data.get("meta", {}).get("has_more", False): - params["page[after]"] = data["meta"]["after_cursor"] - else: - break - else: - raise Exception(f"Error: {response.status_code}\n{response.text}") - except Exception as e: - raise Exception(f"Error fetching content tags: {str(e)}") + self.subdomain = "" + # Fetch all tags ahead of time + self.content_tags: dict[str, str] = {} def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None: # Subdomain is actually the whole URL @@ -112,87 +284,23 @@ def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None .replace("https://", "") .split(".zendesk.com")[0] ) + self.subdomain = subdomain - self.zendesk_client = Zenpy( - subdomain=subdomain, - email=credentials["zendesk_email"], - token=credentials["zendesk_token"], - ) - self._set_content_tags( - subdomain, - credentials["zendesk_email"], - credentials["zendesk_token"], + self.client = ZendeskClient( + subdomain, credentials["zendesk_email"], credentials["zendesk_token"] ) return None def load_from_state(self) -> GenerateDocumentsOutput: return self.poll_source(None, None) - def _ticket_to_document(self, ticket: Ticket) -> Document: - if self.zendesk_client is None: - raise ZendeskClientNotSetUpError() - - owner = None - if ticket.requester and ticket.requester.name and ticket.requester.email: - owner = [ - BasicExpertInfo( - display_name=ticket.requester.name, email=ticket.requester.email - ) - ] - update_time = time_str_to_utc(ticket.updated_at) if ticket.updated_at else None - - metadata: dict[str, str | list[str]] = {} - if ticket.status is not None: - metadata["status"] = ticket.status - if ticket.priority is not None: - metadata["priority"] = ticket.priority - if ticket.tags: - metadata["tags"] = ticket.tags - if ticket.type is not None: - metadata["ticket_type"] = ticket.type - - # Fetch comments for the ticket - comments = self.zendesk_client.tickets.comments(ticket=ticket) - - # Combine all comments into a single text - comments_text = "\n\n".join( - [ - f"Comment{f' by {comment.author.name}' if comment.author and comment.author.name else ''}" - f"{f' at {comment.created_at}' if comment.created_at else ''}:\n{comment.body}" - for comment in comments - if comment.body - ] - ) - - # Combine ticket description and comments - description = ( - ticket.description - if hasattr(ticket, "description") and ticket.description - else "" - ) - full_text = f"Ticket 
Description:\n{description}\n\nComments:\n{comments_text}" - - # Extract subdomain from ticket.url - subdomain = ticket.url.split("//")[1].split(".zendesk.com")[0] - - # Build the html url for the ticket - ticket_url = f"https://{subdomain}.zendesk.com/agent/tickets/{ticket.id}" - - return Document( - id=f"zendesk_ticket_{ticket.id}", - sections=[Section(link=ticket_url, text=full_text)], - source=DocumentSource.ZENDESK, - semantic_identifier=f"Ticket #{ticket.id}: {ticket.subject or 'No Subject'}", - doc_updated_at=update_time, - primary_owners=owner, - metadata=metadata, - ) - def poll_source( self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: - if self.zendesk_client is None: - raise ZendeskClientNotSetUpError() + if self.client is None: + raise ZendeskCredentialsNotSetUpError() + + self.content_tags = _get_content_tag_mapping(self.client) if self.content_type == "articles": yield from self._poll_articles(start) @@ -204,26 +312,30 @@ def poll_source( def _poll_articles( self, start: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: - articles = ( - self.zendesk_client.help_center.articles(cursor_pagination=True) # type: ignore - if start is None - else self.zendesk_client.help_center.articles.incremental( # type: ignore - start_time=int(start) - ) - ) + articles = _get_articles(self.client, start_time=int(start) if start else None) + + # This one is built on the fly as there may be more many more authors than tags + author_map: dict[str, BasicExpertInfo] = {} + doc_batch = [] for article in articles: if ( - article.body is None - or article.draft + article.get("body") is None + or article.get("draft") or any( label in ZENDESK_CONNECTOR_SKIP_ARTICLE_LABELS - for label in article.label_names + for label in article.get("label_names", []) ) ): continue - doc_batch.append(_article_to_document(article, self.content_tags)) + new_author_map, documents = _article_to_document( + article, self.content_tags, author_map, self.client + ) + if new_author_map: + author_map.update(new_author_map) + + doc_batch.append(documents) if len(doc_batch) >= self.batch_size: yield doc_batch doc_batch.clear() @@ -234,10 +346,14 @@ def _poll_articles( def _poll_tickets( self, start: SecondsSinceUnixEpoch | None ) -> GenerateDocumentsOutput: - if self.zendesk_client is None: - raise ZendeskClientNotSetUpError() + if self.client is None: + raise ZendeskCredentialsNotSetUpError() - ticket_generator = self.zendesk_client.tickets.incremental(start_time=start) + author_map: dict[str, BasicExpertInfo] = {} + + ticket_generator = _get_tickets( + self.client, start_time=int(start) if start else None + ) while True: doc_batch = [] @@ -246,10 +362,20 @@ def _poll_tickets( ticket = next(ticket_generator) # Check if the ticket status is deleted and skip it if so - if ticket.status == "deleted": + if ticket.get("status") == "deleted": continue - doc_batch.append(self._ticket_to_document(ticket)) + new_author_map, documents = _ticket_to_document( + ticket=ticket, + author_map=author_map, + client=self.client, + default_subdomain=self.subdomain, + ) + + if new_author_map: + author_map.update(new_author_map) + + doc_batch.append(documents) if len(doc_batch) >= self.batch_size: yield doc_batch @@ -267,7 +393,6 @@ def _poll_tickets( if __name__ == "__main__": import os - import time connector = ZendeskConnector() diff --git a/backend/danswer/context/search/__init__.py b/backend/danswer/context/search/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff 
--git a/backend/danswer/search/enums.py b/backend/danswer/context/search/enums.py similarity index 100% rename from backend/danswer/search/enums.py rename to backend/danswer/context/search/enums.py diff --git a/backend/danswer/search/models.py b/backend/danswer/context/search/models.py similarity index 98% rename from backend/danswer/search/models.py rename to backend/danswer/context/search/models.py index 503b07653ef..73ef28dc3b6 100644 --- a/backend/danswer/search/models.py +++ b/backend/danswer/context/search/models.py @@ -8,13 +8,13 @@ from danswer.configs.chat_configs import NUM_RETURNED_HITS from danswer.configs.constants import DocumentSource +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import OptionalSearchSetting +from danswer.context.search.enums import SearchType from danswer.db.models import Persona from danswer.db.models import SearchSettings from danswer.indexing.models import BaseChunk from danswer.indexing.models import IndexingSetting -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import OptionalSearchSetting -from danswer.search.enums import SearchType from shared_configs.enums import RerankerProvider @@ -102,6 +102,7 @@ class BaseFilters(BaseModel): class IndexFilters(BaseFilters): access_control_list: list[str] | None + tenant_id: str | None = None class ChunkMetric(BaseModel): diff --git a/backend/danswer/search/pipeline.py b/backend/danswer/context/search/pipeline.py similarity index 93% rename from backend/danswer/search/pipeline.py rename to backend/danswer/context/search/pipeline.py index 183c8729d67..21c518348e7 100644 --- a/backend/danswer/search/pipeline.py +++ b/backend/danswer/context/search/pipeline.py @@ -7,6 +7,22 @@ from danswer.chat.models import SectionRelevancePiece from danswer.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import QueryFlow +from danswer.context.search.enums import SearchType +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import InferenceChunk +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import RerankMetricsContainer +from danswer.context.search.models import RetrievalMetricsContainer +from danswer.context.search.models import SearchQuery +from danswer.context.search.models import SearchRequest +from danswer.context.search.postprocessing.postprocessing import cleanup_chunks +from danswer.context.search.postprocessing.postprocessing import search_postprocessing +from danswer.context.search.preprocessing.preprocessing import retrieval_preprocessing +from danswer.context.search.retrieval.search_runner import retrieve_chunks +from danswer.context.search.utils import inference_section_from_chunks +from danswer.context.search.utils import relevant_sections_to_indices from danswer.db.models import User from danswer.db.search_settings import get_current_search_settings from danswer.document_index.factory import get_default_document_index @@ -16,22 +32,6 @@ from danswer.llm.answering.prune_and_merge import ChunkRange from danswer.llm.answering.prune_and_merge import merge_chunk_intervals from danswer.llm.interfaces import LLM -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import QueryFlow -from danswer.search.enums import SearchType -from danswer.search.models import IndexFilters -from danswer.search.models import InferenceChunk -from 
danswer.search.models import InferenceSection -from danswer.search.models import RerankMetricsContainer -from danswer.search.models import RetrievalMetricsContainer -from danswer.search.models import SearchQuery -from danswer.search.models import SearchRequest -from danswer.search.postprocessing.postprocessing import cleanup_chunks -from danswer.search.postprocessing.postprocessing import search_postprocessing -from danswer.search.preprocessing.preprocessing import retrieval_preprocessing -from danswer.search.retrieval.search_runner import retrieve_chunks -from danswer.search.utils import inference_section_from_chunks -from danswer.search.utils import relevant_sections_to_indices from danswer.secondary_llm_flows.agentic_evaluation import evaluate_inference_section from danswer.utils.logger import setup_logger from danswer.utils.threadpool_concurrency import FunctionCall diff --git a/backend/danswer/search/postprocessing/postprocessing.py b/backend/danswer/context/search/postprocessing/postprocessing.py similarity index 96% rename from backend/danswer/search/postprocessing/postprocessing.py rename to backend/danswer/context/search/postprocessing/postprocessing.py index b4a1e48bd39..a32abca9c43 100644 --- a/backend/danswer/search/postprocessing/postprocessing.py +++ b/backend/danswer/context/search/postprocessing/postprocessing.py @@ -9,19 +9,19 @@ from danswer.configs.constants import RETURN_SEPARATOR from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MAX from danswer.configs.model_configs import CROSS_ENCODER_RANGE_MIN +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.models import ChunkMetric +from danswer.context.search.models import InferenceChunk +from danswer.context.search.models import InferenceChunkUncleaned +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import MAX_METRICS_CONTENT +from danswer.context.search.models import RerankMetricsContainer +from danswer.context.search.models import SearchQuery from danswer.document_index.document_index_utils import ( translate_boost_count_to_multiplier, ) from danswer.llm.interfaces import LLM from danswer.natural_language_processing.search_nlp_models import RerankingModel -from danswer.search.enums import LLMEvaluationType -from danswer.search.models import ChunkMetric -from danswer.search.models import InferenceChunk -from danswer.search.models import InferenceChunkUncleaned -from danswer.search.models import InferenceSection -from danswer.search.models import MAX_METRICS_CONTENT -from danswer.search.models import RerankMetricsContainer -from danswer.search.models import SearchQuery from danswer.secondary_llm_flows.chunk_usefulness import llm_batch_eval_sections from danswer.utils.logger import setup_logger from danswer.utils.threadpool_concurrency import FunctionCall diff --git a/backend/danswer/search/preprocessing/access_filters.py b/backend/danswer/context/search/preprocessing/access_filters.py similarity index 91% rename from backend/danswer/search/preprocessing/access_filters.py rename to backend/danswer/context/search/preprocessing/access_filters.py index e8141864d11..d4e0c6033bf 100644 --- a/backend/danswer/search/preprocessing/access_filters.py +++ b/backend/danswer/context/search/preprocessing/access_filters.py @@ -1,8 +1,8 @@ from sqlalchemy.orm import Session from danswer.access.access import get_acl_for_user +from danswer.context.search.models import IndexFilters from danswer.db.models import User -from danswer.search.models 
import IndexFilters def build_access_filters_for_user(user: User | None, session: Session) -> list[str]: diff --git a/backend/danswer/search/preprocessing/preprocessing.py b/backend/danswer/context/search/preprocessing/preprocessing.py similarity index 90% rename from backend/danswer/search/preprocessing/preprocessing.py rename to backend/danswer/context/search/preprocessing/preprocessing.py index 43a6a43ce88..fe886ccd9e0 100644 --- a/backend/danswer/search/preprocessing/preprocessing.py +++ b/backend/danswer/context/search/preprocessing/preprocessing.py @@ -9,26 +9,32 @@ from danswer.configs.chat_configs import HYBRID_ALPHA_KEYWORD from danswer.configs.chat_configs import NUM_POSTPROCESSED_RESULTS from danswer.configs.chat_configs import NUM_RETURNED_HITS +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import RecencyBiasSetting +from danswer.context.search.enums import SearchType +from danswer.context.search.models import BaseFilters +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import RerankingDetails +from danswer.context.search.models import SearchQuery +from danswer.context.search.models import SearchRequest +from danswer.context.search.preprocessing.access_filters import ( + build_access_filters_for_user, +) +from danswer.context.search.retrieval.search_runner import ( + remove_stop_words_and_punctuation, +) +from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR from danswer.db.models import User from danswer.db.search_settings import get_current_search_settings from danswer.llm.interfaces import LLM from danswer.natural_language_processing.search_nlp_models import QueryAnalysisModel -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import RecencyBiasSetting -from danswer.search.enums import SearchType -from danswer.search.models import BaseFilters -from danswer.search.models import IndexFilters -from danswer.search.models import RerankingDetails -from danswer.search.models import SearchQuery -from danswer.search.models import SearchRequest -from danswer.search.preprocessing.access_filters import build_access_filters_for_user -from danswer.search.retrieval.search_runner import remove_stop_words_and_punctuation from danswer.secondary_llm_flows.source_filter import extract_source_filter from danswer.secondary_llm_flows.time_filter import extract_time_filter from danswer.utils.logger import setup_logger from danswer.utils.threadpool_concurrency import FunctionCall from danswer.utils.threadpool_concurrency import run_functions_in_parallel from danswer.utils.timing import log_function_time +from shared_configs.configs import MULTI_TENANT logger = setup_logger() @@ -67,6 +73,9 @@ def retrieval_preprocessing( ] time_filter = preset_filters.time_cutoff + if time_filter is None and persona: + time_filter = persona.search_start_date + source_filter = preset_filters.source_type auto_detect_time_filter = True @@ -154,9 +163,10 @@ def retrieval_preprocessing( final_filters = IndexFilters( source_type=preset_filters.source_type or predicted_source_filters, document_set=preset_filters.document_set, - time_cutoff=preset_filters.time_cutoff or predicted_time_cutoff, + time_cutoff=time_filter or predicted_time_cutoff, tags=preset_filters.tags, # Tags are never auto-extracted access_control_list=user_acl_filters, + tenant_id=CURRENT_TENANT_ID_CONTEXTVAR.get() if MULTI_TENANT else None, ) llm_evaluation_type = LLMEvaluationType.BASIC diff --git 
a/backend/danswer/search/retrieval/search_runner.py b/backend/danswer/context/search/retrieval/search_runner.py similarity index 95% rename from backend/danswer/search/retrieval/search_runner.py rename to backend/danswer/context/search/retrieval/search_runner.py index 30347464ff8..770a19900f2 100644 --- a/backend/danswer/search/retrieval/search_runner.py +++ b/backend/danswer/context/search/retrieval/search_runner.py @@ -6,6 +6,16 @@ from nltk.tokenize import word_tokenize # type:ignore from sqlalchemy.orm import Session +from danswer.context.search.models import ChunkMetric +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import InferenceChunk +from danswer.context.search.models import InferenceChunkUncleaned +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import MAX_METRICS_CONTENT +from danswer.context.search.models import RetrievalMetricsContainer +from danswer.context.search.models import SearchQuery +from danswer.context.search.postprocessing.postprocessing import cleanup_chunks +from danswer.context.search.utils import inference_section_from_chunks from danswer.db.search_settings import get_current_search_settings from danswer.db.search_settings import get_multilingual_expansion from danswer.document_index.interfaces import DocumentIndex @@ -14,16 +24,6 @@ replace_invalid_doc_id_characters, ) from danswer.natural_language_processing.search_nlp_models import EmbeddingModel -from danswer.search.models import ChunkMetric -from danswer.search.models import IndexFilters -from danswer.search.models import InferenceChunk -from danswer.search.models import InferenceChunkUncleaned -from danswer.search.models import InferenceSection -from danswer.search.models import MAX_METRICS_CONTENT -from danswer.search.models import RetrievalMetricsContainer -from danswer.search.models import SearchQuery -from danswer.search.postprocessing.postprocessing import cleanup_chunks -from danswer.search.utils import inference_section_from_chunks from danswer.secondary_llm_flows.query_expansion import multilingual_query_expansion from danswer.utils.logger import setup_logger from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel diff --git a/backend/danswer/search/search_settings.py b/backend/danswer/context/search/search_settings.py similarity index 81% rename from backend/danswer/search/search_settings.py rename to backend/danswer/context/search/search_settings.py index d502205dfe7..917b2c9f532 100644 --- a/backend/danswer/search/search_settings.py +++ b/backend/danswer/context/search/search_settings.py @@ -1,9 +1,9 @@ from typing import cast from danswer.configs.constants import KV_SEARCH_SETTINGS -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError -from danswer.search.models import SavedSearchSettings +from danswer.context.search.models import SavedSearchSettings +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.utils.logger import setup_logger logger = setup_logger() @@ -17,10 +17,10 @@ def get_kv_search_settings() -> SavedSearchSettings | None: if the value is updated by another process/instance of the API server. If this reads from an in memory cache like reddis then it will be ok. 
Until then this has some performance implications (though minor) """ - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() try: return SavedSearchSettings(**cast(dict, kv_store.load(KV_SEARCH_SETTINGS))) - except ConfigNotFoundError: + except KvKeyNotFoundError: return None except Exception as e: logger.error(f"Error loading search settings: {e}") diff --git a/backend/danswer/search/utils.py b/backend/danswer/context/search/utils.py similarity index 92% rename from backend/danswer/search/utils.py rename to backend/danswer/context/search/utils.py index 21a95320ef5..ecbdaf35ffb 100644 --- a/backend/danswer/search/utils.py +++ b/backend/danswer/context/search/utils.py @@ -2,12 +2,12 @@ from typing import TypeVar from danswer.chat.models import SectionRelevancePiece +from danswer.context.search.models import InferenceChunk +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import SavedSearchDoc +from danswer.context.search.models import SavedSearchDocWithContent +from danswer.context.search.models import SearchDoc from danswer.db.models import SearchDoc as DBSearchDoc -from danswer.search.models import InferenceChunk -from danswer.search.models import InferenceSection -from danswer.search.models import SavedSearchDoc -from danswer.search.models import SavedSearchDocWithContent -from danswer.search.models import SearchDoc T = TypeVar( diff --git a/backend/danswer/danswerbot/slack/blocks.py b/backend/danswer/danswerbot/slack/blocks.py index 4107a381554..a5e6868fd37 100644 --- a/backend/danswer/danswerbot/slack/blocks.py +++ b/backend/danswer/danswerbot/slack/blocks.py @@ -18,20 +18,30 @@ from danswer.chat.models import DanswerQuote from danswer.configs.app_configs import DISABLE_GENERATIVE_AI +from danswer.configs.app_configs import WEB_DOMAIN from danswer.configs.constants import DocumentSource from danswer.configs.constants import SearchFeedbackType from danswer.configs.danswerbot_configs import DANSWER_BOT_NUM_DOCS_TO_DISPLAY +from danswer.context.search.models import SavedSearchDoc +from danswer.danswerbot.slack.constants import CONTINUE_IN_WEB_UI_ACTION_ID from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID from danswer.danswerbot.slack.constants import FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID from danswer.danswerbot.slack.constants import FOLLOWUP_BUTTON_ACTION_ID from danswer.danswerbot.slack.constants import FOLLOWUP_BUTTON_RESOLVED_ACTION_ID from danswer.danswerbot.slack.constants import IMMEDIATE_RESOLVED_BUTTON_ACTION_ID from danswer.danswerbot.slack.constants import LIKE_BLOCK_ACTION_ID +from danswer.danswerbot.slack.formatting import format_slack_message from danswer.danswerbot.slack.icons import source_to_github_img_link +from danswer.danswerbot.slack.models import SlackMessageInfo +from danswer.danswerbot.slack.utils import build_continue_in_web_ui_id from danswer.danswerbot.slack.utils import build_feedback_id from danswer.danswerbot.slack.utils import remove_slack_text_interactions from danswer.danswerbot.slack.utils import translate_vespa_highlight_to_slack -from danswer.search.models import SavedSearchDoc +from danswer.db.chat import get_chat_session_by_message_id +from danswer.db.engine import get_session_with_tenant +from danswer.db.models import ChannelConfig +from danswer.db.models import Persona +from danswer.one_shot_answer.models import OneShotQAResponse from danswer.utils.text_processing import decode_escapes from danswer.utils.text_processing import replace_whitespaces_w_space @@ -101,12 +111,12 
@@ def _split_text(text: str, limit: int = 3000) -> list[str]: return chunks -def clean_markdown_link_text(text: str) -> str: +def _clean_markdown_link_text(text: str) -> str: # Remove any newlines within the text return text.replace("\n", " ").strip() -def build_qa_feedback_block( +def _build_qa_feedback_block( message_id: int, feedback_reminder_id: str | None = None ) -> Block: return ActionsBlock( @@ -115,7 +125,6 @@ def build_qa_feedback_block( ButtonElement( action_id=LIKE_BLOCK_ACTION_ID, text="👍 Helpful", - style="primary", value=feedback_reminder_id, ), ButtonElement( @@ -155,7 +164,7 @@ def get_document_feedback_blocks() -> Block: ) -def build_doc_feedback_block( +def _build_doc_feedback_block( message_id: int, document_id: str, document_rank: int, @@ -182,7 +191,7 @@ def get_restate_blocks( ] -def build_documents_blocks( +def _build_documents_blocks( documents: list[SavedSearchDoc], message_id: int | None, num_docs_to_display: int = DANSWER_BOT_NUM_DOCS_TO_DISPLAY, @@ -223,7 +232,7 @@ def build_documents_blocks( feedback: ButtonElement | dict = {} if message_id is not None: - feedback = build_doc_feedback_block( + feedback = _build_doc_feedback_block( message_id=message_id, document_id=d.document_id, document_rank=rank, @@ -241,7 +250,7 @@ def build_documents_blocks( return section_blocks -def build_sources_blocks( +def _build_sources_blocks( cited_documents: list[tuple[int, SavedSearchDoc]], num_docs_to_display: int = DANSWER_BOT_NUM_DOCS_TO_DISPLAY, ) -> list[Block]: @@ -286,7 +295,7 @@ def build_sources_blocks( + ([days_ago_str] if days_ago_str else []) ) - document_title = clean_markdown_link_text(doc_sem_id) + document_title = _clean_markdown_link_text(doc_sem_id) img_link = source_to_github_img_link(d.source_type) section_blocks.append( @@ -317,7 +326,50 @@ def build_sources_blocks( return section_blocks -def build_quotes_block( +def _priority_ordered_documents_blocks( + answer: OneShotQAResponse, +) -> list[Block]: + docs_response = answer.docs if answer.docs else None + top_docs = docs_response.top_documents if docs_response else [] + llm_doc_inds = answer.llm_selected_doc_indices or [] + llm_docs = [top_docs[i] for i in llm_doc_inds] + remaining_docs = [ + doc for idx, doc in enumerate(top_docs) if idx not in llm_doc_inds + ] + priority_ordered_docs = llm_docs + remaining_docs + if not priority_ordered_docs: + return [] + + document_blocks = _build_documents_blocks( + documents=priority_ordered_docs, + message_id=answer.chat_message_id, + ) + if document_blocks: + document_blocks = [DividerBlock()] + document_blocks + return document_blocks + + +def _build_citations_blocks( + answer: OneShotQAResponse, +) -> list[Block]: + docs_response = answer.docs if answer.docs else None + top_docs = docs_response.top_documents if docs_response else [] + citations = answer.citations or [] + cited_docs = [] + for citation in citations: + matching_doc = next( + (d for d in top_docs if d.document_id == citation.document_id), + None, + ) + if matching_doc: + cited_docs.append((citation.citation_num, matching_doc)) + + cited_docs.sort() + citations_block = _build_sources_blocks(cited_documents=cited_docs) + return citations_block + + +def _build_quotes_block( quotes: list[DanswerQuote], ) -> list[Block]: quote_lines: list[str] = [] @@ -359,58 +411,70 @@ def build_quotes_block( return [SectionBlock(text="*Relevant Snippets*\n" + "\n".join(quote_lines))] -def build_qa_response_blocks( - message_id: int | None, - answer: str | None, - quotes: list[DanswerQuote] | None, - source_filters: 
list[DocumentSource] | None, - time_cutoff: datetime | None, - favor_recent: bool, +def _build_qa_response_blocks( + answer: OneShotQAResponse, skip_quotes: bool = False, process_message_for_citations: bool = False, - skip_ai_feedback: bool = False, - feedback_reminder_id: str | None = None, ) -> list[Block]: + retrieval_info = answer.docs + if not retrieval_info: + # This should not happen, even with no docs retrieved, there is still info returned + raise RuntimeError("Failed to retrieve docs, cannot answer question.") + + formatted_answer = format_slack_message(answer.answer) if answer.answer else None + quotes = answer.quotes.quotes if answer.quotes else None + if DISABLE_GENERATIVE_AI: return [] quotes_blocks: list[Block] = [] filter_block: Block | None = None - if time_cutoff or favor_recent or source_filters: + if ( + retrieval_info.applied_time_cutoff + or retrieval_info.recency_bias_multiplier > 1 + or retrieval_info.applied_source_filters + ): filter_text = "Filters: " - if source_filters: - sources_str = ", ".join([s.value for s in source_filters]) + if retrieval_info.applied_source_filters: + sources_str = ", ".join( + [s.value for s in retrieval_info.applied_source_filters] + ) filter_text += f"`Sources in [{sources_str}]`" - if time_cutoff or favor_recent: + if ( + retrieval_info.applied_time_cutoff + or retrieval_info.recency_bias_multiplier > 1 + ): filter_text += " and " - if time_cutoff is not None: - time_str = time_cutoff.strftime("%b %d, %Y") + if retrieval_info.applied_time_cutoff is not None: + time_str = retrieval_info.applied_time_cutoff.strftime("%b %d, %Y") filter_text += f"`Docs Updated >= {time_str}` " - if favor_recent: - if time_cutoff is not None: + if retrieval_info.recency_bias_multiplier > 1: + if retrieval_info.applied_time_cutoff is not None: filter_text += "+ " filter_text += "`Prioritize Recently Updated Docs`" filter_block = SectionBlock(text=f"_{filter_text}_") - if not answer: + if not formatted_answer: answer_blocks = [ SectionBlock( text="Sorry, I was unable to find an answer, but I did find some potentially relevant docs 🤓" ) ] else: - answer_processed = decode_escapes(remove_slack_text_interactions(answer)) + answer_processed = decode_escapes( + remove_slack_text_interactions(formatted_answer) + ) if process_message_for_citations: answer_processed = _process_citations_for_slack(answer_processed) answer_blocks = [ SectionBlock(text=text) for text in _split_text(answer_processed) ] if quotes: - quotes_blocks = build_quotes_block(quotes) + quotes_blocks = _build_quotes_block(quotes) - # if no quotes OR `build_quotes_block()` did not give back any blocks + # if no quotes OR `_build_quotes_block()` did not give back any blocks if not quotes_blocks: quotes_blocks = [ SectionBlock( @@ -425,20 +489,37 @@ def build_qa_response_blocks( response_blocks.extend(answer_blocks) - if message_id is not None and not skip_ai_feedback: - response_blocks.append( - build_qa_feedback_block( - message_id=message_id, feedback_reminder_id=feedback_reminder_id - ) - ) - if not skip_quotes: response_blocks.extend(quotes_blocks) return response_blocks -def build_follow_up_block(message_id: int | None) -> ActionsBlock: +def _build_continue_in_web_ui_block( + tenant_id: str | None, + message_id: int | None, +) -> Block: + if message_id is None: + raise ValueError("No message id provided to build continue in web ui block") + with get_session_with_tenant(tenant_id) as db_session: + chat_session = get_chat_session_by_message_id( + db_session=db_session, + message_id=message_id, 
+ ) + return ActionsBlock( + block_id=build_continue_in_web_ui_id(message_id), + elements=[ + ButtonElement( + action_id=CONTINUE_IN_WEB_UI_ACTION_ID, + text="Continue Chat in Danswer!", + style="primary", + url=f"{WEB_DOMAIN}/chat?slackChatId={chat_session.id}", + ), + ], + ) + + +def _build_follow_up_block(message_id: int | None) -> ActionsBlock: return ActionsBlock( block_id=build_feedback_id(message_id) if message_id is not None else None, elements=[ @@ -483,3 +564,77 @@ def build_follow_up_resolved_blocks( ] ) return [text_block, button_block] + + +def build_slack_response_blocks( + tenant_id: str | None, + message_info: SlackMessageInfo, + answer: OneShotQAResponse, + persona: Persona | None, + channel_conf: ChannelConfig | None, + use_citations: bool, + feedback_reminder_id: str | None, + skip_ai_feedback: bool = False, +) -> list[Block]: + """ + This function is a top level function that builds all the blocks for the Slack response. + It also handles combining all the blocks together. + """ + # If called with the DanswerBot slash command, the question is lost so we have to reshow it + restate_question_block = get_restate_blocks( + message_info.thread_messages[-1].message, message_info.is_bot_msg + ) + + answer_blocks = _build_qa_response_blocks( + answer=answer, + skip_quotes=persona is not None or use_citations, + process_message_for_citations=use_citations, + ) + + web_follow_up_block = [] + if channel_conf and channel_conf.get("show_continue_in_web_ui"): + web_follow_up_block.append( + _build_continue_in_web_ui_block( + tenant_id=tenant_id, + message_id=answer.chat_message_id, + ) + ) + + follow_up_block = [] + if channel_conf and channel_conf.get("follow_up_tags") is not None: + follow_up_block.append( + _build_follow_up_block(message_id=answer.chat_message_id) + ) + + ai_feedback_block = [] + if answer.chat_message_id is not None and not skip_ai_feedback: + ai_feedback_block.append( + _build_qa_feedback_block( + message_id=answer.chat_message_id, + feedback_reminder_id=feedback_reminder_id, + ) + ) + + citations_blocks = [] + document_blocks = [] + if use_citations: + # if citations are enabled, only show cited documents + citations_blocks = _build_citations_blocks(answer) + else: + document_blocks = _priority_ordered_documents_blocks(answer) + + citations_divider = [DividerBlock()] if citations_blocks else [] + buttons_divider = [DividerBlock()] if web_follow_up_block or follow_up_block else [] + + all_blocks = ( + restate_question_block + + answer_blocks + + ai_feedback_block + + citations_divider + + citations_blocks + + document_blocks + + buttons_divider + + web_follow_up_block + + follow_up_block + ) + return all_blocks diff --git a/backend/danswer/danswerbot/slack/config.py b/backend/danswer/danswerbot/slack/config.py index 3f3f211419e..d09672705e3 100644 --- a/backend/danswer/danswerbot/slack/config.py +++ b/backend/danswer/danswerbot/slack/config.py @@ -1,7 +1,9 @@ +import os + from sqlalchemy.orm import Session -from danswer.db.models import SlackBotConfig -from danswer.db.slack_bot_config import fetch_slack_bot_configs +from danswer.db.models import SlackChannelConfig +from danswer.db.slack_channel_config import fetch_slack_channel_configs VALID_SLACK_FILTERS = [ @@ -11,15 +13,19 @@ ] -def get_slack_bot_config_for_channel( - channel_name: str | None, db_session: Session -) -> SlackBotConfig | None: +def get_slack_channel_config_for_bot_and_channel( + db_session: Session, + slack_bot_id: int, + channel_name: str | None, +) -> SlackChannelConfig | None: if not 
channel_name: return None - slack_bot_configs = fetch_slack_bot_configs(db_session=db_session) + slack_bot_configs = fetch_slack_channel_configs( + db_session=db_session, slack_bot_id=slack_bot_id + ) for config in slack_bot_configs: - if channel_name in config.channel_config["channel_names"]: + if channel_name in config.channel_config["channel_name"]: return config if channel_name.startswith("inc-"): # return the config that contains the string "inc-" in the channel name @@ -29,26 +35,41 @@ def get_slack_bot_config_for_channel( return None -def validate_channel_names( - channel_names: list[str], - current_slack_bot_config_id: int | None, +def validate_channel_name( db_session: Session, -) -> list[str]: - """Make sure that these channel_names don't exist in other slack bot configs. - Returns a list of cleaned up channel names (e.g. '#' removed if present)""" - slack_bot_configs = fetch_slack_bot_configs(db_session=db_session) - cleaned_channel_names = [ - channel_name.lstrip("#").lower() for channel_name in channel_names - ] - for slack_bot_config in slack_bot_configs: - if slack_bot_config.id == current_slack_bot_config_id: + current_slack_bot_id: int, + channel_name: str, + current_slack_channel_config_id: int | None, +) -> str: + """Make sure that this channel_name does not exist in other Slack channel configs. + Returns a cleaned up channel name (e.g. '#' removed if present)""" + slack_bot_configs = fetch_slack_channel_configs( + db_session=db_session, + slack_bot_id=current_slack_bot_id, + ) + cleaned_channel_name = channel_name.lstrip("#").lower() + for slack_channel_config in slack_bot_configs: + if slack_channel_config.id == current_slack_channel_config_id: continue - for channel_name in cleaned_channel_names: - if channel_name in slack_bot_config.channel_config["channel_names"]: - raise ValueError( - f"Channel name '{channel_name}' already exists in " - "another slack bot config" - ) + if cleaned_channel_name == slack_channel_config.channel_config["channel_name"]: + raise ValueError( + f"Channel name '{channel_name}' already exists in " + "another Slack channel config in the Slack Bot with name: " + f"{slack_channel_config.slack_bot.name}" + ) + + return cleaned_channel_name + + +# Scaling configurations for multi-tenant Slack channel handling +TENANT_LOCK_EXPIRATION = 1800 # How long a pod can hold exclusive access to a tenant before other pods can acquire it +TENANT_HEARTBEAT_INTERVAL = ( + 15 # How often pods send heartbeats to indicate they are still processing a tenant +) +TENANT_HEARTBEAT_EXPIRATION = ( + 30 # How long before a tenant's heartbeat expires, allowing other pods to take over +) +TENANT_ACQUISITION_INTERVAL = 60 # How often pods attempt to acquire unprocessed tenants and check for new tokens - return cleaned_channel_names +MAX_TENANTS_PER_POD = int(os.getenv("MAX_TENANTS_PER_POD", 50)) diff --git a/backend/danswer/danswerbot/slack/constants.py b/backend/danswer/danswerbot/slack/constants.py index cf2b38032c3..6a5b3ed43ed 100644 --- a/backend/danswer/danswerbot/slack/constants.py +++ b/backend/danswer/danswerbot/slack/constants.py @@ -2,6 +2,7 @@ LIKE_BLOCK_ACTION_ID = "feedback-like" DISLIKE_BLOCK_ACTION_ID = "feedback-dislike" +CONTINUE_IN_WEB_UI_ACTION_ID = "continue-in-web-ui" FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID = "feedback-doc-button" IMMEDIATE_RESOLVED_BUTTON_ACTION_ID = "immediate-resolved-button" FOLLOWUP_BUTTON_ACTION_ID = "followup-button" diff --git a/backend/danswer/danswerbot/slack/formatting.py b/backend/danswer/danswerbot/slack/formatting.py new
file mode 100644 index 00000000000..604c879df27 --- /dev/null +++ b/backend/danswer/danswerbot/slack/formatting.py @@ -0,0 +1,66 @@ +from mistune import Markdown # type: ignore +from mistune import Renderer # type: ignore + + +def format_slack_message(message: str | None) -> str: + renderer = Markdown(renderer=SlackRenderer()) + return renderer.render(message) + + +class SlackRenderer(Renderer): + SPECIALS: dict[str, str] = {"&": "&", "<": "<", ">": ">"} + + def escape_special(self, text: str) -> str: + for special, replacement in self.SPECIALS.items(): + text = text.replace(special, replacement) + return text + + def header(self, text: str, level: int, raw: str | None = None) -> str: + return f"*{text}*\n" + + def emphasis(self, text: str) -> str: + return f"_{text}_" + + def double_emphasis(self, text: str) -> str: + return f"*{text}*" + + def strikethrough(self, text: str) -> str: + return f"~{text}~" + + def list(self, body: str, ordered: bool = True) -> str: + lines = body.split("\n") + count = 0 + for i, line in enumerate(lines): + if line.startswith("li: "): + count += 1 + prefix = f"{count}. " if ordered else "• " + lines[i] = f"{prefix}{line[4:]}" + return "\n".join(lines) + + def list_item(self, text: str) -> str: + return f"li: {text}\n" + + def link(self, link: str, title: str | None, content: str | None) -> str: + escaped_link = self.escape_special(link) + if content: + return f"<{escaped_link}|{content}>" + if title: + return f"<{escaped_link}|{title}>" + return f"<{escaped_link}>" + + def image(self, src: str, title: str | None, text: str | None) -> str: + escaped_src = self.escape_special(src) + display_text = title or text + return f"<{escaped_src}|{display_text}>" if display_text else f"<{escaped_src}>" + + def codespan(self, text: str) -> str: + return f"`{text}`" + + def block_code(self, text: str, lang: str | None) -> str: + return f"```\n{text}\n```\n" + + def paragraph(self, text: str) -> str: + return f"{text}\n" + + def autolink(self, link: str, is_email: bool) -> str: + return link if is_email else self.link(link, None, None) diff --git a/backend/danswer/danswerbot/slack/handlers/handle_buttons.py b/backend/danswer/danswerbot/slack/handlers/handle_buttons.py index 9e1c171ee4f..9335b96874f 100644 --- a/backend/danswer/danswerbot/slack/handlers/handle_buttons.py +++ b/backend/danswer/danswerbot/slack/handlers/handle_buttons.py @@ -4,9 +4,7 @@ from slack_sdk import WebClient from slack_sdk.models.blocks import SectionBlock from slack_sdk.models.views import View -from slack_sdk.socket_mode import SocketModeClient from slack_sdk.socket_mode.request import SocketModeRequest -from sqlalchemy.orm import Session from danswer.configs.constants import MessageType from danswer.configs.constants import SearchFeedbackType @@ -15,7 +13,7 @@ from danswer.connectors.slack.utils import make_slack_api_rate_limited from danswer.danswerbot.slack.blocks import build_follow_up_resolved_blocks from danswer.danswerbot.slack.blocks import get_document_feedback_blocks -from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel +from danswer.danswerbot.slack.config import get_slack_channel_config_for_bot_and_channel from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID from danswer.danswerbot.slack.constants import FeedbackVisibility from danswer.danswerbot.slack.constants import LIKE_BLOCK_ACTION_ID @@ -30,25 +28,27 @@ from danswer.danswerbot.slack.utils import build_feedback_id from danswer.danswerbot.slack.utils import decompose_action_id from 
danswer.danswerbot.slack.utils import fetch_group_ids_from_names -from danswer.danswerbot.slack.utils import fetch_user_ids_from_emails +from danswer.danswerbot.slack.utils import fetch_slack_user_ids_from_emails from danswer.danswerbot.slack.utils import get_channel_name_from_id from danswer.danswerbot.slack.utils import get_feedback_visibility from danswer.danswerbot.slack.utils import read_slack_thread from danswer.danswerbot.slack.utils import respond_in_thread +from danswer.danswerbot.slack.utils import TenantSocketModeClient from danswer.danswerbot.slack.utils import update_emote_react -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.db.feedback import create_chat_message_feedback from danswer.db.feedback import create_doc_retrieval_feedback from danswer.document_index.document_index_utils import get_both_index_names from danswer.document_index.factory import get_default_document_index from danswer.utils.logger import setup_logger + logger = setup_logger() def handle_doc_feedback_button( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, ) -> None: if not (actions := req.payload.get("actions")): logger.error("Missing actions. Unable to build the source feedback view") @@ -81,7 +81,7 @@ def handle_doc_feedback_button( def handle_generate_answer_button( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, ) -> None: channel_id = req.payload["channel"]["id"] channel_name = req.payload["channel"]["name"] @@ -116,9 +116,11 @@ def handle_generate_answer_button( thread_ts=thread_ts, ) - with Session(get_sqlalchemy_engine()) as db_session: - slack_bot_config = get_slack_bot_config_for_channel( - channel_name=channel_name, db_session=db_session + with get_session_with_tenant(client.tenant_id) as db_session: + slack_channel_config = get_slack_channel_config_for_bot_and_channel( + db_session=db_session, + slack_bot_id=client.slack_bot_id, + channel_name=channel_name, ) handle_regular_answer( @@ -133,9 +135,10 @@ def handle_generate_answer_button( is_bot_msg=False, is_bot_dm=False, ), - slack_bot_config=slack_bot_config, + slack_channel_config=slack_channel_config, receiver_ids=None, client=client.web_client, + tenant_id=client.tenant_id, channel=channel_id, logger=logger, feedback_reminder_id=None, @@ -150,12 +153,11 @@ def handle_slack_feedback( user_id_to_post_confirmation: str, channel_id_to_post_confirmation: str, thread_ts_to_post_confirmation: str, + tenant_id: str | None, ) -> None: - engine = get_sqlalchemy_engine() - message_id, doc_id, doc_rank = decompose_action_id(feedback_id) - with Session(engine) as db_session: + with get_session_with_tenant(tenant_id) as db_session: if feedback_type in [LIKE_BLOCK_ACTION_ID, DISLIKE_BLOCK_ACTION_ID]: create_chat_message_feedback( is_positive=feedback_type == LIKE_BLOCK_ACTION_ID, @@ -232,7 +234,7 @@ def handle_slack_feedback( def handle_followup_button( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, ) -> None: action_id = None if actions := req.payload.get("actions"): @@ -252,18 +254,20 @@ def handle_followup_button( tag_ids: list[str] = [] group_ids: list[str] = [] - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(client.tenant_id) as db_session: channel_name, is_dm = get_channel_name_from_id( client=client.web_client, channel_id=channel_id ) - slack_bot_config = get_slack_bot_config_for_channel( - channel_name=channel_name, 
db_session=db_session + slack_channel_config = get_slack_channel_config_for_bot_and_channel( + db_session=db_session, + slack_bot_id=client.slack_bot_id, + channel_name=channel_name, ) - if slack_bot_config: - tag_names = slack_bot_config.channel_config.get("follow_up_tags") + if slack_channel_config: + tag_names = slack_channel_config.channel_config.get("follow_up_tags") remaining = None if tag_names: - tag_ids, remaining = fetch_user_ids_from_emails( + tag_ids, remaining = fetch_slack_user_ids_from_emails( tag_names, client.web_client ) if remaining: @@ -295,7 +299,7 @@ def handle_followup_button( def get_clicker_name( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, ) -> str: clicker_name = req.payload.get("user", {}).get("name", "Someone") clicker_real_name = None @@ -316,7 +320,7 @@ def get_clicker_name( def handle_followup_resolved_button( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, immediate: bool = False, ) -> None: channel_id = req.payload["container"]["channel_id"] diff --git a/backend/danswer/danswerbot/slack/handlers/handle_message.py b/backend/danswer/danswerbot/slack/handlers/handle_message.py index a70a2423671..9d42859e6c9 100644 --- a/backend/danswer/danswerbot/slack/handlers/handle_message.py +++ b/backend/danswer/danswerbot/slack/handlers/handle_message.py @@ -10,19 +10,18 @@ handle_standard_answers, ) from danswer.danswerbot.slack.models import SlackMessageInfo -from danswer.danswerbot.slack.utils import fetch_user_ids_from_emails +from danswer.danswerbot.slack.utils import fetch_slack_user_ids_from_emails from danswer.danswerbot.slack.utils import fetch_user_ids_from_groups from danswer.danswerbot.slack.utils import respond_in_thread from danswer.danswerbot.slack.utils import slack_usage_report from danswer.danswerbot.slack.utils import update_emote_react -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.models import SlackBotConfig -from danswer.db.users import add_non_web_user_if_not_exists +from danswer.db.engine import get_session_with_tenant +from danswer.db.models import SlackChannelConfig +from danswer.db.users import add_slack_user_if_not_exists from danswer.utils.logger import setup_logger from shared_configs.configs import SLACK_CHANNEL_ID from slack_sdk import WebClient from slack_sdk.errors import SlackApiError -from sqlalchemy.orm import Session logger_base = setup_logger() @@ -106,9 +105,10 @@ def remove_scheduled_feedback_reminder( def handle_message( message_info: SlackMessageInfo, - slack_bot_config: SlackBotConfig | None, + slack_channel_config: SlackChannelConfig | None, client: WebClient, feedback_reminder_id: str | None, + tenant_id: str | None, ) -> bool: """Potentially respond to the user message depending on filters and if an answer was generated @@ -134,10 +134,12 @@ def handle_message( action = "slack_tag_message" elif is_bot_dm: action = "slack_dm_message" - slack_usage_report(action=action, sender_id=sender_id, client=client) + slack_usage_report( + action=action, sender_id=sender_id, client=client, tenant_id=tenant_id + ) document_set_names: list[str] | None = None - persona = slack_bot_config.persona if slack_bot_config else None + persona = slack_channel_config.persona if slack_channel_config else None prompt = None if persona: document_set_names = [ @@ -149,8 +151,8 @@ def handle_message( respond_member_group_list = None channel_conf = None - if slack_bot_config and slack_bot_config.channel_config: - channel_conf = 
slack_bot_config.channel_config + if slack_channel_config and slack_channel_config.channel_config: + channel_conf = slack_channel_config.channel_config if not bypass_filters and "answer_filters" in channel_conf: if ( "questionmark_prefilter" in channel_conf["answer_filters"] @@ -181,7 +183,7 @@ def handle_message( send_to: list[str] | None = None missing_users: list[str] | None = None if respond_member_group_list: - send_to, missing_ids = fetch_user_ids_from_emails( + send_to, missing_ids = fetch_slack_user_ids_from_emails( respond_member_group_list, client ) @@ -208,31 +210,32 @@ def handle_message( except SlackApiError as e: logger.error(f"Was not able to react to user message due to: {e}") - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: if message_info.email: - add_non_web_user_if_not_exists(message_info.email, db_session) - - # first check if we need to respond with a standard answer - used_standard_answer = handle_standard_answers( - message_info=message_info, - receiver_ids=send_to, - slack_bot_config=slack_bot_config, - prompt=prompt, - logger=logger, - client=client, - db_session=db_session, - ) - if used_standard_answer: - return False + add_slack_user_if_not_exists(db_session, message_info.email) + + #first check if we need to respond with a standard answer + # used_standard_answer = handle_standard_answers( + # message_info=message_info, + # receiver_ids=send_to, + # slack_channel_config=slack_channel_config, + # prompt=prompt, + # logger=logger, + # client=client, + # db_session=db_session, + # ) + # if used_standard_answer: + # return False # if no standard answer applies, try a regular answer issue_with_regular_answer = handle_regular_answer( message_info=message_info, - slack_bot_config=slack_bot_config, + slack_channel_config=slack_channel_config, receiver_ids=send_to, client=client, channel=channel, logger=logger, feedback_reminder_id=feedback_reminder_id, + tenant_id=tenant_id, ) return issue_with_regular_answer diff --git a/backend/danswer/danswerbot/slack/handlers/handle_regular_answer.py b/backend/danswer/danswerbot/slack/handlers/handle_regular_answer.py index 77eebeac58a..d1d51e310ce 100644 --- a/backend/danswer/danswerbot/slack/handlers/handle_regular_answer.py +++ b/backend/danswer/danswerbot/slack/handlers/handle_regular_answer.py @@ -16,20 +16,20 @@ from danswer.configs.danswerbot_configs import DANSWER_FOLLOWUP_EMOJI from danswer.configs.danswerbot_configs import DANSWER_REACT_EMOJI from danswer.configs.danswerbot_configs import ENABLE_DANSWERBOT_REFLEXION -from danswer.danswerbot.slack.blocks import build_documents_blocks -from danswer.danswerbot.slack.blocks import build_follow_up_block -from danswer.danswerbot.slack.blocks import build_qa_response_blocks -from danswer.danswerbot.slack.blocks import build_sources_blocks -from danswer.danswerbot.slack.blocks import get_restate_blocks +from danswer.context.search.enums import OptionalSearchSetting +from danswer.context.search.models import BaseFilters +from danswer.context.search.models import RerankingDetails +from danswer.context.search.models import RetrievalDetails +from danswer.danswerbot.slack.blocks import build_slack_response_blocks from danswer.danswerbot.slack.handlers.utils import send_team_member_message from danswer.danswerbot.slack.models import SlackMessageInfo from danswer.danswerbot.slack.utils import respond_in_thread from danswer.danswerbot.slack.utils import SlackRateLimiter from danswer.danswerbot.slack.utils import 
update_emote_react -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.db.models import Persona -from danswer.db.models import SlackBotConfig from danswer.db.models import SlackBotResponseType +from danswer.db.models import SlackChannelConfig from danswer.db.persona import fetch_persona_by_id from danswer.db.search_settings import get_current_search_settings from danswer.db.users import get_user_by_email @@ -42,21 +42,10 @@ from danswer.one_shot_answer.answer_question import get_search_answer from danswer.one_shot_answer.models import DirectQARequest from danswer.one_shot_answer.models import OneShotQAResponse -from danswer.search.enums import OptionalSearchSetting -from danswer.search.models import BaseFilters -from danswer.search.models import RerankingDetails -from danswer.search.models import RetrievalDetails from danswer.utils.logger import DanswerLoggingAdapter -from danswer.utils.logger import setup_logger -from fastapi import HTTPException from retry import retry from slack_sdk import WebClient -from slack_sdk.models.blocks import DividerBlock from slack_sdk.models.blocks import SectionBlock -from sqlalchemy.orm import Session - -logger = setup_logger() - srl = SlackRateLimiter() @@ -84,12 +73,13 @@ def wrapper(*args: Any, **kwargs: Any) -> RT: def handle_regular_answer( message_info: SlackMessageInfo, - slack_bot_config: SlackBotConfig | None, + slack_channel_config: SlackChannelConfig | None, receiver_ids: list[str] | None, client: WebClient, channel: str, logger: DanswerLoggingAdapter, feedback_reminder_id: str | None, + tenant_id: str | None, num_retries: int = DANSWER_BOT_NUM_RETRIES, answer_generation_timeout: int = DANSWER_BOT_ANSWER_GENERATION_TIMEOUT, thread_context_percent: float = DANSWER_BOT_TARGET_CHUNK_PERCENTAGE, @@ -98,19 +88,18 @@ def handle_regular_answer( disable_cot: bool = DANSWER_BOT_DISABLE_COT, reflexion: bool = ENABLE_DANSWERBOT_REFLEXION, ) -> bool: - channel_conf = slack_bot_config.channel_config if slack_bot_config else None + channel_conf = slack_channel_config.channel_config if slack_channel_config else None messages = message_info.thread_messages message_ts_to_respond_to = message_info.msg_to_respond is_bot_msg = message_info.is_bot_msg - + user = None if message_info.email: - engine = get_sqlalchemy_engine() - with Session(engine) as db_session: + with get_session_with_tenant(tenant_id) as db_session: user = get_user_by_email(message_info.email, db_session) document_set_names: list[str] | None = None - persona = slack_bot_config.persona if slack_bot_config else None + persona = slack_channel_config.persona if slack_channel_config else None prompt = None if persona: document_set_names = [ @@ -122,9 +111,9 @@ def handle_regular_answer( bypass_acl = False if ( - slack_bot_config - and slack_bot_config.persona - and slack_bot_config.persona.document_sets + slack_channel_config + and slack_channel_config.persona + and slack_channel_config.persona.document_sets ): # For Slack channels, use the full document set, admin will be warned when configuring it # with non-public document sets @@ -133,8 +122,8 @@ def handle_regular_answer( # figure out if we want to use citations or quotes use_citations = ( not DANSWER_BOT_USE_QUOTES - if slack_bot_config is None - else slack_bot_config.response_type == SlackBotResponseType.CITATIONS + if slack_channel_config is None + else slack_channel_config.response_type == SlackBotResponseType.CITATIONS ) if not message_ts_to_respond_to and not is_bot_msg: @@ 
-153,15 +142,11 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non max_document_tokens: int | None = None max_history_tokens: int | None = None - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: if len(new_message_request.messages) > 1: if new_message_request.persona_config: - raise HTTPException( - status_code=403, - detail="Slack bot does not support persona config", - ) - - elif new_message_request.persona_id: + raise RuntimeError("Slack bot does not support persona config") + elif new_message_request.persona_id is not None: persona = cast( Persona, fetch_persona_by_id( @@ -171,6 +156,10 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non get_editable=False, ), ) + else: + raise RuntimeError( + "No persona id provided, this should never happen." + ) llm, _ = get_llms_for_persona(persona) @@ -213,6 +202,7 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non use_citations=use_citations, danswerbot_flow=True, ) + if not answer.error_msg: return answer else: @@ -235,8 +225,8 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non # persona.llm_filter_extraction if persona is not None else True # ) auto_detect_filters = ( - slack_bot_config.enable_auto_filters - if slack_bot_config is not None + slack_channel_config.enable_auto_filters + if slack_channel_config is not None else False ) retrieval_details = RetrievalDetails( @@ -247,7 +237,7 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non ) # Always apply reranking settings if it exists, this is the non-streaming flow - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: saved_search_settings = get_current_search_settings(db_session) # This includes throwing out answer via reflexion @@ -416,61 +406,16 @@ def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | Non ) return True - # If called with the DanswerBot slash command, the question is lost so we have to reshow it - restate_question_block = get_restate_blocks(messages[-1].message, is_bot_msg) - - answer_blocks = build_qa_response_blocks( - message_id=answer.chat_message_id, - answer=answer.answer, - quotes=answer.quotes.quotes if answer.quotes else None, - source_filters=retrieval_info.applied_source_filters, - time_cutoff=retrieval_info.applied_time_cutoff, - favor_recent=retrieval_info.recency_bias_multiplier > 1, - # currently Personas don't support quotes - # if citations are enabled, also don't use quotes - skip_quotes=persona is not None or use_citations, - process_message_for_citations=use_citations, + all_blocks = build_slack_response_blocks( + tenant_id=tenant_id, + message_info=message_info, + answer=answer, + persona=persona, + channel_conf=channel_conf, + use_citations=use_citations, feedback_reminder_id=feedback_reminder_id, ) - # Get the chunks fed to the LLM only, then fill with other docs - llm_doc_inds = answer.llm_selected_doc_indices or [] - llm_docs = [top_docs[i] for i in llm_doc_inds] - remaining_docs = [ - doc for idx, doc in enumerate(top_docs) if idx not in llm_doc_inds - ] - priority_ordered_docs = llm_docs + remaining_docs - - document_blocks = [] - citations_block = [] - # if citations are enabled, only show cited documents - if use_citations: - citations = answer.citations or [] - cited_docs = [] - for citation in citations: - matching_doc = next( - (d for d in 
top_docs if d.document_id == citation.document_id), - None, - ) - if matching_doc: - cited_docs.append((citation.citation_num, matching_doc)) - - cited_docs.sort() - citations_block = build_sources_blocks(cited_documents=cited_docs) - elif priority_ordered_docs: - document_blocks = build_documents_blocks( - documents=priority_ordered_docs, - message_id=answer.chat_message_id, - ) - document_blocks = [DividerBlock()] + document_blocks - - all_blocks = ( - restate_question_block + answer_blocks + citations_block + document_blocks - ) - - if channel_conf and channel_conf.get("follow_up_tags") is not None: - all_blocks.append(build_follow_up_block(message_id=answer.chat_message_id)) - try: respond_in_thread( client=client, diff --git a/backend/danswer/danswerbot/slack/handlers/handle_standard_answers.py b/backend/danswer/danswerbot/slack/handlers/handle_standard_answers.py index e008e26e1b9..29652296ca7 100644 --- a/backend/danswer/danswerbot/slack/handlers/handle_standard_answers.py +++ b/backend/danswer/danswerbot/slack/handlers/handle_standard_answers.py @@ -1,10 +1,3 @@ -from slack_sdk import WebClient -from slack_sdk.models.blocks import ActionsBlock -from slack_sdk.models.blocks import Block -from slack_sdk.models.blocks import ButtonElement -from slack_sdk.models.blocks import SectionBlock -from sqlalchemy.orm import Session - from danswer.configs.constants import MessageType from danswer.configs.danswerbot_configs import DANSWER_REACT_EMOJI from danswer.danswerbot.slack.blocks import get_restate_blocks @@ -19,13 +12,18 @@ from danswer.db.chat import get_chat_sessions_by_slack_thread_id from danswer.db.chat import get_or_create_root_message from danswer.db.models import Prompt -from danswer.db.models import SlackBotConfig -from danswer.db.models import StandardAnswer as StandardAnswerModel -from danswer.utils.logger import DanswerLoggingAdapter -from danswer.utils.logger import setup_logger +from danswer.db.models import SlackChannelConfig from danswer.db.standard_answer import fetch_standard_answer_categories_by_names from danswer.db.standard_answer import find_matching_standard_answers +from danswer.utils.logger import DanswerLoggingAdapter +from danswer.utils.logger import setup_logger from ee.danswer.server.manage.models import StandardAnswer as PydanticStandardAnswer +from slack_sdk import WebClient +from slack_sdk.models.blocks import ActionsBlock +from slack_sdk.models.blocks import Block +from slack_sdk.models.blocks import ButtonElement +from slack_sdk.models.blocks import SectionBlock +from sqlalchemy.orm import Session logger = setup_logger() @@ -80,7 +78,7 @@ def oneoff_standard_answers( def handle_standard_answers( message_info: SlackMessageInfo, receiver_ids: list[str] | None, - slack_bot_config: SlackBotConfig | None, + slack_channel_config: SlackChannelConfig | None, prompt: Prompt | None, logger: DanswerLoggingAdapter, client: WebClient, @@ -102,39 +100,14 @@ def handle_standard_answers( configured_standard_answer_categories = ( slack_bot_config.standard_answer_categories if slack_bot_config else [] ) - configured_standard_answers = set( - [ - standard_answer - for standard_answer_category in configured_standard_answer_categories - for standard_answer in standard_answer_category.standard_answers - ] - ) - query_msg = message_info.thread_messages[-1] - - if slack_thread_id is None: - used_standard_answer_ids = set([]) - else: - chat_sessions = get_chat_sessions_by_slack_thread_id( - slack_thread_id=slack_thread_id, - user_id=None, - db_session=db_session, - ) - 
chat_messages = get_chat_messages_by_sessions( - chat_session_ids=[chat_session.id for chat_session in chat_sessions], - user_id=None, - db_session=db_session, - skip_permission_check=True, - ) - used_standard_answer_ids = set( - [ - standard_answer.id - for chat_message in chat_messages - for standard_answer in chat_message.standard_answers - ] - ) - - usable_standard_answers = configured_standard_answers.difference( - used_standard_answer_ids + return versioned_handle_standard_answers( + message_info=message_info, + receiver_ids=receiver_ids, + slack_channel_config=slack_channel_config, + prompt=prompt, + logger=logger, + client=client, + db_session=db_session, ) matching_standard_answers: list[tuple[StandardAnswerModel, str]] = [] @@ -145,16 +118,18 @@ def handle_standard_answers( db_session=db_session, ) - if matching_standard_answers: - chat_session = create_chat_session( - db_session=db_session, - description="", - user_id=None, - persona_id=slack_bot_config.persona.id if slack_bot_config.persona else 0, - danswerbot_flow=True, - slack_thread_id=slack_thread_id, - one_shot=True, - ) +def _handle_standard_answers( + message_info: SlackMessageInfo, + receiver_ids: list[str] | None, + slack_channel_config: SlackChannelConfig | None, + prompt: Prompt | None, + logger: DanswerLoggingAdapter, + client: WebClient, + db_session: Session, +) -> bool: + """ + Standard Answers are a paid Enterprise Edition feature. This is the fallback + function handling the case where EE features are not enabled. root_message = get_or_create_root_message( chat_session_id=chat_session.id, db_session=db_session @@ -236,3 +211,4 @@ def handle_standard_answers( return False else: return False +""" diff --git a/backend/danswer/danswerbot/slack/listener.py b/backend/danswer/danswerbot/slack/listener.py index c430f1b31b7..5f6cabb3406 100644 --- a/backend/danswer/danswerbot/slack/listener.py +++ b/backend/danswer/danswerbot/slack/listener.py @@ -1,20 +1,39 @@ +import asyncio +import os +import signal +import sys +import threading import time +from collections.abc import Callable from threading import Event +from types import FrameType from typing import Any from typing import cast +from typing import Dict +from typing import Set +from prometheus_client import Gauge +from prometheus_client import start_http_server from slack_sdk import WebClient -from slack_sdk.socket_mode import SocketModeClient from slack_sdk.socket_mode.request import SocketModeRequest from slack_sdk.socket_mode.response import SocketModeResponse from sqlalchemy.orm import Session +from danswer.configs.app_configs import POD_NAME +from danswer.configs.app_configs import POD_NAMESPACE +from danswer.configs.constants import DanswerRedisLocks from danswer.configs.constants import MessageType from danswer.configs.danswerbot_configs import DANSWER_BOT_REPHRASE_MESSAGE from danswer.configs.danswerbot_configs import DANSWER_BOT_RESPOND_EVERY_CHANNEL from danswer.configs.danswerbot_configs import NOTIFY_SLACKBOT_NO_ANSWER from danswer.connectors.slack.utils import expert_info_from_slack_id -from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel +from danswer.context.search.retrieval.search_runner import download_nltk_data +from danswer.danswerbot.slack.config import get_slack_channel_config_for_bot_and_channel +from danswer.danswerbot.slack.config import MAX_TENANTS_PER_POD +from danswer.danswerbot.slack.config import TENANT_ACQUISITION_INTERVAL +from danswer.danswerbot.slack.config import TENANT_HEARTBEAT_EXPIRATION +from 
danswer.danswerbot.slack.config import TENANT_HEARTBEAT_INTERVAL +from danswer.danswerbot.slack.config import TENANT_LOCK_EXPIRATION from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID from danswer.danswerbot.slack.constants import FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID from danswer.danswerbot.slack.constants import FOLLOWUP_BUTTON_ACTION_ID @@ -38,31 +57,45 @@ ) from danswer.danswerbot.slack.handlers.handle_message import schedule_feedback_reminder from danswer.danswerbot.slack.models import SlackMessageInfo -from danswer.danswerbot.slack.tokens import fetch_tokens from danswer.danswerbot.slack.utils import check_message_limit from danswer.danswerbot.slack.utils import decompose_action_id from danswer.danswerbot.slack.utils import get_channel_name_from_id -from danswer.danswerbot.slack.utils import get_danswer_bot_app_id +from danswer.danswerbot.slack.utils import get_danswer_bot_slack_bot_id from danswer.danswerbot.slack.utils import read_slack_thread from danswer.danswerbot.slack.utils import remove_danswer_bot_tag from danswer.danswerbot.slack.utils import rephrase_slack_message from danswer.danswerbot.slack.utils import respond_in_thread -from danswer.db.engine import get_sqlalchemy_engine +from danswer.danswerbot.slack.utils import TenantSocketModeClient +from danswer.db.engine import get_all_tenant_ids +from danswer.db.engine import get_session_with_tenant +from danswer.db.models import SlackBot from danswer.db.search_settings import get_current_search_settings -from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.db.slack_bot import fetch_slack_bots +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.natural_language_processing.search_nlp_models import EmbeddingModel from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder from danswer.one_shot_answer.models import ThreadMessage -from danswer.search.retrieval.search_runner import download_nltk_data +from danswer.redis.redis_pool import get_redis_client from danswer.server.manage.models import SlackBotTokens from danswer.utils.logger import setup_logger from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable +from shared_configs.configs import DISALLOWED_SLACK_BOT_TENANT_LIST from shared_configs.configs import MODEL_SERVER_HOST from shared_configs.configs import MODEL_SERVER_PORT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA from shared_configs.configs import SLACK_CHANNEL_ID +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR + logger = setup_logger() +# Prometheus metric for HPA +active_tenants_gauge = Gauge( + "active_tenants", + "Number of active tenants handled by this pod", + ["namespace", "pod"], +) + # In rare cases, some users have been experiencing a massive amount of trivial messages coming through # to the Slack Bot with trivial messages. Adding this to avoid exploding LLM costs while we track down # the cause. 
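Editor's note: a minimal, illustrative sketch (not part of the diff) of the Redis lock + heartbeat + gauge pattern that the scaling constants added in slack/config.py and the active_tenants_gauge above configure. The standalone redis.Redis() client, the pod id, and the key strings here are assumptions for illustration; the listener itself uses get_redis_client and the DanswerRedisLocks names that appear later in this file.

import time

import redis
from prometheus_client import Gauge
from prometheus_client import start_http_server

TENANT_LOCK_EXPIRATION = 1800     # seconds a pod may own a tenant before the lock expires
TENANT_HEARTBEAT_EXPIRATION = 30  # seconds before a missing heartbeat frees the tenant

# Gauge exported for the HPA, analogous to active_tenants_gauge in the listener
active_tenants = Gauge("active_tenants", "Tenants handled by this pod", ["pod"])


def try_acquire_tenant(r: redis.Redis, tenant_id: str, pod_id: str) -> bool:
    # SET NX EX: only the first pod to write the key owns the tenant until expiry
    return bool(r.set(f"slack_bot_lock:{tenant_id}", pod_id, nx=True, ex=TENANT_LOCK_EXPIRATION))


def send_heartbeat(r: redis.Redis, tenant_id: str, pod_id: str) -> None:
    # Short-lived heartbeat key; if the owning pod dies, the key expires and another pod can take over
    r.set(f"slack_bot_heartbeat:{tenant_id}:{pod_id}", int(time.time()), ex=TENANT_HEARTBEAT_EXPIRATION)


if __name__ == "__main__":
    start_http_server(8000)  # expose /metrics so the HPA can scale on active_tenants
    r = redis.Redis()
    pod_id = "pod-0"
    owned = [t for t in ("tenant_a", "tenant_b") if try_acquire_tenant(r, t, pod_id)]
    for tenant in owned:
        send_heartbeat(r, tenant, pod_id)
    active_tenants.labels(pod=pod_id).set(len(owned))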
@@ -76,11 +109,258 @@ ":wave:", } -# this is always (currently) the user id of Slack's official slackbot +# This is always (currently) the user id of Slack's official slackbot _OFFICIAL_SLACKBOT_USER_ID = "USLACKBOT" -def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool: +class SlackbotHandler: + def __init__(self) -> None: + logger.info("Initializing SlackbotHandler") + self.tenant_ids: Set[str | None] = set() + # The keys for these dictionaries are tuples of (tenant_id, slack_bot_id) + self.socket_clients: Dict[tuple[str | None, int], TenantSocketModeClient] = {} + self.slack_bot_tokens: Dict[tuple[str | None, int], SlackBotTokens] = {} + + self.running = True + self.pod_id = self.get_pod_id() + self._shutdown_event = Event() + logger.info(f"Pod ID: {self.pod_id}") + + # Set up signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, self.shutdown) + signal.signal(signal.SIGINT, self.shutdown) + logger.info("Signal handlers registered") + + # Start the Prometheus metrics server + logger.info("Starting Prometheus metrics server") + start_http_server(8000) + logger.info("Prometheus metrics server started") + + # Start background threads + logger.info("Starting background threads") + self.acquire_thread = threading.Thread( + target=self.acquire_tenants_loop, daemon=True + ) + self.heartbeat_thread = threading.Thread( + target=self.heartbeat_loop, daemon=True + ) + + self.acquire_thread.start() + self.heartbeat_thread.start() + logger.info("Background threads started") + + def get_pod_id(self) -> str: + pod_id = os.environ.get("HOSTNAME", "unknown_pod") + logger.info(f"Retrieved pod ID: {pod_id}") + return pod_id + + def acquire_tenants_loop(self) -> None: + while not self._shutdown_event.is_set(): + try: + self.acquire_tenants() + active_tenants_gauge.labels(namespace=POD_NAMESPACE, pod=POD_NAME).set( + len(self.tenant_ids) + ) + logger.debug(f"Current active tenants: {len(self.tenant_ids)}") + except Exception as e: + logger.exception(f"Error in Slack acquisition: {e}") + self._shutdown_event.wait(timeout=TENANT_ACQUISITION_INTERVAL) + + def heartbeat_loop(self) -> None: + while not self._shutdown_event.is_set(): + try: + self.send_heartbeats() + logger.debug(f"Sent heartbeats for {len(self.tenant_ids)} tenants") + except Exception as e: + logger.exception(f"Error in heartbeat loop: {e}") + self._shutdown_event.wait(timeout=TENANT_HEARTBEAT_INTERVAL) + + def _manage_clients_per_tenant( + self, db_session: Session, tenant_id: str | None, bot: SlackBot + ) -> None: + slack_bot_tokens = SlackBotTokens( + bot_token=bot.bot_token, + app_token=bot.app_token, + ) + tenant_bot_pair = (tenant_id, bot.id) + + # If the tokens are not set, we need to close the socket client and delete the tokens + # for the tenant and app + if not slack_bot_tokens: + logger.debug( + f"No Slack bot token found for tenant {tenant_id}, bot {bot.id}" + ) + if tenant_bot_pair in self.socket_clients: + asyncio.run(self.socket_clients[tenant_bot_pair].close()) + del self.socket_clients[tenant_bot_pair] + del self.slack_bot_tokens[tenant_bot_pair] + return + + tokens_exist = tenant_bot_pair in self.slack_bot_tokens + tokens_changed = ( + tokens_exist and slack_bot_tokens != self.slack_bot_tokens[tenant_bot_pair] + ) + if not tokens_exist or tokens_changed: + if tokens_exist: + logger.info( + f"Slack Bot tokens have changed for tenant {tenant_id}, bot {bot.id} - reconnecting" + ) + else: + search_settings = get_current_search_settings(db_session) + embedding_model = 
EmbeddingModel.from_db_model( + search_settings=search_settings, + server_host=MODEL_SERVER_HOST, + server_port=MODEL_SERVER_PORT, + ) + warm_up_bi_encoder(embedding_model=embedding_model) + + self.slack_bot_tokens[tenant_bot_pair] = slack_bot_tokens + + if tenant_bot_pair in self.socket_clients: + asyncio.run(self.socket_clients[tenant_bot_pair].close()) + + self.start_socket_client(bot.id, tenant_id, slack_bot_tokens) + + def acquire_tenants(self) -> None: + tenant_ids = get_all_tenant_ids() + + for tenant_id in tenant_ids: + if ( + DISALLOWED_SLACK_BOT_TENANT_LIST is not None + and tenant_id in DISALLOWED_SLACK_BOT_TENANT_LIST + ): + logger.debug(f"Tenant {tenant_id} is in the disallowed list, skipping") + continue + + if tenant_id in self.tenant_ids: + logger.debug(f"Tenant {tenant_id} already in self.tenant_ids") + continue + + if len(self.tenant_ids) >= MAX_TENANTS_PER_POD: + logger.info( + f"Max tenants per pod reached ({MAX_TENANTS_PER_POD}) Not acquiring any more tenants" + ) + break + + redis_client = get_redis_client(tenant_id=tenant_id) + pod_id = self.pod_id + acquired = redis_client.set( + DanswerRedisLocks.SLACK_BOT_LOCK, + pod_id, + nx=True, + ex=TENANT_LOCK_EXPIRATION, + ) + if not acquired: + logger.debug(f"Another pod holds the lock for tenant {tenant_id}") + continue + + logger.debug(f"Acquired lock for tenant {tenant_id}") + + self.tenant_ids.add(tenant_id) + + for tenant_id in self.tenant_ids: + token = CURRENT_TENANT_ID_CONTEXTVAR.set( + tenant_id or POSTGRES_DEFAULT_SCHEMA + ) + try: + with get_session_with_tenant(tenant_id) as db_session: + try: + bots = fetch_slack_bots(db_session=db_session) + for bot in bots: + self._manage_clients_per_tenant( + db_session=db_session, + tenant_id=tenant_id, + bot=bot, + ) + + except KvKeyNotFoundError: + logger.debug(f"Missing Slack Bot tokens for tenant {tenant_id}") + if (tenant_id, bot.id) in self.socket_clients: + asyncio.run(self.socket_clients[tenant_id, bot.id].close()) + del self.socket_clients[tenant_id, bot.id] + del self.slack_bot_tokens[tenant_id, bot.id] + except Exception as e: + logger.exception(f"Error handling tenant {tenant_id}: {e}") + finally: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + + def send_heartbeats(self) -> None: + current_time = int(time.time()) + logger.debug(f"Sending heartbeats for {len(self.tenant_ids)} tenants") + for tenant_id in self.tenant_ids: + redis_client = get_redis_client(tenant_id=tenant_id) + heartbeat_key = ( + f"{DanswerRedisLocks.SLACK_BOT_HEARTBEAT_PREFIX}:{self.pod_id}" + ) + redis_client.set( + heartbeat_key, current_time, ex=TENANT_HEARTBEAT_EXPIRATION + ) + + def start_socket_client( + self, slack_bot_id: int, tenant_id: str | None, slack_bot_tokens: SlackBotTokens + ) -> None: + logger.info( + f"Starting socket client for tenant: {tenant_id}, app: {slack_bot_id}" + ) + socket_client: TenantSocketModeClient = _get_socket_client( + slack_bot_tokens, tenant_id, slack_bot_id + ) + + # Append the event handler + process_slack_event = create_process_slack_event() + socket_client.socket_mode_request_listeners.append(process_slack_event) # type: ignore + + # Establish a WebSocket connection to the Socket Mode servers + logger.info( + f"Connecting socket client for tenant: {tenant_id}, app: {slack_bot_id}" + ) + socket_client.connect() + self.socket_clients[tenant_id, slack_bot_id] = socket_client + self.tenant_ids.add(tenant_id) + logger.info( + f"Started SocketModeClient for tenant: {tenant_id}, app: {slack_bot_id}" + ) + + def stop_socket_clients(self) -> None: + 
logger.info(f"Stopping {len(self.socket_clients)} socket clients") + for (tenant_id, slack_bot_id), client in self.socket_clients.items(): + asyncio.run(client.close()) + logger.info( + f"Stopped SocketModeClient for tenant: {tenant_id}, app: {slack_bot_id}" + ) + + def shutdown(self, signum: int | None, frame: FrameType | None) -> None: + if not self.running: + return + + logger.info("Shutting down gracefully") + self.running = False + self._shutdown_event.set() + + # Stop all socket clients + logger.info(f"Stopping {len(self.socket_clients)} socket clients") + self.stop_socket_clients() + + # Release locks for all tenants + logger.info(f"Releasing locks for {len(self.tenant_ids)} tenants") + for tenant_id in self.tenant_ids: + try: + redis_client = get_redis_client(tenant_id=tenant_id) + redis_client.delete(DanswerRedisLocks.SLACK_BOT_LOCK) + logger.info(f"Released lock for tenant {tenant_id}") + except Exception as e: + logger.error(f"Error releasing lock for tenant {tenant_id}: {e}") + + # Wait for background threads to finish (with timeout) + logger.info("Waiting for background threads to finish...") + self.acquire_thread.join(timeout=5) + self.heartbeat_thread.join(timeout=5) + + logger.info("Shutdown complete") + sys.exit(0) + + +def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -> bool: """True to keep going, False to ignore this Slack request""" if req.type == "events_api": # Verify channel is valid @@ -131,9 +411,8 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool ) return False + bot_tag_id = get_danswer_bot_slack_bot_id(client.web_client) if event_type == "message": - bot_tag_id = get_danswer_bot_app_id(client.web_client) - is_dm = event.get("channel_type") == "im" is_tagged = bot_tag_id and bot_tag_id in msg is_danswer_bot_msg = bot_tag_id and bot_tag_id in event.get("user", "") @@ -154,13 +433,16 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool client=client.web_client, channel_id=channel ) - engine = get_sqlalchemy_engine() - with Session(engine) as db_session: - slack_bot_config = get_slack_bot_config_for_channel( - channel_name=channel_name, db_session=db_session + with get_session_with_tenant(client.tenant_id) as db_session: + slack_channel_config = get_slack_channel_config_for_bot_and_channel( + db_session=db_session, + slack_bot_id=client.slack_bot_id, + channel_name=channel_name, ) - if not slack_bot_config or not slack_bot_config.channel_config.get( - "respond_to_bots" + # If DanswerBot is not specifically tagged and the channel is not set to respond to bots, ignore the message + if (not bot_tag_id or bot_tag_id not in msg) and ( + not slack_channel_config + or not slack_channel_config.channel_config.get("respond_to_bots") ): channel_specific_logger.info("Ignoring message from bot") return False @@ -171,7 +453,7 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool message_subtype = event.get("subtype") if message_subtype not in [None, "file_share"]: channel_specific_logger.info( - f"Ignoring message with subtype '{message_subtype}' since is is a special message type" + f"Ignoring message with subtype '{message_subtype}' since it is a special message type" ) return False @@ -220,7 +502,7 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool return True -def process_feedback(req: SocketModeRequest, client: SocketModeClient) -> None: +def process_feedback(req: SocketModeRequest, client: TenantSocketModeClient) -> 
None: if actions := req.payload.get("actions"): action = cast(dict[str, Any], actions[0]) feedback_type = cast(str, action.get("action_id")) @@ -242,14 +524,15 @@ def process_feedback(req: SocketModeRequest, client: SocketModeClient) -> None: user_id_to_post_confirmation=user_id, channel_id_to_post_confirmation=channel_id, thread_ts_to_post_confirmation=thread_ts, + tenant_id=client.tenant_id, ) query_event_id, _, _ = decompose_action_id(feedback_id) - logger.notice(f"Successfully handled QA feedback for event: {query_event_id}") + logger.info(f"Successfully handled QA feedback for event: {query_event_id}") def build_request_details( - req: SocketModeRequest, client: SocketModeClient + req: SocketModeRequest, client: TenantSocketModeClient ) -> SlackMessageInfo: if req.type == "events_api": event = cast(dict[str, Any], req.payload["event"]) @@ -267,14 +550,14 @@ def build_request_details( msg = remove_danswer_bot_tag(msg, client=client.web_client) if DANSWER_BOT_REPHRASE_MESSAGE: - logger.notice(f"Rephrasing Slack message. Original message: {msg}") + logger.info(f"Rephrasing Slack message. Original message: {msg}") try: msg = rephrase_slack_message(msg) - logger.notice(f"Rephrased message: {msg}") + logger.info(f"Rephrased message: {msg}") except Exception as e: logger.error(f"Error while trying to rephrase the Slack message: {e}") else: - logger.notice(f"Received Slack message: {msg}") + logger.info(f"Received Slack message: {msg}") if tagged: logger.debug("User tagged DanswerBot") @@ -328,7 +611,7 @@ def build_request_details( def apologize_for_fail( details: SlackMessageInfo, - client: SocketModeClient, + client: TenantSocketModeClient, ) -> None: respond_in_thread( client=client.web_client, @@ -340,11 +623,13 @@ def apologize_for_fail( def process_message( req: SocketModeRequest, - client: SocketModeClient, + client: TenantSocketModeClient, respond_every_channel: bool = DANSWER_BOT_RESPOND_EVERY_CHANNEL, notify_no_answer: bool = NOTIFY_SLACKBOT_NO_ANSWER, ) -> None: - logger.debug(f"Received Slack request of type: '{req.type}'") + logger.debug( + f"Received Slack request of type: '{req.type}' for tenant, {client.tenant_id}" + ) # Throw out requests that can't or shouldn't be handled if not prefilter_requests(req, client): @@ -356,59 +641,70 @@ def process_message( client=client.web_client, channel_id=channel ) - engine = get_sqlalchemy_engine() - with Session(engine) as db_session: - slack_bot_config = get_slack_bot_config_for_channel( - channel_name=channel_name, db_session=db_session - ) - - # Be careful about this default, don't want to accidentally spam every channel - # Users should be able to DM slack bot in their private channels though - if ( - slack_bot_config is None - and not respond_every_channel - # Can't have configs for DMs so don't toss them out - and not is_dm - # If /DanswerBot (is_bot_msg) or @DanswerBot (bypass_filters) - # always respond with the default configs - and not (details.is_bot_msg or details.bypass_filters) - ): - return - - follow_up = bool( - slack_bot_config - and slack_bot_config.channel_config - and slack_bot_config.channel_config.get("follow_up_tags") is not None - ) - feedback_reminder_id = schedule_feedback_reminder( - details=details, client=client.web_client, include_followup=follow_up - ) + # Set the current tenant ID at the beginning for all DB calls within this thread + if client.tenant_id: + logger.info(f"Setting tenant ID to {client.tenant_id}") + token = CURRENT_TENANT_ID_CONTEXTVAR.set(client.tenant_id) + try: + with 
get_session_with_tenant(client.tenant_id) as db_session: + slack_channel_config = get_slack_channel_config_for_bot_and_channel( + db_session=db_session, + slack_bot_id=client.slack_bot_id, + channel_name=channel_name, + ) - failed = handle_message( - message_info=details, - slack_bot_config=slack_bot_config, - client=client.web_client, - feedback_reminder_id=feedback_reminder_id, - ) + # Be careful about this default, don't want to accidentally spam every channel + # Users should be able to DM slack bot in their private channels though + if ( + slack_channel_config is None + and not respond_every_channel + # Can't have configs for DMs so don't toss them out + and not is_dm + # If /DanswerBot (is_bot_msg) or @DanswerBot (bypass_filters) + # always respond with the default configs + and not (details.is_bot_msg or details.bypass_filters) + ): + return - if failed: - if feedback_reminder_id: - remove_scheduled_feedback_reminder( - client=client.web_client, - channel=details.sender, - msg_id=feedback_reminder_id, - ) - # Skipping answering due to pre-filtering is not considered a failure - if notify_no_answer: - apologize_for_fail(details, client) + follow_up = bool( + slack_channel_config + and slack_channel_config.channel_config + and slack_channel_config.channel_config.get("follow_up_tags") + is not None + ) + feedback_reminder_id = schedule_feedback_reminder( + details=details, client=client.web_client, include_followup=follow_up + ) + failed = handle_message( + message_info=details, + slack_channel_config=slack_channel_config, + client=client.web_client, + feedback_reminder_id=feedback_reminder_id, + tenant_id=client.tenant_id, + ) -def acknowledge_message(req: SocketModeRequest, client: SocketModeClient) -> None: + if failed: + if feedback_reminder_id: + remove_scheduled_feedback_reminder( + client=client.web_client, + channel=details.sender, + msg_id=feedback_reminder_id, + ) + # Skipping answering due to pre-filtering is not considered a failure + if notify_no_answer: + apologize_for_fail(details, client) + finally: + if client.tenant_id: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + + +def acknowledge_message(req: SocketModeRequest, client: TenantSocketModeClient) -> None: response = SocketModeResponse(envelope_id=req.envelope_id) client.send_socket_mode_response(response) -def action_routing(req: SocketModeRequest, client: SocketModeClient) -> None: +def action_routing(req: SocketModeRequest, client: TenantSocketModeClient) -> None: if actions := req.payload.get("actions"): action = cast(dict[str, Any], actions[0]) @@ -428,107 +724,65 @@ def action_routing(req: SocketModeRequest, client: SocketModeClient) -> None: return handle_generate_answer_button(req, client) -def view_routing(req: SocketModeRequest, client: SocketModeClient) -> None: +def view_routing(req: SocketModeRequest, client: TenantSocketModeClient) -> None: if view := req.payload.get("view"): if view["callback_id"] == VIEW_DOC_FEEDBACK_ID: return process_feedback(req, client) -def process_slack_event(client: SocketModeClient, req: SocketModeRequest) -> None: - # Always respond right away, if Slack doesn't receive these frequently enough - # it will assume the Bot is DEAD!!! 
:( - acknowledge_message(req, client) - - try: - if req.type == "interactive": - if req.payload.get("type") == "block_actions": - return action_routing(req, client) - elif req.payload.get("type") == "view_submission": - return view_routing(req, client) - elif req.type == "events_api" or req.type == "slash_commands": - return process_message(req, client) - except Exception: - logger.exception("Failed to process slack event") +def create_process_slack_event() -> ( + Callable[[TenantSocketModeClient, SocketModeRequest], None] +): + def process_slack_event( + client: TenantSocketModeClient, req: SocketModeRequest + ) -> None: + # Always respond right away, if Slack doesn't receive these frequently enough + # it will assume the Bot is DEAD!!! :( + acknowledge_message(req, client) - -def _get_socket_client(slack_bot_tokens: SlackBotTokens) -> SocketModeClient: + try: + if req.type == "interactive": + if req.payload.get("type") == "block_actions": + return action_routing(req, client) + elif req.payload.get("type") == "view_submission": + return view_routing(req, client) + elif req.type == "events_api" or req.type == "slash_commands": + return process_message(req, client) + except Exception: + logger.exception("Failed to process slack event") + + return process_slack_event + + +def _get_socket_client( + slack_bot_tokens: SlackBotTokens, tenant_id: str | None, slack_bot_id: int +) -> TenantSocketModeClient: # For more info on how to set this up, checkout the docs: # https://docs.danswer.dev/slack_bot_setup - return SocketModeClient( + return TenantSocketModeClient( # This app-level token will be used only for establishing a connection app_token=slack_bot_tokens.app_token, web_client=WebClient(token=slack_bot_tokens.bot_token), + tenant_id=tenant_id, + slack_bot_id=slack_bot_id, ) -def _initialize_socket_client(socket_client: SocketModeClient) -> None: - socket_client.socket_mode_request_listeners.append(process_slack_event) # type: ignore - - # Establish a WebSocket connection to the Socket Mode servers - logger.notice("Listening for messages from Slack...") - socket_client.connect() - - -# Follow the guide (https://docs.danswer.dev/slack_bot_setup) to set up -# the slack bot in your workspace, and then add the bot to any channels you want to -# try and answer questions for. Running this file will setup Danswer to listen to all -# messages in those channels and attempt to answer them. As of now, it will only respond -# to messages sent directly in the channel - it will not respond to messages sent within a -# thread. -# -# NOTE: we are using Web Sockets so that you can run this from within a firewalled VPC -# without issue. 
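Editor's note: before the rewritten entrypoint below, a compressed, illustrative sketch of the connection model that replaces the single fetch_tokens() polling loop being deleted here: one TenantSocketModeClient per (tenant, Slack bot) pair, built from helpers imported earlier in this diff. The real SlackbotHandler additionally handles Redis locks, heartbeats, token changes, event-listener registration, and graceful shutdown, all omitted in this sketch.

# Sketch only: names mirror the diff; error handling and listener wiring are left out.
from danswer.danswerbot.slack.utils import TenantSocketModeClient
from danswer.db.engine import get_all_tenant_ids
from danswer.db.engine import get_session_with_tenant
from danswer.db.slack_bot import fetch_slack_bots
from danswer.server.manage.models import SlackBotTokens
from slack_sdk import WebClient


def connect_all_bots() -> dict[tuple[str | None, int], TenantSocketModeClient]:
    clients: dict[tuple[str | None, int], TenantSocketModeClient] = {}
    for tenant_id in get_all_tenant_ids():
        with get_session_with_tenant(tenant_id) as db_session:
            for bot in fetch_slack_bots(db_session=db_session):
                tokens = SlackBotTokens(bot_token=bot.bot_token, app_token=bot.app_token)
                client = TenantSocketModeClient(
                    app_token=tokens.app_token,
                    web_client=WebClient(token=tokens.bot_token),
                    tenant_id=tenant_id,
                    slack_bot_id=bot.id,
                )
                client.connect()  # one long-lived Socket Mode connection per (tenant, bot)
                clients[(tenant_id, bot.id)] = client
    return clients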
if __name__ == "__main__": - slack_bot_tokens: SlackBotTokens | None = None - socket_client: SocketModeClient | None = None + # Initialize the tenant handler which will manage tenant connections + logger.info("Starting SlackbotHandler") + tenant_handler = SlackbotHandler() set_is_ee_based_on_env_variable() - logger.notice("Verifying query preprocessing (NLTK) data is downloaded") + logger.info("Verifying query preprocessing (NLTK) data is downloaded") download_nltk_data() - while True: - try: - latest_slack_bot_tokens = fetch_tokens() - - if latest_slack_bot_tokens != slack_bot_tokens: - if slack_bot_tokens is not None: - logger.notice("Slack Bot tokens have changed - reconnecting") - else: - # This happens on the very first time the listener process comes up - # or the tokens have updated (set up for the first time) - with Session(get_sqlalchemy_engine()) as db_session: - search_settings = get_current_search_settings(db_session) - embedding_model = EmbeddingModel.from_db_model( - search_settings=search_settings, - server_host=MODEL_SERVER_HOST, - server_port=MODEL_SERVER_PORT, - ) - - warm_up_bi_encoder( - embedding_model=embedding_model, - ) - - slack_bot_tokens = latest_slack_bot_tokens - # potentially may cause a message to be dropped, but it is complicated - # to avoid + (1) if the user is changing tokens, they are likely okay with some - # "migration downtime" and (2) if a single message is lost it is okay - # as this should be a very rare occurrence - if socket_client: - socket_client.close() - - socket_client = _get_socket_client(slack_bot_tokens) - _initialize_socket_client(socket_client) - - # Let the handlers run in the background + re-check for token updates every 60 seconds - Event().wait(timeout=60) - except ConfigNotFoundError: - # try again every 30 seconds. 
This is needed since the user may add tokens - # via the UI at any point in the programs lifecycle - if we just allow it to - # fail, then the user will need to restart the containers after adding tokens - logger.debug( - "Missing Slack Bot tokens - waiting 60 seconds and trying again" - ) - if socket_client: - socket_client.disconnect() - time.sleep(60) + try: + # Keep the main thread alive + while tenant_handler.running: + time.sleep(1) + + except Exception: + logger.exception("Fatal error in main thread") + tenant_handler.shutdown(None, None) diff --git a/backend/danswer/danswerbot/slack/tokens.py b/backend/danswer/danswerbot/slack/tokens.py deleted file mode 100644 index 5de3a6a0135..00000000000 --- a/backend/danswer/danswerbot/slack/tokens.py +++ /dev/null @@ -1,28 +0,0 @@ -import os -from typing import cast - -from danswer.configs.constants import KV_SLACK_BOT_TOKENS_CONFIG_KEY -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.server.manage.models import SlackBotTokens - - -def fetch_tokens() -> SlackBotTokens: - # first check env variables - app_token = os.environ.get("DANSWER_BOT_SLACK_APP_TOKEN") - bot_token = os.environ.get("DANSWER_BOT_SLACK_BOT_TOKEN") - if app_token and bot_token: - return SlackBotTokens(app_token=app_token, bot_token=bot_token) - - dynamic_config_store = get_dynamic_config_store() - return SlackBotTokens( - **cast(dict, dynamic_config_store.load(key=KV_SLACK_BOT_TOKENS_CONFIG_KEY)) - ) - - -def save_tokens( - tokens: SlackBotTokens, -) -> None: - dynamic_config_store = get_dynamic_config_store() - dynamic_config_store.store( - key=KV_SLACK_BOT_TOKENS_CONFIG_KEY, val=dict(tokens), encrypt=True - ) diff --git a/backend/danswer/danswerbot/slack/utils.py b/backend/danswer/danswerbot/slack/utils.py index d762dde7826..cf6f1e1bfc8 100644 --- a/backend/danswer/danswerbot/slack/utils.py +++ b/backend/danswer/danswerbot/slack/utils.py @@ -3,16 +3,16 @@ import re import string import time +import uuid from typing import Any from typing import cast -from typing import Optional from retry import retry from slack_sdk import WebClient from slack_sdk.errors import SlackApiError from slack_sdk.models.blocks import Block from slack_sdk.models.metadata import Metadata -from sqlalchemy.orm import Session +from slack_sdk.socket_mode import SocketModeClient from danswer.configs.app_configs import DISABLE_TELEMETRY from danswer.configs.constants import ID_SEPARATOR @@ -30,8 +30,7 @@ from danswer.connectors.slack.utils import make_slack_api_rate_limited from danswer.connectors.slack.utils import SlackTextCleaner from danswer.danswerbot.slack.constants import FeedbackVisibility -from danswer.danswerbot.slack.tokens import fetch_tokens -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.db.users import get_user_by_email from danswer.llm.exceptions import GenAIDisabledException from danswer.llm.factory import get_default_llms @@ -47,16 +46,16 @@ logger = setup_logger() -_DANSWER_BOT_APP_ID: str | None = None +_DANSWER_BOT_SLACK_BOT_ID: str | None = None _DANSWER_BOT_MESSAGE_COUNT: int = 0 _DANSWER_BOT_COUNT_START_TIME: float = time.time() -def get_danswer_bot_app_id(web_client: WebClient) -> Any: - global _DANSWER_BOT_APP_ID - if _DANSWER_BOT_APP_ID is None: - _DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id") - return _DANSWER_BOT_APP_ID +def get_danswer_bot_slack_bot_id(web_client: WebClient) -> Any: + global _DANSWER_BOT_SLACK_BOT_ID + if _DANSWER_BOT_SLACK_BOT_ID is None: 
+ _DANSWER_BOT_SLACK_BOT_ID = web_client.auth_test().get("user_id") + return _DANSWER_BOT_SLACK_BOT_ID def check_message_limit() -> bool: @@ -137,15 +136,10 @@ def update_emote_react( def remove_danswer_bot_tag(message_str: str, client: WebClient) -> str: - bot_tag_id = get_danswer_bot_app_id(web_client=client) + bot_tag_id = get_danswer_bot_slack_bot_id(web_client=client) return re.sub(rf"<@{bot_tag_id}>\s", "", message_str) -def get_web_client() -> WebClient: - slack_tokens = fetch_tokens() - return WebClient(token=slack_tokens.bot_token) - - @retry( tries=DANSWER_BOT_NUM_RETRIES, delay=0.25, @@ -222,6 +216,13 @@ def build_feedback_id( return unique_prefix + ID_SEPARATOR + feedback_id +def build_continue_in_web_ui_id( + message_id: int, +) -> str: + unique_prefix = str(uuid.uuid4())[:10] + return unique_prefix + ID_SEPARATOR + str(message_id) + + def decompose_action_id(feedback_id: str) -> tuple[int, str | None, int | None]: """Decompose into query_id, document_id, document_rank, see above function""" try: @@ -319,7 +320,7 @@ def get_channel_name_from_id( raise e -def fetch_user_ids_from_emails( +def fetch_slack_user_ids_from_emails( user_emails: list[str], client: WebClient ) -> tuple[list[str], list[str]]: user_ids: list[str] = [] @@ -430,35 +431,58 @@ def read_slack_thread( replies = cast(dict, response.data).get("messages", []) for reply in replies: if "user" in reply and "bot_id" not in reply: - message = remove_danswer_bot_tag(reply["text"], client=client) - user_sem_id = fetch_user_semantic_id_from_id(reply["user"], client) + message = reply["text"] + user_sem_id = ( + fetch_user_semantic_id_from_id(reply.get("user"), client) + or "Unknown User" + ) message_type = MessageType.USER else: - self_app_id = get_danswer_bot_app_id(client) - - # Only include bot messages from Danswer, other bots are not taken in as context - if self_app_id != reply.get("user"): - continue - - blocks = reply["blocks"] - if len(blocks) <= 1: - continue - - # For the old flow, the useful block is the second one after the header block that says AI Answer - if reply["blocks"][0]["text"]["text"] == "AI Answer": - message = reply["blocks"][1]["text"]["text"] - else: - # for the new flow, the answer is the first block - message = reply["blocks"][0]["text"]["text"] - - if message.startswith("_Filters"): - if len(blocks) <= 2: + self_slack_bot_id = get_danswer_bot_slack_bot_id(client) + + if reply.get("user") == self_slack_bot_id: + # DanswerBot response + message_type = MessageType.ASSISTANT + user_sem_id = "Assistant" + + # DanswerBot responses have both text and blocks + # The useful content is in the blocks, specifically the first block unless there are + # auto-detected filters + blocks = reply.get("blocks") + if not blocks: + logger.warning(f"DanswerBot response has no blocks: {reply}") continue - message = reply["blocks"][2]["text"]["text"] - user_sem_id = "Assistant" - message_type = MessageType.ASSISTANT + message = blocks[0].get("text", {}).get("text") + + # If auto-detected filters are on, use the second block for the actual answer + # The first block is the auto-detected filters + if message.startswith("_Filters"): + if len(blocks) < 2: + logger.warning(f"Only filter blocks found: {reply}") + continue + # This is the DanswerBot answer format, if there is a change to how we respond, + # this will need to be updated to get the correct "answer" portion + message = reply["blocks"][1].get("text", {}).get("text") + else: + # Other bots are not counted as the LLM response which only comes from Danswer + 
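+ # so their messages are recorded as USER turns, attributed below to the bot's display name.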
message_type = MessageType.USER + bot_user_name = fetch_user_semantic_id_from_id( + reply.get("user"), client + ) + user_sem_id = (bot_user_name or "Unknown") + " Bot" + + # For other bots, just use the text as we have no way of knowing what the + # useful portion is + message = reply.get("text") + if not message: + message = blocks[0].get("text", {}).get("text") + + if not message: + logger.warning("Skipping Slack thread message, no text found") + continue + message = remove_danswer_bot_tag(message, client=client) thread_messages.append( ThreadMessage(message=message, sender=user_sem_id, role=message_type) ) @@ -466,7 +490,9 @@ def read_slack_thread( return thread_messages -def slack_usage_report(action: str, sender_id: str | None, client: WebClient) -> None: +def slack_usage_report( + action: str, sender_id: str | None, client: WebClient, tenant_id: str | None +) -> None: if DISABLE_TELEMETRY: return @@ -478,7 +504,7 @@ def slack_usage_report(action: str, sender_id: str | None, client: WebClient) -> logger.warning("Unable to find sender email") if sender_email is not None: - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: danswer_user = get_user_by_email(email=sender_email, db_session=db_session) optional_telemetry( @@ -503,7 +529,7 @@ def refill(self) -> None: self.last_reset_time = time.time() def notify( - self, client: WebClient, channel: str, position: int, thread_ts: Optional[str] + self, client: WebClient, channel: str, position: int, thread_ts: str | None ) -> None: respond_in_thread( client=client, @@ -554,3 +580,12 @@ def get_feedback_visibility() -> FeedbackVisibility: return FeedbackVisibility(DANSWER_BOT_FEEDBACK_VISIBILITY.lower()) except ValueError: return FeedbackVisibility.PRIVATE + + +class TenantSocketModeClient(SocketModeClient): + def __init__( + self, tenant_id: str | None, slack_bot_id: int, *args: Any, **kwargs: Any + ): + super().__init__(*args, **kwargs) + self.tenant_id = tenant_id + self.slack_bot_id = slack_bot_id diff --git a/backend/ee/danswer/db/api_key.py b/backend/danswer/db/api_key.py similarity index 81% rename from backend/ee/danswer/db/api_key.py rename to backend/danswer/db/api_key.py index c38f32a0f84..b4a56f3f23f 100644 --- a/backend/ee/danswer/db/api_key.py +++ b/backend/danswer/db/api_key.py @@ -2,23 +2,30 @@ from fastapi_users.password import PasswordHelper from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import joinedload from sqlalchemy.orm import Session +from danswer.auth.api_key import ApiKeyDescriptor +from danswer.auth.api_key import build_displayable_api_key +from danswer.auth.api_key import generate_api_key +from danswer.auth.api_key import hash_api_key from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN from danswer.configs.constants import DANSWER_API_KEY_PREFIX from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER from danswer.db.models import ApiKey from danswer.db.models import User -from ee.danswer.auth.api_key import ApiKeyDescriptor -from ee.danswer.auth.api_key import build_displayable_api_key -from ee.danswer.auth.api_key import generate_api_key -from ee.danswer.auth.api_key import hash_api_key -from ee.danswer.server.api_key.models import APIKeyArgs +from danswer.server.api_key.models import APIKeyArgs +from shared_configs.configs import MULTI_TENANT +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR + + +def get_api_key_email_pattern() -> str: + return
DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN def is_api_key_email_address(email: str) -> bool: - return email.endswith(f"{DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN}") + return email.endswith(get_api_key_email_pattern()) def fetch_api_keys(db_session: Session) -> list[ApiKeyDescriptor]: @@ -39,14 +46,16 @@ def fetch_api_keys(db_session: Session) -> list[ApiKeyDescriptor]: ] -def fetch_user_for_api_key(hashed_api_key: str, db_session: Session) -> User | None: - api_key = db_session.scalar( - select(ApiKey).where(ApiKey.hashed_api_key == hashed_api_key) +async def fetch_user_for_api_key( + hashed_api_key: str, async_db_session: AsyncSession +) -> User | None: + """NOTE: this is async, since it's used during auth + (which is necessarily async due to FastAPI Users)""" + return await async_db_session.scalar( + select(User) + .join(ApiKey, ApiKey.user_id == User.id) + .where(ApiKey.hashed_api_key == hashed_api_key) ) - if api_key is None: - return None - - return db_session.scalar(select(User).where(User.id == api_key.user_id)) # type: ignore def get_api_key_fake_email( @@ -60,7 +69,11 @@ def insert_api_key( db_session: Session, api_key_args: APIKeyArgs, user_id: uuid.UUID | None ) -> ApiKeyDescriptor: std_password_helper = PasswordHelper() - api_key = generate_api_key() + + # Get tenant_id from context var (will be default schema for single tenant) + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + + api_key = generate_api_key(tenant_id if MULTI_TENANT else None) api_key_user_id = uuid.uuid4() display_name = api_key_args.name or UNNAMED_KEY_PLACEHOLDER diff --git a/backend/danswer/db/auth.py b/backend/danswer/db/auth.py index 6d150b106cb..bc4047109fa 100644 --- a/backend/danswer/db/auth.py +++ b/backend/danswer/db/auth.py @@ -4,16 +4,20 @@ from typing import Dict from fastapi import Depends +from fastapi_users.models import ID from fastapi_users.models import UP from fastapi_users_db_sqlalchemy import SQLAlchemyUserDatabase from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyAccessTokenDatabase from sqlalchemy import func from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select +from sqlalchemy.orm import Session +from danswer.auth.invited_users import get_invited_users from danswer.auth.schemas import UserRole +from danswer.db.api_key import get_api_key_email_pattern from danswer.db.engine import get_async_session -from danswer.db.engine import get_sqlalchemy_async_engine +from danswer.db.engine import get_async_session_with_tenant from danswer.db.models import AccessToken from danswer.db.models import OAuthAccount from danswer.db.models import User @@ -33,10 +37,27 @@ def get_default_admin_user_emails() -> list[str]: return get_default_admin_user_emails_fn() +def get_total_users_count(db_session: Session) -> int: + """ + Returns the total number of users in the system. + This is the sum of users and invited users. 
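+ Users created for API keys and users that only exist for external permission syncing are excluded from the count.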
+ """ + user_count = ( + db_session.query(User) + .filter( + ~User.email.endswith(get_api_key_email_pattern()), # type: ignore + User.role != UserRole.EXT_PERM_USER, + ) + .count() + ) + invited_users = len(get_invited_users()) + return user_count + invited_users + + async def get_user_count() -> int: - async with AsyncSession(get_sqlalchemy_async_engine()) as asession: + async with get_async_session_with_tenant() as session: stmt = select(func.count(User.id)) - result = await asession.execute(stmt) + result = await session.execute(stmt) user_count = result.scalar() if user_count is None: raise RuntimeError("Was not able to fetch the user count.") @@ -44,8 +65,11 @@ async def get_user_count() -> int: # Need to override this because FastAPI Users doesn't give flexibility for backend field creation logic in OAuth flow -class SQLAlchemyUserAdminDB(SQLAlchemyUserDatabase): - async def create(self, create_dict: Dict[str, Any]) -> UP: +class SQLAlchemyUserAdminDB(SQLAlchemyUserDatabase[UP, ID]): + async def create( + self, + create_dict: Dict[str, Any], + ) -> UP: user_count = await get_user_count() if user_count == 0 or create_dict["email"] in get_default_admin_user_emails(): create_dict["role"] = UserRole.ADMIN diff --git a/backend/danswer/db/chat.py b/backend/danswer/db/chat.py index 8599714ce8b..73d0a886f45 100644 --- a/backend/danswer/db/chat.py +++ b/backend/danswer/db/chat.py @@ -3,6 +3,7 @@ from datetime import timedelta from uuid import UUID +from fastapi import HTTPException from sqlalchemy import delete from sqlalchemy import desc from sqlalchemy import func @@ -18,6 +19,9 @@ from danswer.chat.models import DocumentRelevance from danswer.configs.chat_configs import HARD_DELETE_CHATS from danswer.configs.constants import MessageType +from danswer.context.search.models import RetrievalDocs +from danswer.context.search.models import SavedSearchDoc +from danswer.context.search.models import SearchDoc as ServerSearchDoc from danswer.db.models import ChatMessage from danswer.db.models import ChatMessage__SearchDoc from danswer.db.models import ChatSession @@ -27,13 +31,11 @@ from danswer.db.models import SearchDoc as DBSearchDoc from danswer.db.models import ToolCall from danswer.db.models import User +from danswer.db.persona import get_best_persona_id_for_user from danswer.db.pg_file_store import delete_lobj_by_name from danswer.file_store.models import FileDescriptor from danswer.llm.override_models import LLMOverride from danswer.llm.override_models import PromptOverride -from danswer.search.models import RetrievalDocs -from danswer.search.models import SavedSearchDoc -from danswer.search.models import SearchDoc as ServerSearchDoc from danswer.server.query_and_chat.models import ChatMessageDetail from danswer.tools.tool_runner import ToolCallFinalResult from danswer.utils.logger import setup_logger @@ -43,7 +45,7 @@ def get_chat_session_by_id( - chat_session_id: int, + chat_session_id: UUID, user_id: UUID | None, db_session: Session, include_deleted: bool = False, @@ -87,9 +89,9 @@ def get_chat_sessions_by_slack_thread_id( def get_valid_messages_from_query_sessions( - chat_session_ids: list[int], + chat_session_ids: list[UUID], db_session: Session, -) -> dict[int, str]: +) -> dict[UUID, str]: user_message_subquery = ( select( ChatMessage.chat_session_id, func.min(ChatMessage.id).label("user_msg_id") @@ -196,7 +198,7 @@ def delete_orphaned_search_docs(db_session: Session) -> None: def delete_messages_and_files_from_chat_session( - chat_session_id: int, db_session: Session + 
chat_session_id: UUID, db_session: Session ) -> None: # Select messages older than cutoff_time with files messages_with_files = db_session.execute( @@ -250,10 +252,54 @@ def create_chat_session( return chat_session +def duplicate_chat_session_for_user_from_slack( + db_session: Session, + user: User | None, + chat_session_id: UUID, +) -> ChatSession: + """ + This takes a chat session id for a session in Slack and: + - Creates a new chat session in the DB + - Tries to copy the persona from the original chat session + (if it is available to the user clicking the button) + - Sets the user to the given user (if provided) + """ + chat_session = get_chat_session_by_id( + chat_session_id=chat_session_id, + user_id=None, # Ignore user permissions for this + db_session=db_session, + ) + if not chat_session: + raise HTTPException(status_code=400, detail="Invalid Chat Session ID provided") + + # This enforces permissions and sets a default + new_persona_id = get_best_persona_id_for_user( + db_session=db_session, + user=user, + persona_id=chat_session.persona_id, + ) + + return create_chat_session( + db_session=db_session, + user_id=user.id if user else None, + persona_id=new_persona_id, + # Set this to empty string so the frontend will force a rename + description="", + llm_override=chat_session.llm_override, + prompt_override=chat_session.prompt_override, + # Chat sessions from Slack should put people in the chat UI, not the search + one_shot=False, + # Chat is in UI now so this is false + danswerbot_flow=False, + # Maybe we want this in the future to track if it was created from Slack + slack_thread_id=None, + ) + + def update_chat_session( db_session: Session, user_id: UUID | None, - chat_session_id: int, + chat_session_id: UUID, description: str | None = None, sharing_status: ChatSessionSharedStatus | None = None, ) -> ChatSession: @@ -276,7 +322,7 @@ def update_chat_session( def delete_chat_session( user_id: UUID | None, - chat_session_id: int, + chat_session_id: UUID, db_session: Session, hard_delete: bool = HARD_DELETE_CHATS, ) -> None: @@ -336,8 +382,30 @@ def get_chat_message( return chat_message +def get_chat_session_by_message_id( + db_session: Session, + message_id: int, +) -> ChatSession: + """ + Should only be used for Slack + Get the chat session associated with a specific message ID + Note: this ignores permission checks. 
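+ Raises ValueError if no chat message exists with the given ID.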
+ """ + stmt = select(ChatMessage).where(ChatMessage.id == message_id) + + result = db_session.execute(stmt) + chat_message = result.scalar_one_or_none() + + if chat_message is None: + raise ValueError( + f"Unable to find chat session associated with message ID: {message_id}" + ) + + return chat_message.chat_session + + def get_chat_messages_by_sessions( - chat_session_ids: list[int], + chat_session_ids: list[UUID], user_id: UUID | None, db_session: Session, skip_permission_check: bool = False, @@ -355,6 +423,44 @@ def get_chat_messages_by_sessions( return db_session.execute(stmt).scalars().all() +def add_chats_to_session_from_slack_thread( + db_session: Session, + slack_chat_session_id: UUID, + new_chat_session_id: UUID, +) -> None: + new_root_message = get_or_create_root_message( + chat_session_id=new_chat_session_id, + db_session=db_session, + ) + + for chat_message in get_chat_messages_by_sessions( + chat_session_ids=[slack_chat_session_id], + user_id=None, # Ignore user permissions for this + db_session=db_session, + skip_permission_check=True, + ): + if chat_message.message_type == MessageType.SYSTEM: + continue + # Duplicate the message + new_root_message = create_new_chat_message( + db_session=db_session, + chat_session_id=new_chat_session_id, + parent_message=new_root_message, + message=chat_message.message, + files=chat_message.files, + rephrased_query=chat_message.rephrased_query, + error=chat_message.error, + citations=chat_message.citations, + reference_docs=chat_message.search_docs, + tool_call=chat_message.tool_call, + prompt_id=chat_message.prompt_id, + token_count=chat_message.token_count, + message_type=chat_message.message_type, + alternate_assistant_id=chat_message.alternate_assistant_id, + overridden_model=chat_message.overridden_model, + ) + + def get_search_docs_for_chat_message( chat_message_id: int, db_session: Session ) -> list[SearchDoc]: @@ -370,7 +476,7 @@ def get_search_docs_for_chat_message( def get_chat_messages_by_session( - chat_session_id: int, + chat_session_id: UUID, user_id: UUID | None, db_session: Session, skip_permission_check: bool = False, @@ -388,7 +494,7 @@ def get_chat_messages_by_session( ) if prefetch_tool_calls: - stmt = stmt.options(joinedload(ChatMessage.tool_calls)) + stmt = stmt.options(joinedload(ChatMessage.tool_call)) result = db_session.scalars(stmt).unique().all() else: result = db_session.scalars(stmt).all() @@ -397,7 +503,7 @@ def get_chat_messages_by_session( def get_or_create_root_message( - chat_session_id: int, + chat_session_id: UUID, db_session: Session, ) -> ChatMessage: try: @@ -433,7 +539,7 @@ def get_or_create_root_message( def reserve_message_id( db_session: Session, - chat_session_id: int, + chat_session_id: UUID, parent_message: int, message_type: MessageType, ) -> int: @@ -460,7 +566,7 @@ def reserve_message_id( def create_new_chat_message( - chat_session_id: int, + chat_session_id: UUID, parent_message: ChatMessage, message: str, prompt_id: int | None, @@ -474,7 +580,7 @@ def create_new_chat_message( alternate_assistant_id: int | None = None, # Maps the citation number [n] to the DB SearchDoc citations: dict[int, int] | None = None, - tool_calls: list[ToolCall] | None = None, + tool_call: ToolCall | None = None, commit: bool = True, reserved_message_id: int | None = None, overridden_model: str | None = None, @@ -494,7 +600,7 @@ def create_new_chat_message( existing_message.message_type = message_type existing_message.citations = citations existing_message.files = files - existing_message.tool_calls = tool_calls if 
tool_calls else [] + existing_message.tool_call = tool_call existing_message.error = error existing_message.alternate_assistant_id = alternate_assistant_id existing_message.overridden_model = overridden_model @@ -513,7 +619,7 @@ def create_new_chat_message( message_type=message_type, citations=citations, files=files, - tool_calls=tool_calls if tool_calls else [], + tool_call=tool_call, error=error, alternate_assistant_id=alternate_assistant_id, overridden_model=overridden_model, @@ -598,6 +704,7 @@ def get_doc_query_identifiers_from_model( chat_session: ChatSession, user_id: UUID | None, db_session: Session, + enforce_chat_session_id_for_search_docs: bool, ) -> list[tuple[str, int]]: """Given a list of search_doc_ids""" search_docs = ( @@ -617,7 +724,8 @@ def get_doc_query_identifiers_from_model( for doc in search_docs ] ): - raise ValueError("Invalid reference doc, not from this chat session.") + if enforce_chat_session_id_for_search_docs: + raise ValueError("Invalid reference doc, not from this chat session.") except IndexError: # This happens when the doc has no chat_messages associated with it. # which happens as an edge case where the chat message failed to save @@ -747,14 +855,13 @@ def translate_db_message_to_chat_message_detail( time_sent=chat_message.time_sent, citations=chat_message.citations, files=chat_message.files or [], - tool_calls=[ - ToolCallFinalResult( - tool_name=tool_call.tool_name, - tool_args=tool_call.tool_arguments, - tool_result=tool_call.tool_result, - ) - for tool_call in chat_message.tool_calls - ], + tool_call=ToolCallFinalResult( + tool_name=chat_message.tool_call.tool_name, + tool_args=chat_message.tool_call.tool_arguments, + tool_result=chat_message.tool_call.tool_result, + ) + if chat_message.tool_call + else None, alternate_assistant_id=chat_message.alternate_assistant_id, overridden_model=chat_message.overridden_model, ) diff --git a/backend/danswer/db/connector.py b/backend/danswer/db/connector.py index 89e6977103e..767a722eec4 100644 --- a/backend/danswer/db/connector.py +++ b/backend/danswer/db/connector.py @@ -1,3 +1,5 @@ +from datetime import datetime +from datetime import timezone from typing import cast from sqlalchemy import and_ @@ -246,7 +248,7 @@ def create_initial_default_connector(db_session: Session) -> None: logger.warning( "Default connector does not have expected values. Updating to proper state." 
) - # Ensure default connector has correct valuesg + # Ensure default connector has correct values default_connector.source = DocumentSource.INGESTION_API default_connector.input_type = InputType.LOAD_STATE default_connector.refresh_freq = None @@ -268,3 +270,44 @@ def create_initial_default_connector(db_session: Session) -> None: ) db_session.add(connector) db_session.commit() + + +def mark_ccpair_as_pruned(cc_pair_id: int, db_session: Session) -> None: + stmt = select(ConnectorCredentialPair).where( + ConnectorCredentialPair.id == cc_pair_id + ) + cc_pair = db_session.scalar(stmt) + if cc_pair is None: + raise ValueError(f"No cc_pair with ID: {cc_pair_id}") + + cc_pair.last_pruned = datetime.now(timezone.utc) + db_session.commit() + + +def mark_cc_pair_as_permissions_synced( + db_session: Session, cc_pair_id: int, start_time: datetime | None +) -> None: + stmt = select(ConnectorCredentialPair).where( + ConnectorCredentialPair.id == cc_pair_id + ) + cc_pair = db_session.scalar(stmt) + if cc_pair is None: + raise ValueError(f"No cc_pair with ID: {cc_pair_id}") + + cc_pair.last_time_perm_sync = start_time + db_session.commit() + + +def mark_cc_pair_as_external_group_synced(db_session: Session, cc_pair_id: int) -> None: + stmt = select(ConnectorCredentialPair).where( + ConnectorCredentialPair.id == cc_pair_id + ) + cc_pair = db_session.scalar(stmt) + if cc_pair is None: + raise ValueError(f"No cc_pair with ID: {cc_pair_id}") + + # The sync time can be marked after it ran because all group syncs + # are run in full, not polling for changes. + # If this changes, we need to update this function. + cc_pair.last_time_external_group_sync = datetime.now(timezone.utc) + db_session.commit() diff --git a/backend/danswer/db/connector_credential_pair.py b/backend/danswer/db/connector_credential_pair.py index 004b5a754e4..26730d1178f 100644 --- a/backend/danswer/db/connector_credential_pair.py +++ b/backend/danswer/db/connector_credential_pair.py @@ -12,6 +12,7 @@ from danswer.configs.constants import DocumentSource from danswer.db.connector import fetch_connector_by_id from danswer.db.credentials import fetch_credential_by_id +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.models import ConnectorCredentialPair from danswer.db.models import IndexAttempt @@ -24,6 +25,8 @@ from danswer.db.models import UserRole from danswer.server.models import StatusResponse from danswer.utils.logger import setup_logger +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop + logger = setup_logger() @@ -73,8 +76,10 @@ def _add_user_filters( .where(~UG__CCpair.user_group_id.in_(user_groups)) .correlate(ConnectorCredentialPair) ) + where_clause |= ConnectorCredentialPair.creator_id == user.id else: - where_clause |= ConnectorCredentialPair.is_public == True # noqa: E712 + where_clause |= ConnectorCredentialPair.access_type == AccessType.PUBLIC + where_clause |= ConnectorCredentialPair.access_type == AccessType.SYNC return stmt.where(where_clause) @@ -94,8 +99,7 @@ def get_connector_credential_pairs( ) # noqa if ids: stmt = stmt.where(ConnectorCredentialPair.id.in_(ids)) - results = db_session.scalars(stmt) - return list(results.all()) + return list(db_session.scalars(stmt).all()) def add_deletion_failure_message( @@ -309,9 +313,9 @@ def associate_default_cc_pair(db_session: Session) -> None: association = ConnectorCredentialPair( connector_id=0, credential_id=0, + access_type=AccessType.PUBLIC, name="DefaultCCPair", 
status=ConnectorCredentialPairStatus.ACTIVE, - is_public=True, ) db_session.add(association) db_session.commit() @@ -320,8 +324,11 @@ def associate_default_cc_pair(db_session: Session) -> None: def _relate_groups_to_cc_pair__no_commit( db_session: Session, cc_pair_id: int, - user_group_ids: list[int], + user_group_ids: list[int] | None = None, ) -> None: + if not user_group_ids: + return + for group_id in user_group_ids: db_session.add( UserGroup__ConnectorCredentialPair( @@ -336,8 +343,11 @@ def add_credential_to_connector( connector_id: int, credential_id: int, cc_pair_name: str | None, - is_public: bool, + access_type: AccessType, groups: list[int] | None, + auto_sync_options: dict | None = None, + initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE, + last_successful_index_time: datetime | None = None, ) -> StatusResponse: connector = fetch_connector_by_id(connector_id, db_session) credential = fetch_credential_by_id(credential_id, user, db_session) @@ -345,6 +355,17 @@ def add_credential_to_connector( if connector is None: raise HTTPException(status_code=404, detail="Connector does not exist") + if access_type == AccessType.SYNC: + if not fetch_ee_implementation_or_noop( + "danswer.external_permissions.sync_params", + "check_if_valid_sync_source", + noop_return_value=True, + )(connector.source): + raise HTTPException( + status_code=400, + detail=f"Connector of type {connector.source} does not support SYNC access type", + ) + if credential is None: error_msg = ( f"Credential {credential_id} does not exist or does not belong to user" @@ -371,21 +392,24 @@ def add_credential_to_connector( ) association = ConnectorCredentialPair( + creator_id=user.id if user else None, connector_id=connector_id, credential_id=credential_id, name=cc_pair_name, - status=ConnectorCredentialPairStatus.ACTIVE, - is_public=is_public, + status=initial_status, + access_type=access_type, + auto_sync_options=auto_sync_options, + last_successful_index_time=last_successful_index_time, ) db_session.add(association) db_session.flush() # make sure the association has an id + db_session.refresh(association) - if groups: - _relate_groups_to_cc_pair__no_commit( - db_session=db_session, - cc_pair_id=association.id, - user_group_ids=groups, - ) + _relate_groups_to_cc_pair__no_commit( + db_session=db_session, + cc_pair_id=association.id, + user_group_ids=groups, + ) db_session.commit() @@ -423,6 +447,13 @@ def remove_credential_from_connector( ) if association is not None: + fetch_ee_implementation_or_noop( + "danswer.db.external_perm", + "delete_user__ext_group_for_cc_pair__no_commit", + )( + db_session=db_session, + cc_pair_id=association.id, + ) db_session.delete(association) db_session.commit() return StatusResponse( diff --git a/backend/danswer/db/credentials.py b/backend/danswer/db/credentials.py index abab904cc48..4a146c5c5f4 100644 --- a/backend/danswer/db/credentials.py +++ b/backend/danswer/db/credentials.py @@ -10,10 +10,7 @@ from danswer.auth.schemas import UserRole from danswer.configs.constants import DocumentSource -from danswer.connectors.gmail.constants import ( - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, -) -from danswer.connectors.google_drive.constants import ( +from danswer.connectors.google_utils.shared_constants import ( DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, ) from danswer.db.models import ConnectorCredentialPair @@ -40,6 +37,8 @@ DocumentSource.MEDIAWIKI, } +PUBLIC_CREDENTIAL_ID = 0 + def _add_user_filters( stmt: Select, @@ -242,7 +241,6 @@ def 
create_credential( ) db_session.add(credential) db_session.flush() # This ensures the credential gets an ID - _relate_credential_to_user_groups__no_commit( db_session=db_session, credential_id=credential.id, @@ -385,12 +383,11 @@ def delete_credential( def create_initial_public_credential(db_session: Session) -> None: - public_cred_id = 0 error_msg = ( "DB is not in a valid initial state." "There must exist an empty public credential for data connectors that do not require additional Auth." ) - first_credential = fetch_credential_by_id(public_cred_id, None, db_session) + first_credential = fetch_credential_by_id(PUBLIC_CREDENTIAL_ID, None, db_session) if first_credential is not None: if first_credential.credential_json != {} or first_credential.user is not None: @@ -398,7 +395,7 @@ def create_initial_public_credential(db_session: Session) -> None: return credential = Credential( - id=public_cred_id, + id=PUBLIC_CREDENTIAL_ID, credential_json={}, user_id=None, ) @@ -406,25 +403,33 @@ def create_initial_public_credential(db_session: Session) -> None: db_session.commit() -def delete_gmail_service_account_credentials( - user: User | None, db_session: Session -) -> None: - credentials = fetch_credentials(db_session=db_session, user=user) - for credential in credentials: - if credential.credential_json.get( - GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY - ): - db_session.delete(credential) +def cleanup_gmail_credentials(db_session: Session) -> None: + gmail_credentials = fetch_credentials_by_source( + db_session=db_session, user=None, document_source=DocumentSource.GMAIL + ) + for credential in gmail_credentials: + db_session.delete(credential) + db_session.commit() + +def cleanup_google_drive_credentials(db_session: Session) -> None: + google_drive_credentials = fetch_credentials_by_source( + db_session=db_session, user=None, document_source=DocumentSource.GOOGLE_DRIVE + ) + for credential in google_drive_credentials: + db_session.delete(credential) db_session.commit() -def delete_google_drive_service_account_credentials( - user: User | None, db_session: Session +def delete_service_account_credentials( + user: User | None, db_session: Session, source: DocumentSource ) -> None: credentials = fetch_credentials(db_session=db_session, user=user) for credential in credentials: - if credential.credential_json.get(DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY): + if ( + credential.credential_json.get(DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY) + and credential.source == source + ): db_session.delete(credential) db_session.commit() diff --git a/backend/danswer/db/document.py b/backend/danswer/db/document.py index 0d5bc276bc1..8f0bfa2cac2 100644 --- a/backend/danswer/db/document.py +++ b/backend/danswer/db/document.py @@ -4,7 +4,6 @@ from collections.abc import Sequence from datetime import datetime from datetime import timezone -from uuid import UUID from sqlalchemy import and_ from sqlalchemy import delete @@ -17,14 +16,18 @@ from sqlalchemy.engine.util import TransactionalContext from sqlalchemy.exc import OperationalError from sqlalchemy.orm import Session +from sqlalchemy.sql.expression import null from danswer.configs.constants import DEFAULT_BOOST +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.feedback import delete_document_feedback_for_documents__no_commit from danswer.db.models import ConnectorCredentialPair from danswer.db.models import 
Credential from danswer.db.models import Document as DbDocument from danswer.db.models import DocumentByConnectorCredentialPair +from danswer.db.models import User from danswer.db.tag import delete_document_tags_for_documents__no_commit from danswer.db.utils import model_to_dict from danswer.document_index.interfaces import DocumentMetadata @@ -44,13 +47,21 @@ def count_documents_by_needs_sync(session: Session) -> int: """Get the count of all documents where: 1. last_modified is newer than last_synced 2. last_synced is null (meaning we've never synced) + AND the document has a relationship with a connector/credential pair + + TODO: The documents without a relationship with a connector/credential pair + should be cleaned up somehow eventually. This function executes the query and returns the count of documents matching the criteria.""" count = ( - session.query(func.count()) + session.query(func.count(DbDocument.id.distinct())) .select_from(DbDocument) + .join( + DocumentByConnectorCredentialPair, + DbDocument.id == DocumentByConnectorCredentialPair.id, + ) .filter( or_( DbDocument.last_modified > DbDocument.last_synced, @@ -89,6 +100,22 @@ def construct_document_select_for_connector_credential_pair_by_needs_sync( return stmt +def get_all_documents_needing_vespa_sync_for_cc_pair( + db_session: Session, cc_pair_id: int +) -> list[DbDocument]: + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, db_session=db_session + ) + if not cc_pair: + raise ValueError(f"No CC pair found with ID: {cc_pair_id}") + + stmt = construct_document_select_for_connector_credential_pair_by_needs_sync( + cc_pair.connector_id, cc_pair.credential_id + ) + + return list(db_session.scalars(stmt).all()) + + def construct_document_select_for_connector_credential_pair( connector_id: int, credential_id: int | None = None ) -> Select: @@ -102,6 +129,33 @@ def construct_document_select_for_connector_credential_pair( return stmt +def get_documents_for_cc_pair( + db_session: Session, + cc_pair_id: int, +) -> list[DbDocument]: + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, db_session=db_session + ) + if not cc_pair: + raise ValueError(f"No CC pair found with ID: {cc_pair_id}") + stmt = construct_document_select_for_connector_credential_pair( + connector_id=cc_pair.connector_id, credential_id=cc_pair.credential_id + ) + return list(db_session.scalars(stmt).all()) + + +def get_document_ids_for_connector_credential_pair( + db_session: Session, connector_id: int, credential_id: int, limit: int | None = None +) -> list[str]: + doc_ids_stmt = select(DocumentByConnectorCredentialPair.id).where( + and_( + DocumentByConnectorCredentialPair.connector_id == connector_id, + DocumentByConnectorCredentialPair.credential_id == credential_id, + ) + ) + return list(db_session.execute(doc_ids_stmt).scalars().all()) + + def get_documents_for_connector_credential_pair( db_session: Session, connector_id: int, credential_id: int, limit: int | None = None ) -> Sequence[DbDocument]: @@ -118,8 +172,8 @@ def get_documents_for_connector_credential_pair( def get_documents_by_ids( - document_ids: list[str], db_session: Session, + document_ids: list[str], ) -> list[DbDocument]: stmt = select(DbDocument).where(DbDocument.id.in_(document_ids)) documents = db_session.execute(stmt).scalars().all() @@ -155,6 +209,7 @@ def get_document_connector_counts( def get_document_counts_for_cc_pairs( db_session: Session, cc_pair_identifiers: list[ConnectorCredentialPairIdentifier] ) -> Sequence[tuple[int, int, int]]: + 
"""Returns a sequence of tuples of (connector_id, credential_id, document count)""" stmt = ( select( DocumentByConnectorCredentialPair.connector_id, @@ -186,16 +241,14 @@ def get_document_counts_for_cc_pairs( def get_access_info_for_document( db_session: Session, document_id: str, -) -> tuple[str, list[UUID | None], bool] | None: +) -> tuple[str, list[str | None], bool] | None: """Gets access info for a single document by calling the get_access_info_for_documents function and passing a list with a single document ID. - Args: db_session (Session): The database session to use. document_id (str): The document ID to fetch access info for. - Returns: - Optional[Tuple[str, List[UUID | None], bool]]: A tuple containing the document ID, a list of user IDs, + Optional[Tuple[str, List[str | None], bool]]: A tuple containing the document ID, a list of user emails, and a boolean indicating if the document is globally public, or None if no results are found. """ results = get_access_info_for_documents(db_session, [document_id]) @@ -208,19 +261,27 @@ def get_access_info_for_document( def get_access_info_for_documents( db_session: Session, document_ids: list[str], -) -> Sequence[tuple[str, list[UUID | None], bool]]: +) -> Sequence[tuple[str, list[str | None], bool]]: """Gets back all relevant access info for the given documents. This includes the user_ids for cc pairs that the document is associated with + whether any of the associated cc pairs are intending to make the document globally public. + Returns the list where each element contains: + - Document ID (which is also the ID of the DocumentByConnectorCredentialPair) + - List of emails of Danswer users with direct access to the doc (includes a "None" element if + the connector was set up by an admin when auth was off + - bool for whether the document is public (the document later can also be marked public by + automatic permission sync step) """ + stmt = select( + DocumentByConnectorCredentialPair.id, + func.array_agg(func.coalesce(User.email, null())).label("user_emails"), + func.bool_or(ConnectorCredentialPair.access_type == AccessType.PUBLIC).label( + "public_doc" + ), + ).where(DocumentByConnectorCredentialPair.id.in_(document_ids)) + stmt = ( - select( - DocumentByConnectorCredentialPair.id, - func.array_agg(Credential.user_id).label("user_ids"), - func.bool_or(ConnectorCredentialPair.is_public).label("public_doc"), - ) - .where(DocumentByConnectorCredentialPair.id.in_(document_ids)) - .join( + stmt.join( Credential, DocumentByConnectorCredentialPair.credential_id == Credential.id, ) @@ -233,6 +294,13 @@ def get_access_info_for_documents( == ConnectorCredentialPair.credential_id, ), ) + .outerjoin( + User, + and_( + Credential.user_id == User.id, + ConnectorCredentialPair.access_type != AccessType.SYNC, + ), + ) # don't include CC pairs that are being deleted # NOTE: CC pairs can never go from DELETING to any other state -> it's safe to ignore them .where(ConnectorCredentialPair.status != ConnectorCredentialPairStatus.DELETING) @@ -278,31 +346,43 @@ def upsert_documents( for doc in seen_documents.values() ] ) - # for now, there are no columns to update. If more metadata is added, then this - # needs to change to an `on_conflict_do_update` - on_conflict_stmt = insert_stmt.on_conflict_do_nothing() + + # This does not update the permissions of the document if + # the document already exists. 
+ on_conflict_stmt = insert_stmt.on_conflict_do_update( + index_elements=["id"], # Conflict target + set_={ + "from_ingestion_api": insert_stmt.excluded.from_ingestion_api, + "boost": insert_stmt.excluded.boost, + "hidden": insert_stmt.excluded.hidden, + "semantic_id": insert_stmt.excluded.semantic_id, + "link": insert_stmt.excluded.link, + "primary_owners": insert_stmt.excluded.primary_owners, + "secondary_owners": insert_stmt.excluded.secondary_owners, + }, + ) db_session.execute(on_conflict_stmt) db_session.commit() def upsert_document_by_connector_credential_pair( - db_session: Session, document_metadata_batch: list[DocumentMetadata] + db_session: Session, connector_id: int, credential_id: int, document_ids: list[str] ) -> None: """NOTE: this function is Postgres specific. Not all DBs support the ON CONFLICT clause.""" - if not document_metadata_batch: - logger.info("`document_metadata_batch` is empty. Skipping.") + if not document_ids: + logger.info("`document_ids` is empty. Skipping.") return insert_stmt = insert(DocumentByConnectorCredentialPair).values( [ model_to_dict( DocumentByConnectorCredentialPair( - id=document_metadata.document_id, - connector_id=document_metadata.connector_id, - credential_id=document_metadata.credential_id, + id=doc_id, + connector_id=connector_id, + credential_id=credential_id, ) ) - for document_metadata in document_metadata_batch + for doc_id in document_ids ] ) # for now, there are no columns to update. If more metadata is added, then this @@ -338,26 +418,29 @@ def update_docs_last_modified__no_commit( doc.last_modified = now -def mark_document_as_synced(document_id: str, db_session: Session) -> None: +def mark_document_as_modified( + document_id: str, + db_session: Session, +) -> None: stmt = select(DbDocument).where(DbDocument.id == document_id) doc = db_session.scalar(stmt) if doc is None: raise ValueError(f"No document with ID: {document_id}") # update last_synced - doc.last_synced = datetime.now(timezone.utc) + doc.last_modified = datetime.now(timezone.utc) db_session.commit() -def upsert_documents_complete( - db_session: Session, - document_metadata_batch: list[DocumentMetadata], -) -> None: - upsert_documents(db_session, document_metadata_batch) - upsert_document_by_connector_credential_pair(db_session, document_metadata_batch) - logger.info( - f"Upserted {len(document_metadata_batch)} document store entries into DB" - ) +def mark_document_as_synced(document_id: str, db_session: Session) -> None: + stmt = select(DbDocument).where(DbDocument.id == document_id) + doc = db_session.scalar(stmt) + if doc is None: + raise ValueError(f"No document with ID: {document_id}") + + # update last_synced + doc.last_synced = datetime.now(timezone.utc) + db_session.commit() def delete_document_by_connector_credential_pair__no_commit( @@ -412,7 +495,6 @@ def delete_documents_complete__no_commit( db_session: Session, document_ids: list[str] ) -> None: """This completely deletes the documents from the db, including all foreign key relationships""" - logger.info(f"Deleting {len(document_ids)} documents from the DB") delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids) delete_document_feedback_for_documents__no_commit( document_ids=document_ids, db_session=db_session @@ -469,7 +551,7 @@ def prepare_to_modify_documents( db_session.commit() # ensure that we're not in a transaction lock_acquired = False - for _ in range(_NUM_LOCK_ATTEMPTS): + for i in range(_NUM_LOCK_ATTEMPTS): try: with db_session.begin() as transaction: lock_acquired = 
acquire_document_locks( @@ -480,7 +562,7 @@ def prepare_to_modify_documents( break except OperationalError as e: logger.warning( - f"Failed to acquire locks for documents, retrying. Error: {e}" + f"Failed to acquire locks for documents on attempt {i}, retrying. Error: {e}" ) time.sleep(retry_delay) diff --git a/backend/danswer/db/document_set.py b/backend/danswer/db/document_set.py index a8c1e4ebb1d..b5af99b22d4 100644 --- a/backend/danswer/db/document_set.py +++ b/backend/danswer/db/document_set.py @@ -14,6 +14,7 @@ from danswer.db.connector_credential_pair import get_cc_pair_groups_for_ids from danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Document @@ -180,7 +181,7 @@ def _check_if_cc_pairs_are_owned_by_groups( ids=missing_cc_pair_ids, ) for cc_pair in cc_pairs: - if not cc_pair.is_public: + if cc_pair.access_type != AccessType.PUBLIC: raise ValueError( f"Connector Credential Pair with ID: '{cc_pair.id}'" " is not owned by the specified groups" @@ -397,7 +398,7 @@ def mark_document_set_as_to_be_deleted( def delete_document_set_cc_pair_relationship__no_commit( connector_id: int, credential_id: int, db_session: Session -) -> None: +) -> int: """Deletes all rows from DocumentSet__ConnectorCredentialPair where the connector_credential_pair_id matches the given cc_pair_id.""" delete_stmt = delete(DocumentSet__ConnectorCredentialPair).where( @@ -408,7 +409,8 @@ def delete_document_set_cc_pair_relationship__no_commit( == ConnectorCredentialPair.id, ) ) - db_session.execute(delete_stmt) + result = db_session.execute(delete_stmt) + return result.rowcount # type: ignore def fetch_document_sets( @@ -704,7 +706,7 @@ def check_document_sets_are_public( ConnectorCredentialPair.id.in_( connector_credential_pair_ids # type:ignore ), - ConnectorCredentialPair.is_public.is_(False), + ConnectorCredentialPair.access_type != AccessType.PUBLIC, ) .limit(1) .first() diff --git a/backend/danswer/db/engine.py b/backend/danswer/db/engine.py index 94b5d0123cc..5d4753e136a 100644 --- a/backend/danswer/db/engine.py +++ b/backend/danswer/db/engine.py @@ -1,10 +1,18 @@ import contextlib +import re +import threading import time from collections.abc import AsyncGenerator from collections.abc import Generator +from contextlib import asynccontextmanager +from contextlib import contextmanager from datetime import datetime +from typing import Any from typing import ContextManager +import jwt +from fastapi import HTTPException +from fastapi import Request from sqlalchemy import event from sqlalchemy import text from sqlalchemy.engine import create_engine @@ -17,34 +25,36 @@ from danswer.configs.app_configs import LOG_POSTGRES_CONN_COUNTS from danswer.configs.app_configs import LOG_POSTGRES_LATENCY +from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_OVERFLOW +from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_SIZE from danswer.configs.app_configs import POSTGRES_DB from danswer.configs.app_configs import POSTGRES_HOST +from danswer.configs.app_configs import POSTGRES_IDLE_SESSIONS_TIMEOUT from danswer.configs.app_configs import POSTGRES_PASSWORD from danswer.configs.app_configs import POSTGRES_POOL_PRE_PING from danswer.configs.app_configs import POSTGRES_POOL_RECYCLE from danswer.configs.app_configs import POSTGRES_PORT from danswer.configs.app_configs import POSTGRES_USER +from 
danswer.configs.app_configs import USER_AUTH_SECRET from danswer.configs.constants import POSTGRES_UNKNOWN_APP_NAME from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.configs import TENANT_ID_PREFIX +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR logger = setup_logger() SYNC_DB_API = "psycopg2" ASYNC_DB_API = "asyncpg" -POSTGRES_APP_NAME = ( - POSTGRES_UNKNOWN_APP_NAME # helps to diagnose open connections in postgres -) - # global so we don't create more than one engine per process # outside of being best practice, this is needed so we can properly pool # connections and not create a new pool on every request -_SYNC_ENGINE: Engine | None = None -_ASYNC_ENGINE: AsyncEngine | None = None +_ASYNC_ENGINE: AsyncEngine | None = None SessionFactory: sessionmaker[Session] | None = None - if LOG_POSTGRES_LATENCY: # Function to log before query execution @event.listens_for(Engine, "before_cursor_execute") @@ -108,6 +118,108 @@ def get_db_current_time(db_session: Session) -> datetime: return result +# Regular expression to validate schema names to prevent SQL injection +SCHEMA_NAME_REGEX = re.compile(r"^[a-zA-Z0-9_-]+$") + + +def is_valid_schema_name(name: str) -> bool: + return SCHEMA_NAME_REGEX.match(name) is not None + + +class SqlEngine: + """Class to manage a global SQLAlchemy engine (needed for proper resource control). + Will eventually subsume most of the standalone functions in this file. + Sync only for now. + """ + + _engine: Engine | None = None + _lock: threading.Lock = threading.Lock() + _app_name: str = POSTGRES_UNKNOWN_APP_NAME + + # Default parameters for engine creation + DEFAULT_ENGINE_KWARGS = { + "pool_size": 20, + "max_overflow": 5, + "pool_pre_ping": POSTGRES_POOL_PRE_PING, + "pool_recycle": POSTGRES_POOL_RECYCLE, + } + + def __init__(self) -> None: + pass + + @classmethod + def _init_engine(cls, **engine_kwargs: Any) -> Engine: + """Private helper method to create and return an Engine.""" + connection_string = build_connection_string( + db_api=SYNC_DB_API, app_name=cls._app_name + "_sync" + ) + merged_kwargs = {**cls.DEFAULT_ENGINE_KWARGS, **engine_kwargs} + return create_engine(connection_string, **merged_kwargs) + + @classmethod + def init_engine(cls, **engine_kwargs: Any) -> None: + """Allow the caller to init the engine with extra params. Different clients + such as the API server and different Celery workers and tasks + need different settings. + """ + with cls._lock: + if not cls._engine: + cls._engine = cls._init_engine(**engine_kwargs) + + @classmethod + def get_engine(cls) -> Engine: + """Gets the SQLAlchemy engine. Will init a default engine if init hasn't + already been called. You probably want to init first! 
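+        A typical startup sequence (sketch; the app name and pool sizes are illustrative):
+            SqlEngine.set_app_name("api_server")
+            SqlEngine.init_engine(pool_size=20, max_overflow=5)
+            engine = SqlEngine.get_engine()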
+ """ + if not cls._engine: + with cls._lock: + if not cls._engine: + cls._engine = cls._init_engine() + return cls._engine + + @classmethod + def set_app_name(cls, app_name: str) -> None: + """Class method to set the app name.""" + cls._app_name = app_name + + @classmethod + def get_app_name(cls) -> str: + """Class method to get current app name.""" + if not cls._app_name: + return "" + return cls._app_name + + @classmethod + def reset_engine(cls) -> None: + with cls._lock: + if cls._engine: + cls._engine.dispose() + cls._engine = None + + +def get_all_tenant_ids() -> list[str] | list[None]: + if not MULTI_TENANT: + return [None] + with get_session_with_tenant(tenant_id=POSTGRES_DEFAULT_SCHEMA) as session: + result = session.execute( + text( + f""" + SELECT schema_name + FROM information_schema.schemata + WHERE schema_name NOT IN ('pg_catalog', 'information_schema', '{POSTGRES_DEFAULT_SCHEMA}')""" + ) + ) + tenant_ids = [row[0] for row in result] + + valid_tenants = [ + tenant + for tenant in tenant_ids + if tenant is None or tenant.startswith(TENANT_ID_PREFIX) + ] + + return valid_tenants + + def build_connection_string( *, db_api: str = ASYNC_DB_API, @@ -120,69 +232,239 @@ def build_connection_string( ) -> str: if app_name: return f"postgresql+{db_api}://{user}:{password}@{host}:{port}/{db}?application_name={app_name}" - return f"postgresql+{db_api}://{user}:{password}@{host}:{port}/{db}" -def init_sqlalchemy_engine(app_name: str) -> None: - global POSTGRES_APP_NAME - POSTGRES_APP_NAME = app_name - - def get_sqlalchemy_engine() -> Engine: - global _SYNC_ENGINE - if _SYNC_ENGINE is None: - connection_string = build_connection_string( - db_api=SYNC_DB_API, app_name=POSTGRES_APP_NAME + "_sync" - ) - _SYNC_ENGINE = create_engine( - connection_string, - pool_size=40, - max_overflow=10, - pool_pre_ping=POSTGRES_POOL_PRE_PING, - pool_recycle=POSTGRES_POOL_RECYCLE, - ) - return _SYNC_ENGINE + return SqlEngine.get_engine() def get_sqlalchemy_async_engine() -> AsyncEngine: global _ASYNC_ENGINE if _ASYNC_ENGINE is None: - # underlying asyncpg cannot accept application_name directly in the connection string + # Underlying asyncpg cannot accept application_name directly in the connection string # https://github.com/MagicStack/asyncpg/issues/798 connection_string = build_connection_string() _ASYNC_ENGINE = create_async_engine( connection_string, connect_args={ - "server_settings": {"application_name": POSTGRES_APP_NAME + "_async"} + "server_settings": { + "application_name": SqlEngine.get_app_name() + "_async" + } }, - pool_size=40, - max_overflow=10, + # async engine is only used by API server, so we can use those values + # here as well + pool_size=POSTGRES_API_SERVER_POOL_SIZE, + max_overflow=POSTGRES_API_SERVER_POOL_OVERFLOW, pool_pre_ping=POSTGRES_POOL_PRE_PING, pool_recycle=POSTGRES_POOL_RECYCLE, ) return _ASYNC_ENGINE -def get_session_context_manager() -> ContextManager[Session]: - return contextlib.contextmanager(get_session)() +# Dependency to get the current tenant ID +# If no token is present, uses the default schema for this use case +def get_current_tenant_id(request: Request) -> str: + """Dependency that extracts the tenant ID from the JWT token in the request and sets the context variable.""" + if not MULTI_TENANT: + tenant_id = POSTGRES_DEFAULT_SCHEMA + CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + return tenant_id + + token = request.cookies.get("fastapiusersauth") + if not token: + current_value = CURRENT_TENANT_ID_CONTEXTVAR.get() + # If no token is present, use the default schema 
or handle accordingly + return current_value + + try: + payload = jwt.decode( + token, + USER_AUTH_SECRET, + audience=["fastapi-users:auth"], + algorithms=["HS256"], + ) + tenant_id = payload.get("tenant_id", POSTGRES_DEFAULT_SCHEMA) + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID format") + CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + return tenant_id + except jwt.InvalidTokenError: + return CURRENT_TENANT_ID_CONTEXTVAR.get() + except Exception as e: + logger.error(f"Unexpected error in get_current_tenant_id: {str(e)}") + raise HTTPException(status_code=500, detail="Internal server error") + + +@asynccontextmanager +async def get_async_session_with_tenant( + tenant_id: str | None = None, +) -> AsyncGenerator[AsyncSession, None]: + if tenant_id is None: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + + if not is_valid_schema_name(tenant_id): + logger.error(f"Invalid tenant ID: {tenant_id}") + raise Exception("Invalid tenant ID") + + engine = get_sqlalchemy_async_engine() + async_session_factory = sessionmaker( + bind=engine, expire_on_commit=False, class_=AsyncSession + ) # type: ignore + + async with async_session_factory() as session: + try: + # Set the search_path to the tenant's schema + await session.execute(text(f'SET search_path = "{tenant_id}"')) + if POSTGRES_IDLE_SESSIONS_TIMEOUT: + await session.execute( + text( + f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}" + ) + ) + except Exception: + logger.exception("Error setting search_path.") + # You can choose to re-raise the exception or handle it + # Here, we'll re-raise to prevent proceeding with an incorrect session + raise + else: + yield session + + +@contextmanager +def get_session_with_default_tenant() -> Generator[Session, None, None]: + """ + Get a database session using the current tenant ID from the context variable. + """ + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + with get_session_with_tenant(tenant_id) as session: + yield session + + +@contextmanager +def get_session_with_tenant( + tenant_id: str | None = None, +) -> Generator[Session, None, None]: + """ + Generate a database session for a specific tenant. + + This function: + 1. Sets the database schema to the specified tenant's schema. + 2. Preserves the tenant ID across the session. + 3. Reverts to the previous tenant ID after the session is closed. + 4. Uses the default schema if no tenant ID is provided. 
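+    Usage sketch (the tenant schema name shown is hypothetical):
+        with get_session_with_tenant("tenant_abc") as db_session:
+            db_session.execute(text("SELECT 1"))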
+ """ + engine = get_sqlalchemy_engine() + + # Store the previous tenant ID + previous_tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() or POSTGRES_DEFAULT_SCHEMA + + if tenant_id is None: + tenant_id = POSTGRES_DEFAULT_SCHEMA + + CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + event.listen(engine, "checkout", set_search_path_on_checkout) + + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID") + + try: + # Establish a raw connection + with engine.connect() as connection: + # Access the raw DBAPI connection and set the search_path + dbapi_connection = connection.connection + + # Set the search_path outside of any transaction + cursor = dbapi_connection.cursor() + try: + cursor.execute(f'SET search_path = "{tenant_id}"') + if POSTGRES_IDLE_SESSIONS_TIMEOUT: + # Raw DBAPI cursors expect a plain SQL string, not a SQLAlchemy text() construct + cursor.execute( + f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}" + ) + finally: + cursor.close() + + # Bind the session to the connection + with Session(bind=connection, expire_on_commit=False) as session: + try: + yield session + finally: + # Reset search_path to default after the session is used + if MULTI_TENANT: + cursor = dbapi_connection.cursor() + try: + cursor.execute('SET search_path TO "$user", public') + finally: + cursor.close() + + finally: + # Restore the previous tenant ID + CURRENT_TENANT_ID_CONTEXTVAR.set(previous_tenant_id) + + +def set_search_path_on_checkout( + dbapi_conn: Any, connection_record: Any, connection_proxy: Any +) -> None: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + if tenant_id and is_valid_schema_name(tenant_id): + with dbapi_conn.cursor() as cursor: + cursor.execute(f'SET search_path TO "{tenant_id}"') + + +def get_session_generator_with_tenant() -> Generator[Session, None, None]: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + with get_session_with_tenant(tenant_id) as session: + yield session def get_session() -> Generator[Session, None, None]: - # The line below was added to monitor the latency caused by Postgres connections - # during API calls.
- # with tracer.trace("db.get_session"): - with Session(get_sqlalchemy_engine(), expire_on_commit=False) as session: + """Generate a database session with the appropriate tenant schema set.""" + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + if tenant_id == POSTGRES_DEFAULT_SCHEMA and MULTI_TENANT: + raise HTTPException(status_code=401, detail="User must authenticate") + + engine = get_sqlalchemy_engine() + + with Session(engine, expire_on_commit=False) as session: + if MULTI_TENANT: + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID") + # Set the search_path to the tenant's schema + session.execute(text(f'SET search_path = "{tenant_id}"')) yield session async def get_async_session() -> AsyncGenerator[AsyncSession, None]: - async with AsyncSession( - get_sqlalchemy_async_engine(), expire_on_commit=False - ) as async_session: + """Generate an async database session with the appropriate tenant schema set.""" + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + engine = get_sqlalchemy_async_engine() + async with AsyncSession(engine, expire_on_commit=False) as async_session: + if MULTI_TENANT: + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID") + # Set the search_path to the tenant's schema + await async_session.execute(text(f'SET search_path = "{tenant_id}"')) yield async_session +def get_session_context_manager() -> ContextManager[Session]: + """Context manager for database sessions.""" + return contextlib.contextmanager(get_session_generator_with_tenant)() + + +def get_session_factory() -> sessionmaker[Session]: + """Get a session factory.""" + global SessionFactory + if SessionFactory is None: + SessionFactory = sessionmaker(bind=get_sqlalchemy_engine()) + return SessionFactory + + async def warm_up_connections( sync_connections_to_warm_up: int = 20, async_connections_to_warm_up: int = 20 ) -> None: @@ -204,10 +486,3 @@ async def warm_up_connections( await async_conn.execute(text("SELECT 1")) for async_conn in async_connections: await async_conn.close() - - -def get_session_factory() -> sessionmaker[Session]: - global SessionFactory - if SessionFactory is None: - SessionFactory = sessionmaker(bind=get_sqlalchemy_engine()) - return SessionFactory diff --git a/backend/danswer/db/enums.py b/backend/danswer/db/enums.py index eac048e10ab..b1905d4e785 100644 --- a/backend/danswer/db/enums.py +++ b/backend/danswer/db/enums.py @@ -5,6 +5,7 @@ class IndexingStatus(str, PyEnum): NOT_STARTED = "not_started" IN_PROGRESS = "in_progress" SUCCESS = "success" + CANCELED = "canceled" FAILED = "failed" COMPLETED_WITH_ERRORS = "completed_with_errors" @@ -12,6 +13,7 @@ def is_terminal(self) -> bool: terminal_states = { IndexingStatus.SUCCESS, IndexingStatus.COMPLETED_WITH_ERRORS, + IndexingStatus.CANCELED, IndexingStatus.FAILED, } return self in terminal_states @@ -51,3 +53,9 @@ class ConnectorCredentialPairStatus(str, PyEnum): def is_active(self) -> bool: return self == ConnectorCredentialPairStatus.ACTIVE + + +class AccessType(str, PyEnum): + PUBLIC = "public" + PRIVATE = "private" + SYNC = "sync" diff --git a/backend/danswer/db/feedback.py b/backend/danswer/db/feedback.py index 6df1f1f5051..219e2474729 100644 --- a/backend/danswer/db/feedback.py +++ b/backend/danswer/db/feedback.py @@ -16,6 +16,7 @@ from danswer.configs.constants import MessageType from danswer.configs.constants import SearchFeedbackType from danswer.db.chat import get_chat_message +from danswer.db.enums import AccessType from 
danswer.db.models import ChatMessageFeedback from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Document as DbDocument @@ -94,7 +95,7 @@ def _add_user_filters( .correlate(CCPair) ) else: - where_clause |= CCPair.is_public == True # noqa: E712 + where_clause |= CCPair.access_type == AccessType.PUBLIC return stmt.where(where_clause) diff --git a/backend/danswer/db/index_attempt.py b/backend/danswer/db/index_attempt.py index 32e20d065c0..06bbee10559 100644 --- a/backend/danswer/db/index_attempt.py +++ b/backend/danswer/db/index_attempt.py @@ -1,4 +1,7 @@ from collections.abc import Sequence +from datetime import datetime +from datetime import timedelta +from datetime import timezone from sqlalchemy import and_ from sqlalchemy import delete @@ -19,8 +22,6 @@ from danswer.server.documents.models import ConnectorCredentialPair from danswer.server.documents.models import ConnectorCredentialPairIdentifier from danswer.utils.logger import setup_logger -from danswer.utils.telemetry import optional_telemetry -from danswer.utils.telemetry import RecordType logger = setup_logger() @@ -66,7 +67,40 @@ def create_index_attempt( return new_attempt.id -def get_inprogress_index_attempts( +def delete_index_attempt(db_session: Session, index_attempt_id: int) -> None: + index_attempt = get_index_attempt(db_session, index_attempt_id) + if index_attempt: + db_session.delete(index_attempt) + db_session.commit() + + +def mock_successful_index_attempt( + connector_credential_pair_id: int, + search_settings_id: int, + docs_indexed: int, + db_session: Session, +) -> int: + """Should not be used in any user triggered flows""" + db_time = func.now() + new_attempt = IndexAttempt( + connector_credential_pair_id=connector_credential_pair_id, + search_settings_id=search_settings_id, + from_beginning=True, + status=IndexingStatus.SUCCESS, + total_docs_indexed=docs_indexed, + new_docs_indexed=docs_indexed, + # Need this to be some convincing random looking value and it can't be 0 + # or the indexing rate would calculate out to infinity + time_started=db_time - timedelta(seconds=1.92), + time_updated=db_time, + ) + db_session.add(new_attempt) + db_session.commit() + + return new_attempt.id + + +def get_in_progress_index_attempts( connector_id: int | None, db_session: Session, ) -> list[IndexAttempt]: @@ -81,13 +115,15 @@ def get_inprogress_index_attempts( return list(incomplete_attempts.all()) -def get_not_started_index_attempts(db_session: Session) -> list[IndexAttempt]: +def get_all_index_attempts_by_status( + status: IndexingStatus, db_session: Session +) -> list[IndexAttempt]: """This eagerly loads the connector and credential so that the db_session can be expired before running long-living indexing jobs, which causes increasing memory usage. 
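A quick aside on why the eager load matters: once these objects leave their session, only relationships that were loaded up front remain readable, and touching an unloaded lazy relationship raises DetachedInstanceError. A hedged sketch of the general pattern, separate from the actual indexing flow (the model import path is assumed from the rest of this diff):

    from sqlalchemy import select
    from sqlalchemy.orm import Session, joinedload

    from danswer.db.models import IndexAttempt  # assumed import path


    def load_detached_attempts(db_session: Session) -> list[IndexAttempt]:
        stmt = select(IndexAttempt).options(
            joinedload(IndexAttempt.connector_credential_pair)
        )
        attempts = list(db_session.scalars(stmt).all())
        # Detaching is safe here: the eagerly loaded relationship stays populated,
        # whereas an unloaded lazy relationship would error once detached.
        db_session.expunge_all()
        return attempts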
Results are ordered by time_created (oldest to newest).""" stmt = select(IndexAttempt) - stmt = stmt.where(IndexAttempt.status == IndexingStatus.NOT_STARTED) + stmt = stmt.where(IndexAttempt.status == status) stmt = stmt.order_by(IndexAttempt.time_created) stmt = stmt.options( joinedload(IndexAttempt.connector_credential_pair).joinedload( @@ -101,47 +137,138 @@ def get_not_started_index_attempts(db_session: Session) -> list[IndexAttempt]: return list(new_attempts.all()) +def transition_attempt_to_in_progress( + index_attempt_id: int, + db_session: Session, +) -> IndexAttempt: + """Locks the row when we try to update""" + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt_id) + .with_for_update() + ).scalar_one() + + if attempt is None: + raise RuntimeError( + f"Unable to find IndexAttempt for ID '{index_attempt_id}'" + ) + + if attempt.status != IndexingStatus.NOT_STARTED: + raise RuntimeError( + f"Indexing attempt with ID '{index_attempt_id}' is not in NOT_STARTED status. " + f"Current status is '{attempt.status}'." + ) + + attempt.status = IndexingStatus.IN_PROGRESS + attempt.time_started = attempt.time_started or func.now() # type: ignore + db_session.commit() + return attempt + except Exception: + db_session.rollback() + logger.exception("transition_attempt_to_in_progress exceptioned.") + raise + + def mark_attempt_in_progress( index_attempt: IndexAttempt, db_session: Session, ) -> None: - index_attempt.status = IndexingStatus.IN_PROGRESS - index_attempt.time_started = index_attempt.time_started or func.now() # type: ignore - db_session.commit() + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt.id) + .with_for_update() + ).scalar_one() + + attempt.status = IndexingStatus.IN_PROGRESS + attempt.time_started = index_attempt.time_started or func.now() # type: ignore + db_session.commit() + except Exception: + db_session.rollback() + raise def mark_attempt_succeeded( index_attempt: IndexAttempt, db_session: Session, ) -> None: - index_attempt.status = IndexingStatus.SUCCESS - db_session.add(index_attempt) - db_session.commit() + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt.id) + .with_for_update() + ).scalar_one() + + attempt.status = IndexingStatus.SUCCESS + db_session.commit() + except Exception: + db_session.rollback() + raise def mark_attempt_partially_succeeded( index_attempt: IndexAttempt, db_session: Session, ) -> None: - index_attempt.status = IndexingStatus.COMPLETED_WITH_ERRORS - db_session.add(index_attempt) - db_session.commit() + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt.id) + .with_for_update() + ).scalar_one() + + attempt.status = IndexingStatus.COMPLETED_WITH_ERRORS + db_session.commit() + except Exception: + db_session.rollback() + raise + + +def mark_attempt_canceled( + index_attempt_id: int, + db_session: Session, + reason: str = "Unknown", +) -> None: + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt_id) + .with_for_update() + ).scalar_one() + + if not attempt.time_started: + attempt.time_started = datetime.now(timezone.utc) + attempt.status = IndexingStatus.CANCELED + attempt.error_msg = reason + db_session.commit() + except Exception: + db_session.rollback() + raise def mark_attempt_failed( - index_attempt: IndexAttempt, + index_attempt_id: int, db_session: Session, failure_reason: str = "Unknown", 
full_exception_trace: str | None = None, ) -> None: - index_attempt.status = IndexingStatus.FAILED - index_attempt.error_msg = failure_reason - index_attempt.full_exception_trace = full_exception_trace - db_session.add(index_attempt) - db_session.commit() - - source = index_attempt.connector_credential_pair.connector.source - optional_telemetry(record_type=RecordType.FAILURE, data={"connector": source}) + try: + attempt = db_session.execute( + select(IndexAttempt) + .where(IndexAttempt.id == index_attempt_id) + .with_for_update() + ).scalar_one() + + if not attempt.time_started: + attempt.time_started = datetime.now(timezone.utc) + attempt.status = IndexingStatus.FAILED + attempt.error_msg = failure_reason + attempt.full_exception_trace = full_exception_trace + db_session.commit() + except Exception: + db_session.rollback() + raise def update_docs_indexed( @@ -435,14 +562,13 @@ def cancel_indexing_attempts_for_ccpair( db_session.execute(stmt) - db_session.commit() - def cancel_indexing_attempts_past_model( db_session: Session, ) -> None: """Stops all indexing attempts that are in progress or not started for any embedding model that not present/future""" + db_session.execute( update(IndexAttempt) .where( @@ -455,8 +581,6 @@ def cancel_indexing_attempts_past_model( .values(status=IndexingStatus.FAILED) ) - db_session.commit() - def count_unique_cc_pairs_with_successful_index_attempts( search_settings_id: int | None, diff --git a/backend/danswer/db/llm.py b/backend/danswer/db/llm.py index 36d05948be5..b01fd81079c 100644 --- a/backend/danswer/db/llm.py +++ b/backend/danswer/db/llm.py @@ -62,7 +62,8 @@ def upsert_cloud_embedding_provider( def upsert_llm_provider( - llm_provider: LLMProviderUpsertRequest, db_session: Session + llm_provider: LLMProviderUpsertRequest, + db_session: Session, ) -> FullLLMProvider: existing_llm_provider = db_session.scalar( select(LLMProviderModel).where(LLMProviderModel.name == llm_provider.name) @@ -82,6 +83,7 @@ def upsert_llm_provider( existing_llm_provider.model_names = llm_provider.model_names existing_llm_provider.is_public = llm_provider.is_public existing_llm_provider.display_model_names = llm_provider.display_model_names + existing_llm_provider.deployment_name = llm_provider.deployment_name if not existing_llm_provider.id: # If its not already in the db, we need to generate an ID by flushing @@ -93,10 +95,11 @@ def upsert_llm_provider( group_ids=llm_provider.groups, db_session=db_session, ) + full_llm_provider = FullLLMProvider.from_model(existing_llm_provider) db_session.commit() - return FullLLMProvider.from_model(existing_llm_provider) + return full_llm_provider def fetch_existing_embedding_providers( diff --git a/backend/danswer/db/models.py b/backend/danswer/db/models.py index 79d8206586b..4e1970a7bd2 100644 --- a/backend/danswer/db/models.py +++ b/backend/danswer/db/models.py @@ -5,9 +5,12 @@ from typing import Literal from typing import NotRequired from typing import Optional +from uuid import uuid4 from typing_extensions import TypedDict # noreorder from uuid import UUID +from sqlalchemy.dialects.postgresql import UUID as PGUUID + from fastapi_users_db_sqlalchemy import SQLAlchemyBaseOAuthAccountTableUUID from fastapi_users_db_sqlalchemy import SQLAlchemyBaseUserTableUUID from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyBaseAccessTokenTableUUID @@ -39,6 +42,7 @@ from danswer.configs.constants import DocumentSource from danswer.configs.constants import FileOrigin from danswer.configs.constants import MessageType +from danswer.db.enums 
import AccessType from danswer.configs.constants import NotificationType from danswer.configs.constants import SearchFeedbackType from danswer.configs.constants import TokenRateLimitScope @@ -49,13 +53,14 @@ from danswer.db.enums import IndexModelStatus from danswer.db.enums import TaskStatus from danswer.db.pydantic_type import PydanticType -from danswer.dynamic_configs.interface import JSON_ro +from danswer.utils.special_types import JSON_ro from danswer.file_store.models import FileDescriptor from danswer.llm.override_models import LLMOverride from danswer.llm.override_models import PromptOverride -from danswer.search.enums import RecencyBiasSetting +from danswer.context.search.enums import RecencyBiasSetting from danswer.utils.encryption import decrypt_bytes_to_string from danswer.utils.encryption import encrypt_string_to_bytes +from danswer.utils.headers import HeaderItemDict from shared_configs.enums import EmbeddingProvider from shared_configs.enums import RerankerProvider @@ -108,7 +113,7 @@ class OAuthAccount(SQLAlchemyBaseOAuthAccountTableUUID, Base): class User(SQLAlchemyBaseUserTableUUID, Base): oauth_accounts: Mapped[list[OAuthAccount]] = relationship( - "OAuthAccount", lazy="joined" + "OAuthAccount", lazy="joined", cascade="all, delete-orphan" ) role: Mapped[UserRole] = mapped_column( Enum(UserRole, native_enum=False, default=UserRole.BASIC) @@ -121,8 +126,17 @@ class User(SQLAlchemyBaseUserTableUUID, Base): # if specified, controls the assistants that are shown to the user + their order # if not specified, all assistants are shown - chosen_assistants: Mapped[list[int]] = mapped_column( - postgresql.JSONB(), nullable=False, default=[-2, -1, 0] + chosen_assistants: Mapped[list[int] | None] = mapped_column( + postgresql.JSONB(), nullable=True, default=None + ) + visible_assistants: Mapped[list[int]] = mapped_column( + postgresql.JSONB(), nullable=False, default=[] + ) + hidden_assistants: Mapped[list[int]] = mapped_column( + postgresql.JSONB(), nullable=False, default=[] + ) + recent_assistants: Mapped[list[dict]] = mapped_column( + postgresql.JSONB(), nullable=False, default=list, server_default="[]" ) oidc_expiry: Mapped[datetime.datetime] = mapped_column( @@ -157,8 +171,11 @@ class User(SQLAlchemyBaseUserTableUUID, Base): notifications: Mapped[list["Notification"]] = relationship( "Notification", back_populates="user" ) - # Whether the user has logged in via web. 
False if user has only used Danswer through Slack bot - has_web_login: Mapped[bool] = mapped_column(Boolean, default=True) + cc_pairs: Mapped[list["ConnectorCredentialPair"]] = relationship( + "ConnectorCredentialPair", + back_populates="creator", + primaryjoin="User.id == foreign(ConnectorCredentialPair.creator_id)", + ) class InputPrompt(Base): @@ -170,7 +187,9 @@ class InputPrompt(Base): active: Mapped[bool] = mapped_column(Boolean) user: Mapped[User | None] = relationship("User", back_populates="input_prompts") is_public: Mapped[bool] = mapped_column(Boolean, nullable=False, default=True) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) class InputPrompt__User(Base): @@ -214,12 +233,17 @@ class Notification(Base): notif_type: Mapped[NotificationType] = mapped_column( Enum(NotificationType, native_enum=False) ) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) dismissed: Mapped[bool] = mapped_column(Boolean, default=False) last_shown: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) first_shown: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) user: Mapped[User] = relationship("User", back_populates="notifications") + additional_data: Mapped[dict | None] = mapped_column( + postgresql.JSONB(), nullable=True + ) """ @@ -249,7 +273,7 @@ class Persona__User(Base): persona_id: Mapped[int] = mapped_column(ForeignKey("persona.id"), primary_key=True) user_id: Mapped[UUID | None] = mapped_column( - ForeignKey("user.id"), primary_key=True, nullable=True + ForeignKey("user.id", ondelete="CASCADE"), primary_key=True, nullable=True ) @@ -260,7 +284,7 @@ class DocumentSet__User(Base): ForeignKey("document_set.id"), primary_key=True ) user_id: Mapped[UUID | None] = mapped_column( - ForeignKey("user.id"), primary_key=True, nullable=True + ForeignKey("user.id", ondelete="CASCADE"), primary_key=True, nullable=True ) @@ -326,11 +350,11 @@ class StandardAnswer__StandardAnswerCategory(Base): ) -class SlackBotConfig__StandardAnswerCategory(Base): - __tablename__ = "slack_bot_config__standard_answer_category" +class SlackChannelConfig__StandardAnswerCategory(Base): + __tablename__ = "slack_channel_config__standard_answer_category" - slack_bot_config_id: Mapped[int] = mapped_column( - ForeignKey("slack_bot_config.id"), primary_key=True + slack_channel_config_id: Mapped[int] = mapped_column( + ForeignKey("slack_channel_config.id"), primary_key=True ) standard_answer_category_id: Mapped[int] = mapped_column( ForeignKey("standard_answer_category.id"), primary_key=True @@ -384,15 +408,34 @@ class ConnectorCredentialPair(Base): # controls whether the documents indexed by this CC pair are visible to all # or if they are only visible to those with that are given explicit access # (e.g. via owning the credential or being a part of a group that is given access) - is_public: Mapped[bool] = mapped_column( - Boolean, - default=True, - nullable=False, + access_type: Mapped[AccessType] = mapped_column( + Enum(AccessType, native_enum=False), nullable=False + ) + + # special info needed for the auto-sync feature. The exact structure depends on the + + # source type (defined in the connector's `source` field) + # E.g. 
for google_drive perm sync: + # {"customer_id": "123567", "company_domain": "@danswer.ai"} + auto_sync_options: Mapped[dict[str, Any] | None] = mapped_column( + postgresql.JSONB(), nullable=True + ) + last_time_perm_sync: Mapped[datetime.datetime | None] = mapped_column( + DateTime(timezone=True), nullable=True + ) + last_time_external_group_sync: Mapped[datetime.datetime | None] = mapped_column( + DateTime(timezone=True), nullable=True ) # Time finished, not used for calculating backend jobs which uses time started (created) last_successful_index_time: Mapped[datetime.datetime | None] = mapped_column( DateTime(timezone=True), default=None ) + + # last successful prune + last_pruned: Mapped[datetime.datetime | None] = mapped_column( + DateTime(timezone=True), nullable=True, index=True + ) + total_docs_indexed: Mapped[int] = mapped_column(Integer, default=0) connector: Mapped["Connector"] = relationship( @@ -415,9 +458,18 @@ class ConnectorCredentialPair(Base): "IndexAttempt", back_populates="connector_credential_pair" ) + # the user id of the user that created this cc pair + creator_id: Mapped[UUID | None] = mapped_column(nullable=True) + creator: Mapped["User"] = relationship( + "User", + back_populates="cc_pairs", + primaryjoin="foreign(ConnectorCredentialPair.creator_id) == remote(User.id)", + ) + class Document(Base): __tablename__ = "document" + # NOTE: if more sensitive data is added here for display, make sure to add user/group permission # this should correspond to the ID of the document # (as is passed around in Danswer) @@ -461,7 +513,18 @@ class Document(Base): secondary_owners: Mapped[list[str] | None] = mapped_column( postgresql.ARRAY(String), nullable=True ) - # TODO if more sensitive data is added here for display, make sure to add user/group permission + # Permission sync columns + # Email addresses are saved at the document level for externally synced permissions + # This is becuase the normal flow of assigning permissions is through the cc_pair + # doesn't apply here + external_user_emails: Mapped[list[str] | None] = mapped_column( + postgresql.ARRAY(String), nullable=True + ) + # These group ids have been prefixed by the source type + external_user_group_ids: Mapped[list[str] | None] = mapped_column( + postgresql.ARRAY(String), nullable=True + ) + is_public: Mapped[bool] = mapped_column(Boolean, default=False) retrieval_feedbacks: Mapped[list["DocumentRetrievalFeedback"]] = relationship( "DocumentRetrievalFeedback", back_populates="document" @@ -541,7 +604,9 @@ class Credential(Base): id: Mapped[int] = mapped_column(primary_key=True) credential_json: Mapped[dict[str, Any]] = mapped_column(EncryptedJson()) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) # if `true`, then all Admins will have access to the credential admin_public: Mapped[bool] = mapped_column(Boolean, default=True) time_created: Mapped[datetime.datetime] = mapped_column( @@ -574,6 +639,7 @@ class SearchSettings(Base): normalize: Mapped[bool] = mapped_column(Boolean) query_prefix: Mapped[str | None] = mapped_column(String, nullable=True) passage_prefix: Mapped[str | None] = mapped_column(String, nullable=True) + status: Mapped[IndexModelStatus] = mapped_column( Enum(IndexModelStatus, native_enum=False) ) @@ -629,6 +695,20 @@ def __repr__(self) -> str: return f"" + @property + def api_version(self) -> str | None: + return ( + self.cloud_provider.api_version if 
self.cloud_provider is not None else None + ) + + @property + def deployment_name(self) -> str | None: + return ( + self.cloud_provider.deployment_name + if self.cloud_provider is not None + else None + ) + @property def api_url(self) -> str | None: return self.cloud_provider.api_url if self.cloud_provider is not None else None @@ -671,9 +751,10 @@ class IndexAttempt(Base): full_exception_trace: Mapped[str | None] = mapped_column(Text, default=None) # Nullable because in the past, we didn't allow swapping out embedding models live search_settings_id: Mapped[int] = mapped_column( - ForeignKey("search_settings.id"), - nullable=False, + ForeignKey("search_settings.id", ondelete="SET NULL"), + nullable=True, ) + time_created: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), @@ -693,7 +774,7 @@ class IndexAttempt(Base): "ConnectorCredentialPair", back_populates="index_attempts" ) - search_settings: Mapped[SearchSettings] = relationship( + search_settings: Mapped[SearchSettings | None] = relationship( "SearchSettings", back_populates="index_attempts" ) @@ -854,18 +935,27 @@ class ToolCall(Base): tool_arguments: Mapped[dict[str, JSON_ro]] = mapped_column(postgresql.JSONB()) tool_result: Mapped[JSON_ro] = mapped_column(postgresql.JSONB()) - message_id: Mapped[int] = mapped_column(ForeignKey("chat_message.id")) + message_id: Mapped[int | None] = mapped_column( + ForeignKey("chat_message.id"), nullable=False + ) + # Update the relationship message: Mapped["ChatMessage"] = relationship( - "ChatMessage", back_populates="tool_calls" + "ChatMessage", + back_populates="tool_call", + uselist=False, ) class ChatSession(Base): __tablename__ = "chat_session" - id: Mapped[int] = mapped_column(primary_key=True) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + id: Mapped[UUID] = mapped_column( + PGUUID(as_uuid=True), primary_key=True, default=uuid4 + ) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) persona_id: Mapped[int | None] = mapped_column( ForeignKey("persona.id"), nullable=True ) @@ -932,7 +1022,9 @@ class ChatMessage(Base): __tablename__ = "chat_message" id: Mapped[int] = mapped_column(primary_key=True) - chat_session_id: Mapped[int] = mapped_column(ForeignKey("chat_session.id")) + chat_session_id: Mapped[UUID] = mapped_column( + PGUUID(as_uuid=True), ForeignKey("chat_session.id") + ) alternate_assistant_id = mapped_column( Integer, ForeignKey("persona.id"), nullable=True @@ -982,12 +1074,13 @@ class ChatMessage(Base): secondary=ChatMessage__SearchDoc.__table__, back_populates="chat_messages", ) - # NOTE: Should always be attached to the `assistant` message. 
- # represents the tool calls used to generate this message - tool_calls: Mapped[list["ToolCall"]] = relationship( + + tool_call: Mapped["ToolCall"] = relationship( "ToolCall", back_populates="message", + uselist=False, ) + standard_answers: Mapped[list["StandardAnswer"]] = relationship( "StandardAnswer", secondary=ChatMessage__StandardAnswer.__table__, @@ -1002,7 +1095,9 @@ class ChatFolder(Base): id: Mapped[int] = mapped_column(primary_key=True) # Only null if auth is off - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) name: Mapped[str | None] = mapped_column(String, nullable=True) display_priority: Mapped[int] = mapped_column(Integer, nullable=True, default=0) @@ -1086,7 +1181,7 @@ class LLMProvider(Base): default_model_name: Mapped[str] = mapped_column(String) fast_default_model_name: Mapped[str | None] = mapped_column(String, nullable=True) - # Models to actually disp;aly to users + # Models to actually display to users # If nulled out, we assume in the application logic we should present all display_model_names: Mapped[list[str] | None] = mapped_column( postgresql.ARRAY(String), nullable=True @@ -1098,6 +1193,8 @@ class LLMProvider(Base): postgresql.ARRAY(String), nullable=True ) + deployment_name: Mapped[str | None] = mapped_column(String, nullable=True) + # should only be set for a single provider is_default_provider: Mapped[bool | None] = mapped_column(Boolean, unique=True) # EE only @@ -1117,6 +1214,9 @@ class CloudEmbeddingProvider(Base): ) api_url: Mapped[str | None] = mapped_column(String, nullable=True) api_key: Mapped[str | None] = mapped_column(EncryptedString()) + api_version: Mapped[str | None] = mapped_column(String, nullable=True) + deployment_name: Mapped[str | None] = mapped_column(String, nullable=True) + search_settings: Mapped[list["SearchSettings"]] = relationship( "SearchSettings", back_populates="cloud_provider", @@ -1133,7 +1233,9 @@ class DocumentSet(Base): id: Mapped[int] = mapped_column(Integer, primary_key=True) name: Mapped[str] = mapped_column(String, unique=True) description: Mapped[str] = mapped_column(String) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) # Whether changes to the document set have been propagated is_up_to_date: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) # If `False`, then the document set is not visible to users who are not explicitly @@ -1177,7 +1279,9 @@ class Prompt(Base): __tablename__ = "prompt" id: Mapped[int] = mapped_column(primary_key=True) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) name: Mapped[str] = mapped_column(String) description: Mapped[str] = mapped_column(String) system_prompt: Mapped[str] = mapped_column(Text) @@ -1212,9 +1316,13 @@ class Tool(Base): openapi_schema: Mapped[dict[str, Any] | None] = mapped_column( postgresql.JSONB(), nullable=True ) - + custom_headers: Mapped[list[HeaderItemDict] | None] = mapped_column( + postgresql.JSONB(), nullable=True + ) # user who created / owns the tool. Will be None for built-in tools. 
- user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) user: Mapped[User | None] = relationship("User", back_populates="custom_tools") # Relationship to Persona through the association table @@ -1230,7 +1338,6 @@ class StarterMessage(TypedDict): in Postgres""" name: str - description: str message: str @@ -1238,7 +1345,9 @@ class Persona(Base): __tablename__ = "persona" id: Mapped[int] = mapped_column(primary_key=True) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) + user_id: Mapped[UUID | None] = mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), nullable=True + ) name: Mapped[str] = mapped_column(String) description: Mapped[str] = mapped_column(String) # Number of chunks to pass to the LLM for generation. @@ -1254,6 +1363,9 @@ class Persona(Base): recency_bias: Mapped[RecencyBiasSetting] = mapped_column( Enum(RecencyBiasSetting, native_enum=False) ) + category_id: Mapped[int | None] = mapped_column( + ForeignKey("persona_category.id"), nullable=True + ) # Allows the Persona to specify a different LLM version than is controlled # globablly via env variables. For flexibility, validity is not currently enforced # NOTE: only is applied on the actual response generation - is not used for things like @@ -1267,9 +1379,18 @@ class Persona(Base): starter_messages: Mapped[list[StarterMessage] | None] = mapped_column( postgresql.JSONB(), nullable=True ) - # Default personas are configured via backend during deployment + search_start_date: Mapped[datetime.datetime | None] = mapped_column( + DateTime(timezone=True), default=None + ) + # Built-in personas are configured via backend during deployment # Treated specially (cannot be user edited etc.) - default_persona: Mapped[bool] = mapped_column(Boolean, default=False) + builtin_persona: Mapped[bool] = mapped_column(Boolean, default=False) + + # Default personas are personas created by admins and are automatically added + # to all users' assistants list. 
+ is_default_persona: Mapped[bool] = mapped_column( + Boolean, default=False, nullable=False + ) # controls whether the persona is available to be selected by users is_visible: Mapped[bool] = mapped_column(Boolean, default=True) # controls the ordering of personas in the UI @@ -1316,18 +1437,32 @@ class Persona(Base): secondary="persona__user_group", viewonly=True, ) + category: Mapped["PersonaCategory"] = relationship( + "PersonaCategory", back_populates="personas" + ) # Default personas loaded via yaml cannot have the same name __table_args__ = ( Index( - "_default_persona_name_idx", + "_builtin_persona_name_idx", "name", unique=True, - postgresql_where=(default_persona == True), # noqa: E712 + postgresql_where=(builtin_persona == True), # noqa: E712 ), ) +class PersonaCategory(Base): + __tablename__ = "persona_category" + + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(String, unique=True) + description: Mapped[str | None] = mapped_column(String, nullable=True) + personas: Mapped[list["Persona"]] = relationship( + "Persona", back_populates="category" + ) + + AllowedAnswerFilters = ( Literal["well_answered_postfilter"] | Literal["questionmark_prefilter"] ) @@ -1337,7 +1472,7 @@ class ChannelConfig(TypedDict): """NOTE: is a `TypedDict` so it can be used as a type hint for a JSONB column in Postgres""" - channel_names: list[str] + channel_name: str respond_tag_only: NotRequired[bool] # defaults to False respond_to_bots: NotRequired[bool] # defaults to False respond_member_group_list: NotRequired[list[str]] @@ -1345,6 +1480,7 @@ class ChannelConfig(TypedDict): # If None then no follow up # If empty list, follow up with no tags follow_up_tags: NotRequired[list[str]] + show_continue_in_web_ui: NotRequired[bool] # defaults to False class SlackBotResponseType(str, PyEnum): @@ -1352,10 +1488,11 @@ class SlackBotResponseType(str, PyEnum): CITATIONS = "citations" -class SlackBotConfig(Base): - __tablename__ = "slack_bot_config" +class SlackChannelConfig(Base): + __tablename__ = "slack_channel_config" id: Mapped[int] = mapped_column(primary_key=True) + slack_bot_id: Mapped[int] = mapped_column(ForeignKey("slack_bot.id"), nullable=True) persona_id: Mapped[int | None] = mapped_column( ForeignKey("persona.id"), nullable=True ) @@ -1372,10 +1509,30 @@ class SlackBotConfig(Base): ) persona: Mapped[Persona | None] = relationship("Persona") + slack_bot: Mapped["SlackBot"] = relationship( + "SlackBot", + back_populates="slack_channel_configs", + ) standard_answer_categories: Mapped[list["StandardAnswerCategory"]] = relationship( "StandardAnswerCategory", - secondary=SlackBotConfig__StandardAnswerCategory.__table__, - back_populates="slack_bot_configs", + secondary=SlackChannelConfig__StandardAnswerCategory.__table__, + back_populates="slack_channel_configs", + ) + + +class SlackBot(Base): + __tablename__ = "slack_bot" + + id: Mapped[int] = mapped_column(primary_key=True) + name: Mapped[str] = mapped_column(String) + enabled: Mapped[bool] = mapped_column(Boolean, default=True) + + bot_token: Mapped[str] = mapped_column(EncryptedString(), unique=True) + app_token: Mapped[str] = mapped_column(EncryptedString(), unique=True) + + slack_channel_configs: Mapped[list[SlackChannelConfig]] = relationship( + "SlackChannelConfig", + back_populates="slack_bot", ) @@ -1434,7 +1591,9 @@ class SamlAccount(Base): __tablename__ = "saml" id: Mapped[int] = mapped_column(primary_key=True) - user_id: Mapped[int] = mapped_column(ForeignKey("user.id"), unique=True) + user_id: Mapped[int] = 
mapped_column( + ForeignKey("user.id", ondelete="CASCADE"), unique=True + ) encrypted_cookie: Mapped[str] = mapped_column(Text, unique=True) expires_at: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True)) updated_at: Mapped[datetime.datetime] = mapped_column( @@ -1453,7 +1612,7 @@ class User__UserGroup(Base): ForeignKey("user_group.id"), primary_key=True ) user_id: Mapped[UUID | None] = mapped_column( - ForeignKey("user.id"), primary_key=True, nullable=True + ForeignKey("user.id", ondelete="CASCADE"), primary_key=True, nullable=True ) @@ -1612,9 +1771,9 @@ class StandardAnswerCategory(Base): secondary=StandardAnswer__StandardAnswerCategory.__table__, back_populates="categories", ) - slack_bot_configs: Mapped[list["SlackBotConfig"]] = relationship( - "SlackBotConfig", - secondary=SlackBotConfig__StandardAnswerCategory.__table__, + slack_channel_configs: Mapped[list["SlackChannelConfig"]] = relationship( + "SlackChannelConfig", + secondary=SlackChannelConfig__StandardAnswerCategory.__table__, back_populates="standard_answer_categories", ) @@ -1654,92 +1813,21 @@ class StandardAnswer(Base): """Tables related to Permission Sync""" -class PermissionSyncStatus(str, PyEnum): - IN_PROGRESS = "in_progress" - SUCCESS = "success" - FAILED = "failed" - - -class PermissionSyncJobType(str, PyEnum): - USER_LEVEL = "user_level" - GROUP_LEVEL = "group_level" - - -class PermissionSyncRun(Base): - """Represents one run of a permission sync job. For some given cc_pair, it is either sync-ing - the users or it is sync-ing the groups""" - - __tablename__ = "permission_sync_run" - - id: Mapped[int] = mapped_column(Integer, primary_key=True) - # Not strictly needed but makes it easy to use without fetching from cc_pair - source_type: Mapped[DocumentSource] = mapped_column( - Enum(DocumentSource, native_enum=False) - ) - # Currently all sync jobs are handled as a group permission sync or a user permission sync - update_type: Mapped[PermissionSyncJobType] = mapped_column( - Enum(PermissionSyncJobType) - ) - cc_pair_id: Mapped[int | None] = mapped_column( - ForeignKey("connector_credential_pair.id"), nullable=True - ) - status: Mapped[PermissionSyncStatus] = mapped_column(Enum(PermissionSyncStatus)) - error_msg: Mapped[str | None] = mapped_column(Text, default=None) - updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now(), onupdate=func.now() - ) - - cc_pair: Mapped[ConnectorCredentialPair] = relationship("ConnectorCredentialPair") - - -class ExternalPermission(Base): +class User__ExternalUserGroupId(Base): """Maps user info both internal and external to the name of the external group This maps the user to all of their external groups so that the external group name can be attached to the ACL list matching during query time. 
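As a rough illustration of how this mapping pairs with the `external_user_emails` / `external_user_group_ids` columns added to `Document` above, the query-time check boils down to an array overlap test. This is only a sketch of the idea, not the project's actual permission-filtering code (model import paths are assumed from this diff):

    from uuid import UUID

    from sqlalchemy import or_, select
    from sqlalchemy.orm import Session

    from danswer.db.models import Document, User__ExternalUserGroupId  # assumed paths


    def documents_visible_to(
        db_session: Session, user_id: UUID, user_email: str
    ) -> list[Document]:
        external_group_ids = list(
            db_session.scalars(
                select(User__ExternalUserGroupId.external_user_group_id).where(
                    User__ExternalUserGroupId.user_id == user_id
                )
            ).all()
        )
        clauses = [
            Document.is_public.is_(True),
            # array "contains" check against the stored external emails
            Document.external_user_emails.contains([user_email]),
        ]
        if external_group_ids:
            # source-prefixed group ids, matched with an array overlap
            clauses.append(
                Document.external_user_group_ids.overlap(external_group_ids)
            )
        return list(db_session.scalars(select(Document).where(or_(*clauses))).all())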
User level permissions can be handled by directly adding the Danswer user to the doc ACL list""" - __tablename__ = "external_permission" - - id: Mapped[int] = mapped_column(Integer, primary_key=True) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) - # Email is needed because we want to keep track of users not in Danswer to simplify process - # when the user joins - user_email: Mapped[str] = mapped_column(String) - source_type: Mapped[DocumentSource] = mapped_column( - Enum(DocumentSource, native_enum=False) - ) - external_permission_group: Mapped[str] = mapped_column(String) - user = relationship("User") - - -class EmailToExternalUserCache(Base): - """A way to map users IDs in the external tool to a user in Danswer or at least an email for - when the user joins. Used as a cache for when fetching external groups which have their own - user ids, this can easily be mapped back to users already known in Danswer without needing - to call external APIs to get the user emails. - - This way when groups are updated in the external tool and we need to update the mapping of - internal users to the groups, we can sync the internal users to the external groups they are - part of using this. - - Ie. User Chris is part of groups alpha, beta, and we can update this if Chris is no longer - part of alpha in some external tool. - """ - - __tablename__ = "email_to_external_user_cache" + __tablename__ = "user__external_user_group_id" - id: Mapped[int] = mapped_column(Integer, primary_key=True) - external_user_id: Mapped[str] = mapped_column(String) - user_id: Mapped[UUID | None] = mapped_column(ForeignKey("user.id"), nullable=True) - # Email is needed because we want to keep track of users not in Danswer to simplify process - # when the user joins - user_email: Mapped[str] = mapped_column(String) - source_type: Mapped[DocumentSource] = mapped_column( - Enum(DocumentSource, native_enum=False) + user_id: Mapped[UUID] = mapped_column(ForeignKey("user.id"), primary_key=True) + # These group ids have been prefixed by the source type + external_user_group_id: Mapped[str] = mapped_column(String, primary_key=True) + cc_pair_id: Mapped[int] = mapped_column( + ForeignKey("connector_credential_pair.id"), primary_key=True ) - user = relationship("User") - class UsageReport(Base): """This stores metadata about usage reports generated by admin including user who generated @@ -1754,7 +1842,7 @@ class UsageReport(Base): # if None, report was auto-generated requestor_user_id: Mapped[UUID | None] = mapped_column( - ForeignKey("user.id"), nullable=True + ForeignKey("user.id", ondelete="CASCADE"), nullable=True ) time_created: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() @@ -1766,3 +1854,23 @@ class UsageReport(Base): requestor = relationship("User") file = relationship("PGFileStore") + + +""" +Multi-tenancy related tables +""" + + +class PublicBase(DeclarativeBase): + __abstract__ = True + + +class UserTenantMapping(Base): + __tablename__ = "user_tenant_mapping" + __table_args__ = ( + UniqueConstraint("email", "tenant_id", name="uq_user_tenant"), + {"schema": "public"}, + ) + + email: Mapped[str] = mapped_column(String, nullable=False, primary_key=True) + tenant_id: Mapped[str] = mapped_column(String, nullable=False) diff --git a/backend/danswer/db/notification.py b/backend/danswer/db/notification.py index 61586208c69..a6cdf989177 100644 --- a/backend/danswer/db/notification.py +++ b/backend/danswer/db/notification.py @@ -1,23 +1,47 @@ +from 
uuid import UUID + from sqlalchemy import select from sqlalchemy.orm import Session from sqlalchemy.sql import func +from danswer.auth.schemas import UserRole from danswer.configs.constants import NotificationType from danswer.db.models import Notification from danswer.db.models import User def create_notification( - user: User | None, + user_id: UUID | None, notif_type: NotificationType, db_session: Session, + additional_data: dict | None = None, ) -> Notification: + # Check if an undismissed notification of the same type and data exists + existing_notification = ( + db_session.query(Notification) + .filter_by( + user_id=user_id, + notif_type=notif_type, + dismissed=False, + ) + .filter(Notification.additional_data == additional_data) + .first() + ) + + if existing_notification: + # Update the last_shown timestamp + existing_notification.last_shown = func.now() + db_session.commit() + return existing_notification + + # Create a new notification if none exists notification = Notification( - user_id=user.id if user else None, + user_id=user_id, notif_type=notif_type, dismissed=False, last_shown=func.now(), first_shown=func.now(), + additional_data=additional_data, ) db_session.add(notification) db_session.commit() @@ -31,7 +55,9 @@ def get_notification_by_id( notif = db_session.get(Notification, notification_id) if not notif: raise ValueError(f"No notification found with id {notification_id}") - if notif.user_id != user_id: + if notif.user_id != user_id and not ( + notif.user_id is None and user is not None and user.role == UserRole.ADMIN + ): raise PermissionError( f"User {user_id} is not authorized to access notification {notification_id}" ) diff --git a/backend/danswer/db/persona.py b/backend/danswer/db/persona.py index 6cb93ad9fc0..b71df22181e 100644 --- a/backend/danswer/db/persona.py +++ b/backend/danswer/db/persona.py @@ -1,4 +1,5 @@ from collections.abc import Sequence +from datetime import datetime from functools import lru_cache from uuid import UUID @@ -19,19 +20,20 @@ from danswer.configs.chat_configs import BING_API_KEY from danswer.configs.chat_configs import CONTEXT_CHUNKS_ABOVE from danswer.configs.chat_configs import CONTEXT_CHUNKS_BELOW +from danswer.context.search.enums import RecencyBiasSetting from danswer.db.constants import SLACK_BOT_PERSONA_PREFIX from danswer.db.engine import get_sqlalchemy_engine from danswer.db.models import DocumentSet from danswer.db.models import Persona from danswer.db.models import Persona__User from danswer.db.models import Persona__UserGroup +from danswer.db.models import PersonaCategory from danswer.db.models import Prompt from danswer.db.models import StarterMessage from danswer.db.models import Tool from danswer.db.models import User from danswer.db.models import User__UserGroup from danswer.db.models import UserGroup -from danswer.search.enums import RecencyBiasSetting from danswer.server.features.persona.models import CreatePersonaRequest from danswer.server.features.persona.models import PersonaSnapshot from danswer.utils.logger import setup_logger @@ -111,6 +113,31 @@ def fetch_persona_by_id( return persona +def get_best_persona_id_for_user( + db_session: Session, user: User | None, persona_id: int | None = None +) -> int | None: + if persona_id is not None: + stmt = select(Persona).where(Persona.id == persona_id).distinct() + stmt = _add_user_filters( + stmt=stmt, + user=user, + # We don't want to filter by editable here, we just want to see if the + # persona is usable by the user + get_editable=False, + ) + persona = 
db_session.scalars(stmt).one_or_none() + if persona: + return persona.id + + # If the persona is not found, or the slack bot is using doc sets instead of personas, + # we need to find the best persona for the user + # This is the persona with the highest display priority that the user has access to + stmt = select(Persona).order_by(Persona.display_priority.desc()).distinct() + stmt = _add_user_filters(stmt=stmt, user=user, get_editable=True) + persona = db_session.scalars(stmt).one_or_none() + return persona.id if persona else None + + def _get_persona_by_name( persona_name: str, user: User | None, db_session: Session ) -> Persona | None: @@ -178,6 +205,7 @@ def create_update_persona( except ValueError as e: logger.exception("Failed to create persona") raise HTTPException(status_code=400, detail=str(e)) + return PersonaSnapshot.from_model(persona) @@ -256,9 +284,8 @@ def get_personas( ) -> Sequence[Persona]: stmt = select(Persona).distinct() stmt = _add_user_filters(stmt=stmt, user=user, get_editable=get_editable) - if not include_default: - stmt = stmt.where(Persona.default_persona.is_(False)) + stmt = stmt.where(Persona.builtin_persona.is_(False)) if not include_slack_bot_personas: stmt = stmt.where(not_(Persona.name.startswith(SLACK_BOT_PERSONA_PREFIX))) if not include_deleted: @@ -306,7 +333,7 @@ def mark_delete_persona_by_name( ) -> None: stmt = ( update(Persona) - .where(Persona.name == persona_name, Persona.default_persona == is_default) + .where(Persona.name == persona_name, Persona.builtin_persona == is_default) .values(deleted=True) ) @@ -326,7 +353,6 @@ def update_all_personas_display_priority( for persona in personas: persona.display_priority = display_priority_map[persona.id] - db_session.commit() @@ -389,6 +415,9 @@ def upsert_prompt( return prompt +# NOTE: This operation cannot update persona configuration options that +# are core to the persona, such as its display priority and +# whether or not the assistant is a built-in / default assistant def upsert_persona( user: User | None, name: str, @@ -406,7 +435,6 @@ def upsert_persona( document_set_ids: list[int] | None = None, tool_ids: list[int] | None = None, persona_id: int | None = None, - default_persona: bool = False, commit: bool = True, icon_color: str | None = None, icon_shape: int | None = None, @@ -414,6 +442,10 @@ def upsert_persona( display_priority: int | None = None, is_visible: bool = True, remove_image: bool | None = None, + search_start_date: datetime | None = None, + builtin_persona: bool = False, + is_default_persona: bool = False, + category_id: int | None = None, chunks_above: int = CONTEXT_CHUNKS_ABOVE, chunks_below: int = CONTEXT_CHUNKS_BELOW, ) -> Persona: @@ -454,8 +486,8 @@ def upsert_persona( validate_persona_tools(tools) if persona: - if not default_persona and persona.default_persona: - raise ValueError("Cannot update default persona with non-default.") + if persona.builtin_persona and not builtin_persona: + raise ValueError("Cannot update builtin persona with non-builtin.") # this checks if the user has permission to edit the persona persona = fetch_persona_by_id( @@ -470,7 +502,6 @@ def upsert_persona( persona.llm_relevance_filter = llm_relevance_filter persona.llm_filter_extraction = llm_filter_extraction persona.recency_bias = recency_bias - persona.default_persona = default_persona persona.llm_model_provider_override = llm_model_provider_override persona.llm_model_version_override = llm_model_version_override persona.starter_messages = starter_messages @@ -480,9 +511,9 @@ def upsert_persona( 
persona.icon_shape = icon_shape if remove_image or uploaded_image_id: persona.uploaded_image_id = uploaded_image_id - persona.display_priority = display_priority persona.is_visible = is_visible - + persona.search_start_date = search_start_date + persona.category_id = category_id # Do not delete any associations manually added unless # a new updated list is provided if document_sets is not None: @@ -509,7 +540,7 @@ def upsert_persona( llm_relevance_filter=llm_relevance_filter, llm_filter_extraction=llm_filter_extraction, recency_bias=recency_bias, - default_persona=default_persona, + builtin_persona=builtin_persona, prompts=prompts or [], document_sets=document_sets or [], llm_model_provider_override=llm_model_provider_override, @@ -521,6 +552,9 @@ def upsert_persona( uploaded_image_id=uploaded_image_id, display_priority=display_priority, is_visible=is_visible, + search_start_date=search_start_date, + is_default_persona=is_default_persona, + category_id=category_id, ) db_session.add(persona) @@ -550,7 +584,7 @@ def delete_old_default_personas( Need a more graceful fix later or those need to never have IDs""" stmt = ( update(Persona) - .where(Persona.default_persona, Persona.id > 0) + .where(Persona.builtin_persona, Persona.id > 0) .values(deleted=True, name=func.concat(Persona.name, "_old")) ) @@ -732,9 +766,44 @@ def delete_persona_by_name( persona_name: str, db_session: Session, is_default: bool = True ) -> None: stmt = delete(Persona).where( - Persona.name == persona_name, Persona.default_persona == is_default + Persona.name == persona_name, Persona.builtin_persona == is_default ) db_session.execute(stmt) + db_session.commit() + + +def get_assistant_categories(db_session: Session) -> list[PersonaCategory]: + return db_session.query(PersonaCategory).all() + + +def create_assistant_category( + db_session: Session, name: str, description: str +) -> PersonaCategory: + category = PersonaCategory(name=name, description=description) + db_session.add(category) + db_session.commit() + return category + + +def update_persona_category( + category_id: int, + category_description: str, + category_name: str, + db_session: Session, +) -> None: + persona_category = ( + db_session.query(PersonaCategory) + .filter(PersonaCategory.id == category_id) + .one_or_none() + ) + if persona_category is None: + raise ValueError(f"Persona category with ID {category_id} does not exist") + persona_category.description = category_description + persona_category.name = category_name + db_session.commit() + +def delete_persona_category(category_id: int, db_session: Session) -> None: + db_session.query(PersonaCategory).filter(PersonaCategory.id == category_id).delete() db_session.commit() diff --git a/backend/danswer/db/search_settings.py b/backend/danswer/db/search_settings.py index e3f35e31007..4f437eaae53 100644 --- a/backend/danswer/db/search_settings.py +++ b/backend/danswer/db/search_settings.py @@ -12,7 +12,8 @@ from danswer.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL from danswer.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM from danswer.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS -from danswer.db.engine import get_sqlalchemy_engine +from danswer.context.search.models import SavedSearchSettings +from danswer.db.engine import get_session_with_default_tenant from danswer.db.llm import fetch_embedding_provider from danswer.db.models import CloudEmbeddingProvider from danswer.db.models import IndexAttempt @@ -21,7 +22,6 @@ from danswer.indexing.models 
import IndexingSetting from danswer.natural_language_processing.search_nlp_models import clean_model_name from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder -from danswer.search.models import SavedSearchSettings from danswer.server.manage.embedding.models import ( CloudEmbeddingProvider as ServerCloudEmbeddingProvider, ) @@ -152,7 +152,7 @@ def get_all_search_settings(db_session: Session) -> list[SearchSettings]: def get_multilingual_expansion(db_session: Session | None = None) -> list[str]: if db_session is None: - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_default_tenant() as db_session: search_settings = get_current_search_settings(db_session) else: search_settings = get_current_search_settings(db_session) diff --git a/backend/danswer/db/slack_bot.py b/backend/danswer/db/slack_bot.py new file mode 100644 index 00000000000..c3d7559b996 --- /dev/null +++ b/backend/danswer/db/slack_bot.py @@ -0,0 +1,76 @@ +from collections.abc import Sequence + +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.db.models import SlackBot + + +def insert_slack_bot( + db_session: Session, + name: str, + enabled: bool, + bot_token: str, + app_token: str, +) -> SlackBot: + slack_bot = SlackBot( + name=name, + enabled=enabled, + bot_token=bot_token, + app_token=app_token, + ) + db_session.add(slack_bot) + db_session.commit() + + return slack_bot + + +def update_slack_bot( + db_session: Session, + slack_bot_id: int, + name: str, + enabled: bool, + bot_token: str, + app_token: str, +) -> SlackBot: + slack_bot = db_session.scalar(select(SlackBot).where(SlackBot.id == slack_bot_id)) + if slack_bot is None: + raise ValueError(f"Unable to find Slack Bot with ID {slack_bot_id}") + + # update the app + slack_bot.name = name + slack_bot.enabled = enabled + slack_bot.bot_token = bot_token + slack_bot.app_token = app_token + + db_session.commit() + + return slack_bot + + +def fetch_slack_bot( + db_session: Session, + slack_bot_id: int, +) -> SlackBot: + slack_bot = db_session.scalar(select(SlackBot).where(SlackBot.id == slack_bot_id)) + if slack_bot is None: + raise ValueError(f"Unable to find Slack Bot with ID {slack_bot_id}") + + return slack_bot + + +def remove_slack_bot( + db_session: Session, + slack_bot_id: int, +) -> None: + slack_bot = fetch_slack_bot( + db_session=db_session, + slack_bot_id=slack_bot_id, + ) + + db_session.delete(slack_bot) + db_session.commit() + + +def fetch_slack_bots(db_session: Session) -> Sequence[SlackBot]: + return db_session.scalars(select(SlackBot)).all() diff --git a/backend/danswer/db/slack_bot_config.py b/backend/danswer/db/slack_channel_config.py similarity index 70% rename from backend/danswer/db/slack_bot_config.py rename to backend/danswer/db/slack_channel_config.py index a37bd18c0ec..00e5965120a 100644 --- a/backend/danswer/db/slack_bot_config.py +++ b/backend/danswer/db/slack_channel_config.py @@ -5,25 +5,25 @@ from sqlalchemy.orm import Session from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT +from danswer.context.search.enums import RecencyBiasSetting from danswer.db.constants import SLACK_BOT_PERSONA_PREFIX from danswer.db.models import ChannelConfig from danswer.db.models import Persona from danswer.db.models import Persona__DocumentSet -from danswer.db.models import SlackBotConfig from danswer.db.models import SlackBotResponseType +from danswer.db.models import SlackChannelConfig from danswer.db.models import User from danswer.db.persona import 
get_default_prompt from danswer.db.persona import mark_persona_as_deleted from danswer.db.persona import upsert_persona -from danswer.search.enums import RecencyBiasSetting from danswer.utils.errors import EERequiredError from danswer.utils.variable_functionality import ( fetch_versioned_implementation_with_fallback, ) -def _build_persona_name(channel_names: list[str]) -> str: - return f"{SLACK_BOT_PERSONA_PREFIX}{'-'.join(channel_names)}" +def _build_persona_name(channel_name: str) -> str: + return f"{SLACK_BOT_PERSONA_PREFIX}{channel_name}" def _cleanup_relationships(db_session: Session, persona_id: int) -> None: @@ -38,9 +38,9 @@ def _cleanup_relationships(db_session: Session, persona_id: int) -> None: db_session.delete(rel) -def create_slack_bot_persona( +def create_slack_channel_persona( db_session: Session, - channel_names: list[str], + channel_name: str, document_set_ids: list[int], existing_persona_id: int | None = None, num_chunks: float = MAX_CHUNKS_FED_TO_CHAT, @@ -48,11 +48,11 @@ def create_slack_bot_persona( ) -> Persona: """NOTE: does not commit changes""" - # create/update persona associated with the slack bot - persona_name = _build_persona_name(channel_names) + # create/update persona associated with the Slack channel + persona_name = _build_persona_name(channel_name) default_prompt = get_default_prompt(db_session) persona = upsert_persona( - user=None, # Slack Bot Personas are not attached to users + user=None, # Slack channel Personas are not attached to users persona_id=existing_persona_id, name=persona_name, description="", @@ -66,7 +66,7 @@ def create_slack_bot_persona( llm_model_version_override=None, starter_messages=None, is_public=True, - default_persona=False, + is_default_persona=False, db_session=db_session, commit=False, ) @@ -78,14 +78,15 @@ def _no_ee_standard_answer_categories(*args: Any, **kwargs: Any) -> list: return [] -def insert_slack_bot_config( +def insert_slack_channel_config( + db_session: Session, + slack_bot_id: int, persona_id: int | None, channel_config: ChannelConfig, response_type: SlackBotResponseType, standard_answer_category_ids: list[int], enable_auto_filters: bool, - db_session: Session, -) -> SlackBotConfig: +) -> SlackChannelConfig: versioned_fetch_standard_answer_categories_by_ids = ( fetch_versioned_implementation_with_fallback( "danswer.db.standard_answer", @@ -110,34 +111,37 @@ def insert_slack_bot_config( f"Some or all categories with ids {standard_answer_category_ids} do not exist" ) - slack_bot_config = SlackBotConfig( + slack_channel_config = SlackChannelConfig( + slack_bot_id=slack_bot_id, persona_id=persona_id, channel_config=channel_config, response_type=response_type, standard_answer_categories=existing_standard_answer_categories, enable_auto_filters=enable_auto_filters, ) - db_session.add(slack_bot_config) + db_session.add(slack_channel_config) db_session.commit() - return slack_bot_config + return slack_channel_config -def update_slack_bot_config( - slack_bot_config_id: int, +def update_slack_channel_config( + db_session: Session, + slack_channel_config_id: int, persona_id: int | None, channel_config: ChannelConfig, response_type: SlackBotResponseType, standard_answer_category_ids: list[int], enable_auto_filters: bool, - db_session: Session, -) -> SlackBotConfig: - slack_bot_config = db_session.scalar( - select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id) +) -> SlackChannelConfig: + slack_channel_config = db_session.scalar( + select(SlackChannelConfig).where( + SlackChannelConfig.id == 
slack_channel_config_id + ) ) - if slack_bot_config is None: + if slack_channel_config is None: raise ValueError( - f"Unable to find slack bot config with ID {slack_bot_config_id}" + f"Unable to find Slack channel config with ID {slack_channel_config_id}" ) versioned_fetch_standard_answer_categories_by_ids = ( @@ -159,25 +163,25 @@ def update_slack_bot_config( ) # get the existing persona id before updating the object - existing_persona_id = slack_bot_config.persona_id + existing_persona_id = slack_channel_config.persona_id # update the config # NOTE: need to do this before cleaning up the old persona or else we # will encounter `violates foreign key constraint` errors - slack_bot_config.persona_id = persona_id - slack_bot_config.channel_config = channel_config - slack_bot_config.response_type = response_type - slack_bot_config.standard_answer_categories = list( + slack_channel_config.persona_id = persona_id + slack_channel_config.channel_config = channel_config + slack_channel_config.response_type = response_type + slack_channel_config.standard_answer_categories = list( existing_standard_answer_categories ) - slack_bot_config.enable_auto_filters = enable_auto_filters + slack_channel_config.enable_auto_filters = enable_auto_filters # if the persona has changed, then clean up the old persona if persona_id != existing_persona_id and existing_persona_id: existing_persona = db_session.scalar( select(Persona).where(Persona.id == existing_persona_id) ) - # if the existing persona was one created just for use with this Slack Bot, + # if the existing persona was one created just for use with this Slack channel, # then clean it up if existing_persona and existing_persona.name.startswith( SLACK_BOT_PERSONA_PREFIX @@ -188,28 +192,30 @@ def update_slack_bot_config( db_session.commit() - return slack_bot_config + return slack_channel_config -def remove_slack_bot_config( - slack_bot_config_id: int, - user: User | None, +def remove_slack_channel_config( db_session: Session, + slack_channel_config_id: int, + user: User | None, ) -> None: - slack_bot_config = db_session.scalar( - select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id) + slack_channel_config = db_session.scalar( + select(SlackChannelConfig).where( + SlackChannelConfig.id == slack_channel_config_id + ) ) - if slack_bot_config is None: + if slack_channel_config is None: raise ValueError( - f"Unable to find slack bot config with ID {slack_bot_config_id}" + f"Unable to find Slack channel config with ID {slack_channel_config_id}" ) - existing_persona_id = slack_bot_config.persona_id + existing_persona_id = slack_channel_config.persona_id if existing_persona_id: existing_persona = db_session.scalar( select(Persona).where(Persona.id == existing_persona_id) ) - # if the existing persona was one created just for use with this Slack Bot, + # if the existing persona was one created just for use with this Slack channel, # then clean it up if existing_persona and existing_persona.name.startswith( SLACK_BOT_PERSONA_PREFIX @@ -221,17 +227,28 @@ def remove_slack_bot_config( persona_id=existing_persona_id, user=user, db_session=db_session ) - db_session.delete(slack_bot_config) + db_session.delete(slack_channel_config) db_session.commit() -def fetch_slack_bot_config( - db_session: Session, slack_bot_config_id: int -) -> SlackBotConfig | None: - return db_session.scalar( - select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id) - ) +def fetch_slack_channel_configs( + db_session: Session, slack_bot_id: int | None = None +) -> 
Sequence[SlackChannelConfig]: + if not slack_bot_id: + return db_session.scalars(select(SlackChannelConfig)).all() + + return db_session.scalars( + select(SlackChannelConfig).where( + SlackChannelConfig.slack_bot_id == slack_bot_id + ) + ).all() -def fetch_slack_bot_configs(db_session: Session) -> Sequence[SlackBotConfig]: - return db_session.scalars(select(SlackBotConfig)).all() +def fetch_slack_channel_config( + db_session: Session, slack_channel_config_id: int +) -> SlackChannelConfig | None: + return db_session.scalar( + select(SlackChannelConfig).where( + SlackChannelConfig.id == slack_channel_config_id + ) + ) diff --git a/backend/danswer/db/swap_index.py b/backend/danswer/db/swap_index.py index 8f6d1718924..8b583bd2e4b 100644 --- a/backend/danswer/db/swap_index.py +++ b/backend/danswer/db/swap_index.py @@ -8,26 +8,35 @@ from danswer.db.index_attempt import ( count_unique_cc_pairs_with_successful_index_attempts, ) +from danswer.db.models import SearchSettings from danswer.db.search_settings import get_current_search_settings from danswer.db.search_settings import get_secondary_search_settings from danswer.db.search_settings import update_search_settings_status -from danswer.dynamic_configs.factory import get_dynamic_config_store +from danswer.key_value_store.factory import get_kv_store from danswer.utils.logger import setup_logger + logger = setup_logger() -def check_index_swap(db_session: Session) -> None: +def check_index_swap(db_session: Session) -> SearchSettings | None: """Get count of cc-pairs and count of successful index_attempts for the new model grouped by connector + credential, if it's the same, then assume - new index is done building. If so, swap the indices and expire the old one.""" + new index is done building. If so, swap the indices and expire the old one. + + Returns None if search settings did not change, or the old search settings if they + did change. 
+ """ + + old_search_settings = None + # Default CC-pair created for Ingestion API unused here all_cc_pairs = get_connector_credential_pairs(db_session) cc_pair_count = max(len(all_cc_pairs) - 1, 0) search_settings = get_secondary_search_settings(db_session) if not search_settings: - return + return None unique_cc_indexings = count_unique_cc_pairs_with_successful_index_attempts( search_settings_id=search_settings.id, db_session=db_session @@ -40,9 +49,9 @@ def check_index_swap(db_session: Session) -> None: if cc_pair_count == 0 or cc_pair_count == unique_cc_indexings: # Swap indices - now_old_search_settings = get_current_search_settings(db_session) + current_search_settings = get_current_search_settings(db_session) update_search_settings_status( - search_settings=now_old_search_settings, + search_settings=current_search_settings, new_status=IndexModelStatus.PAST, db_session=db_session, ) @@ -54,7 +63,7 @@ def check_index_swap(db_session: Session) -> None: ) if cc_pair_count > 0: - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() kv_store.store(KV_REINDEX_KEY, False) # Expire jobs for the now past index/embedding model @@ -63,3 +72,7 @@ def check_index_swap(db_session: Session) -> None: # Recount aggregates for cc_pair in all_cc_pairs: resync_cc_pair(cc_pair, db_session=db_session) + + old_search_settings = current_search_settings + + return old_search_settings diff --git a/backend/danswer/db/tag.py b/backend/danswer/db/tag.py index 688b8a11272..6f19859087f 100644 --- a/backend/danswer/db/tag.py +++ b/backend/danswer/db/tag.py @@ -1,3 +1,4 @@ +from sqlalchemy import and_ from sqlalchemy import delete from sqlalchemy import func from sqlalchemy import or_ @@ -107,12 +108,14 @@ def create_or_add_document_tag_list( return all_tags -def get_tags_by_value_prefix_for_source_types( +def find_tags( tag_key_prefix: str | None, tag_value_prefix: str | None, sources: list[DocumentSource] | None, limit: int | None, db_session: Session, + # if set, both tag_key_prefix and tag_value_prefix must be a match + require_both_to_match: bool = False, ) -> list[Tag]: query = select(Tag) @@ -122,7 +125,11 @@ def get_tags_by_value_prefix_for_source_types( conditions.append(Tag.tag_key.ilike(f"{tag_key_prefix}%")) if tag_value_prefix: conditions.append(Tag.tag_value.ilike(f"{tag_value_prefix}%")) - query = query.where(or_(*conditions)) + + final_prefix_condition = ( + and_(*conditions) if require_both_to_match else or_(*conditions) + ) + query = query.where(final_prefix_condition) if sources: query = query.where(Tag.source.in_(sources)) diff --git a/backend/danswer/db/token_limit.py b/backend/danswer/db/token_limit.py new file mode 100644 index 00000000000..24b2433cc1a --- /dev/null +++ b/backend/danswer/db/token_limit.py @@ -0,0 +1,111 @@ +from collections.abc import Sequence + +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.configs.constants import TokenRateLimitScope +from danswer.db.models import TokenRateLimit +from danswer.db.models import TokenRateLimit__UserGroup +from danswer.server.token_rate_limits.models import TokenRateLimitArgs + + +def fetch_all_user_token_rate_limits( + db_session: Session, + enabled_only: bool = False, + ordered: bool = True, +) -> Sequence[TokenRateLimit]: + query = select(TokenRateLimit).where( + TokenRateLimit.scope == TokenRateLimitScope.USER + ) + + if enabled_only: + query = query.where(TokenRateLimit.enabled.is_(True)) + + if ordered: + query = query.order_by(TokenRateLimit.created_at.desc()) + + return 
db_session.scalars(query).all() + + +def fetch_all_global_token_rate_limits( + db_session: Session, + enabled_only: bool = False, + ordered: bool = True, +) -> Sequence[TokenRateLimit]: + query = select(TokenRateLimit).where( + TokenRateLimit.scope == TokenRateLimitScope.GLOBAL + ) + + if enabled_only: + query = query.where(TokenRateLimit.enabled.is_(True)) + + if ordered: + query = query.order_by(TokenRateLimit.created_at.desc()) + + token_rate_limits = db_session.scalars(query).all() + return token_rate_limits + + +def insert_user_token_rate_limit( + db_session: Session, + token_rate_limit_settings: TokenRateLimitArgs, +) -> TokenRateLimit: + token_limit = TokenRateLimit( + enabled=token_rate_limit_settings.enabled, + token_budget=token_rate_limit_settings.token_budget, + period_hours=token_rate_limit_settings.period_hours, + scope=TokenRateLimitScope.USER, + ) + db_session.add(token_limit) + db_session.commit() + + return token_limit + + +def insert_global_token_rate_limit( + db_session: Session, + token_rate_limit_settings: TokenRateLimitArgs, +) -> TokenRateLimit: + token_limit = TokenRateLimit( + enabled=token_rate_limit_settings.enabled, + token_budget=token_rate_limit_settings.token_budget, + period_hours=token_rate_limit_settings.period_hours, + scope=TokenRateLimitScope.GLOBAL, + ) + db_session.add(token_limit) + db_session.commit() + + return token_limit + + +def update_token_rate_limit( + db_session: Session, + token_rate_limit_id: int, + token_rate_limit_settings: TokenRateLimitArgs, +) -> TokenRateLimit: + token_limit = db_session.get(TokenRateLimit, token_rate_limit_id) + if token_limit is None: + raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found") + + token_limit.enabled = token_rate_limit_settings.enabled + token_limit.token_budget = token_rate_limit_settings.token_budget + token_limit.period_hours = token_rate_limit_settings.period_hours + db_session.commit() + + return token_limit + + +def delete_token_rate_limit( + db_session: Session, + token_rate_limit_id: int, +) -> None: + token_limit = db_session.get(TokenRateLimit, token_rate_limit_id) + if token_limit is None: + raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found") + + db_session.query(TokenRateLimit__UserGroup).filter( + TokenRateLimit__UserGroup.rate_limit_id == token_rate_limit_id + ).delete() + + db_session.delete(token_limit) + db_session.commit() diff --git a/backend/danswer/db/tools.py b/backend/danswer/db/tools.py index 1e75b1c4901..a89dafef385 100644 --- a/backend/danswer/db/tools.py +++ b/backend/danswer/db/tools.py @@ -1,10 +1,13 @@ from typing import Any +from typing import cast from uuid import UUID from sqlalchemy import select from sqlalchemy.orm import Session from danswer.db.models import Tool +from danswer.server.features.tool.models import Header +from danswer.utils.headers import HeaderItemDict from danswer.utils.logger import setup_logger logger = setup_logger() @@ -21,10 +24,18 @@ def get_tool_by_id(tool_id: int, db_session: Session) -> Tool: return tool +def get_tool_by_name(tool_name: str, db_session: Session) -> Tool: + tool = db_session.scalar(select(Tool).where(Tool.name == tool_name)) + if not tool: + raise ValueError("Tool by specified name does not exist") + return tool + + def create_tool( name: str, description: str | None, openapi_schema: dict[str, Any] | None, + custom_headers: list[Header] | None, user_id: UUID | None, db_session: Session, ) -> Tool: @@ -33,6 +44,9 @@ def create_tool( description=description, 
in_code_tool_id=None, openapi_schema=openapi_schema, + custom_headers=[header.model_dump() for header in custom_headers] + if custom_headers + else [], user_id=user_id, ) db_session.add(new_tool) @@ -45,6 +59,7 @@ def update_tool( name: str | None, description: str | None, openapi_schema: dict[str, Any] | None, + custom_headers: list[Header] | None, user_id: UUID | None, db_session: Session, ) -> Tool: @@ -60,6 +75,10 @@ def update_tool( tool.openapi_schema = openapi_schema if user_id is not None: tool.user_id = user_id + if custom_headers is not None: + tool.custom_headers = [ + cast(HeaderItemDict, header.model_dump()) for header in custom_headers + ] db_session.commit() return tool diff --git a/backend/danswer/db/users.py b/backend/danswer/db/users.py index 61ba6e475fe..5014c35cbec 100644 --- a/backend/danswer/db/users.py +++ b/backend/danswer/db/users.py @@ -1,7 +1,9 @@ from collections.abc import Sequence from uuid import UUID +from fastapi import HTTPException from fastapi_users.password import PasswordHelper +from sqlalchemy import func from sqlalchemy import select from sqlalchemy.orm import Session @@ -9,45 +11,177 @@ from danswer.db.models import User +def validate_user_role_update(requested_role: UserRole, current_role: UserRole) -> None: + """ + Validate that a user role update is valid. + Assumed only admins can hit this endpoint. + raise if: + - requested role is a curator + - requested role is a slack user + - requested role is an external permissioned user + - requested role is a limited user + - current role is a slack user + - current role is an external permissioned user + - current role is a limited user + """ + + if current_role == UserRole.SLACK_USER: + raise HTTPException( + status_code=400, + detail="To change a Slack User's role, they must first login to Danswer via the web app.", + ) + + if current_role == UserRole.EXT_PERM_USER: + # This shouldn't happen, but just in case + raise HTTPException( + status_code=400, + detail="To change an External Permissioned User's role, they must first login to Danswer via the web app.", + ) + + if current_role == UserRole.LIMITED: + raise HTTPException( + status_code=400, + detail="To change a Limited User's role, they must first login to Danswer via the web app.", + ) + + if requested_role == UserRole.CURATOR: + # This shouldn't happen, but just in case + raise HTTPException( + status_code=400, + detail="Curator role must be set via the User Group Menu", + ) + + if requested_role == UserRole.LIMITED: + # This shouldn't happen, but just in case + raise HTTPException( + status_code=400, + detail=( + "A user cannot be set to a Limited User role. " + "This role is automatically assigned to users through certain endpoints in the API." + ), + ) + + if requested_role == UserRole.SLACK_USER: + # This shouldn't happen, but just in case + raise HTTPException( + status_code=400, + detail=( + "A user cannot be set to a Slack User role. " + "This role is automatically assigned to users who only use Danswer via Slack." + ), + ) + + if requested_role == UserRole.EXT_PERM_USER: + # This shouldn't happen, but just in case + raise HTTPException( + status_code=400, + detail=( + "A user cannot be set to an External Permissioned User role. " + "This role is automatically assigned to users who have been " + "pulled in to the system via an external permissions system." 
+ ), + ) + + def list_users( - db_session: Session, email_filter_string: str = "", user: User | None = None + db_session: Session, email_filter_string: str = "", include_external: bool = False ) -> Sequence[User]: """List all users. No pagination as of now, as the # of users is assumed to be relatively small (<< 1 million)""" stmt = select(User) + where_clause = [] + + if not include_external: + where_clause.append(User.role != UserRole.EXT_PERM_USER) + if email_filter_string: - stmt = stmt.where(User.email.ilike(f"%{email_filter_string}%")) # type: ignore + where_clause.append(User.email.ilike(f"%{email_filter_string}%")) # type: ignore + + stmt = stmt.where(*where_clause) return db_session.scalars(stmt).unique().all() def get_user_by_email(email: str, db_session: Session) -> User | None: - user = db_session.query(User).filter(User.email == email).first() # type: ignore + user = ( + db_session.query(User) + .filter(func.lower(User.email) == func.lower(email)) + .first() + ) return user def fetch_user_by_id(db_session: Session, user_id: UUID) -> User | None: - user = db_session.query(User).filter(User.id == user_id).first() # type: ignore + return db_session.query(User).filter(User.id == user_id).first() # type: ignore - return user + +def _generate_slack_user(email: str) -> User: + fastapi_users_pw_helper = PasswordHelper() + password = fastapi_users_pw_helper.generate() + hashed_pass = fastapi_users_pw_helper.hash(password) + return User( + email=email, + hashed_password=hashed_pass, + role=UserRole.SLACK_USER, + ) -def add_non_web_user_if_not_exists(email: str, db_session: Session) -> User: +def add_slack_user_if_not_exists(db_session: Session, email: str) -> User: + email = email.lower() user = get_user_by_email(email, db_session) if user is not None: + # If the user is an external permissioned user, we update it to a slack user + if user.role == UserRole.EXT_PERM_USER: + user.role = UserRole.SLACK_USER + db_session.commit() return user + user = _generate_slack_user(email=email) + db_session.add(user) + db_session.commit() + return user + + +def _get_users_by_emails( + db_session: Session, lower_emails: list[str] +) -> tuple[list[User], list[str]]: + stmt = select(User).filter(func.lower(User.email).in_(lower_emails)) # type: ignore + found_users = list(db_session.scalars(stmt).unique().all()) # Convert to list + + # Extract found emails and convert to lowercase to avoid case sensitivity issues + found_users_emails = [user.email.lower() for user in found_users] + + # Separate emails for users that were not found + missing_user_emails = [ + email for email in lower_emails if email not in found_users_emails + ] + return found_users, missing_user_emails + + +def _generate_ext_permissioned_user(email: str) -> User: fastapi_users_pw_helper = PasswordHelper() password = fastapi_users_pw_helper.generate() hashed_pass = fastapi_users_pw_helper.hash(password) - user = User( + return User( email=email, hashed_password=hashed_pass, - has_web_login=False, - role=UserRole.BASIC, + role=UserRole.EXT_PERM_USER, ) - db_session.add(user) + + +def batch_add_ext_perm_user_if_not_exists( + db_session: Session, emails: list[str] +) -> list[User]: + lower_emails = [email.lower() for email in emails] + found_users, missing_lower_emails = _get_users_by_emails(db_session, lower_emails) + + new_users: list[User] = [] + for email in missing_lower_emails: + new_users.append(_generate_ext_permissioned_user(email=email)) + + db_session.add_all(new_users) db_session.commit() - return user + + return found_users + 
new_users diff --git a/backend/danswer/document_index/document_index_utils.py b/backend/danswer/document_index/document_index_utils.py index fab7b85ef48..c6f48d1b40d 100644 --- a/backend/danswer/document_index/document_index_utils.py +++ b/backend/danswer/document_index/document_index_utils.py @@ -3,10 +3,10 @@ from sqlalchemy.orm import Session +from danswer.context.search.models import InferenceChunk from danswer.db.search_settings import get_current_search_settings from danswer.db.search_settings import get_secondary_search_settings from danswer.indexing.models import IndexChunk -from danswer.search.models import InferenceChunk DEFAULT_BATCH_SIZE = 30 diff --git a/backend/danswer/document_index/factory.py b/backend/danswer/document_index/factory.py index 17701d98e04..92dde3dda43 100644 --- a/backend/danswer/document_index/factory.py +++ b/backend/danswer/document_index/factory.py @@ -1,5 +1,9 @@ +from sqlalchemy.orm import Session + +from danswer.db.search_settings import get_current_search_settings from danswer.document_index.interfaces import DocumentIndex from danswer.document_index.vespa.index import VespaIndex +from shared_configs.configs import MULTI_TENANT def get_default_document_index( @@ -11,5 +15,18 @@ def get_default_document_index( index both need to be updated, updates are applied to both indices""" # Currently only supporting Vespa return VespaIndex( - index_name=primary_index_name, secondary_index_name=secondary_index_name + index_name=primary_index_name, + secondary_index_name=secondary_index_name, + multitenant=MULTI_TENANT, + ) + + +def get_current_primary_default_document_index(db_session: Session) -> DocumentIndex: + """ + TODO: Use redis to cache this or something + """ + search_settings = get_current_search_settings(db_session) + return get_default_document_index( + primary_index_name=search_settings.index_name, + secondary_index_name=None, ) diff --git a/backend/danswer/document_index/interfaces.py b/backend/danswer/document_index/interfaces.py index eaa34b37752..84dcbf48473 100644 --- a/backend/danswer/document_index/interfaces.py +++ b/backend/danswer/document_index/interfaces.py @@ -4,9 +4,9 @@ from typing import Any from danswer.access.models import DocumentAccess +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import InferenceChunkUncleaned from danswer.indexing.models import DocMetadataAwareIndexChunk -from danswer.search.models import IndexFilters -from danswer.search.models import InferenceChunkUncleaned from shared_configs.model_server_models import Embedding @@ -55,6 +55,21 @@ class DocumentMetadata: from_ingestion_api: bool = False +@dataclass +class VespaDocumentFields: + """ + Specifies fields in Vespa for a document. Fields set to None will be ignored. + Perhaps we should name this in an implementation agnostic fashion, but it's more + understandable like this for now. + """ + + # all other fields except these 4 will always be left alone by the update request + access: DocumentAccess | None = None + document_sets: set[str] | None = None + boost: float | None = None + hidden: bool | None = None + + @dataclass class UpdateRequest: """ @@ -112,6 +127,17 @@ def ensure_indices_exist( """ raise NotImplementedError + @staticmethod + @abc.abstractmethod + def register_multitenant_indices( + indices: list[str], + embedding_dims: list[int], + ) -> None: + """ + Register multitenant indices with the document index. 
+ """ + raise NotImplementedError + class Indexable(abc.ABC): """ @@ -156,6 +182,16 @@ class Deletable(abc.ABC): Class must implement the ability to delete document by their unique document ids. """ + @abc.abstractmethod + def delete_single(self, doc_id: str) -> int: + """ + Given a single document id, hard delete it from the document index + + Parameters: + - doc_id: document id as specified by the connector + """ + raise NotImplementedError + @abc.abstractmethod def delete(self, doc_ids: list[str]) -> None: """ @@ -178,11 +214,9 @@ class Updatable(abc.ABC): """ @abc.abstractmethod - def update_single(self, update_request: UpdateRequest) -> None: + def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: """ - Updates some set of chunks for a document. The document and fields to update - are specified in the update request. Each update request in the list applies - its changes to a list of document ids. + Updates all chunks for a document with the specified fields. None values mean that the field does not need an update. The rationale for a single update function is that it allows retries and parallelism @@ -190,14 +224,10 @@ def update_single(self, update_request: UpdateRequest) -> None: us to individually handle error conditions per document. Parameters: - - update_request: for a list of document ids in the update request, apply the same updates - to all of the documents with those ids. + - fields: the fields to update in the document. Any field set to None will not be changed. Return: - - an HTTPStatus code. The code can used to decide whether to fail immediately, - retry, etc. Although this method likely hits an HTTP API behind the - scenes, the usage of HTTPStatus is a convenience and the interface is not - actually HTTP specific. + None """ raise NotImplementedError diff --git a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd index be279f6a611..e712266fa08 100644 --- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd +++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd @@ -1,5 +1,6 @@ schema DANSWER_CHUNK_NAME { document DANSWER_CHUNK_NAME { + TENANT_ID_REPLACEMENT # Not to be confused with the UUID generated for this chunk which is called documentid by default field document_id type string { indexing: summary | attribute @@ -14,7 +15,7 @@ schema DANSWER_CHUNK_NAME { # Must have an additional field for whether to skip title embeddings # This information cannot be extracted from either the title field nor title embedding field skip_title type bool { - indexing: attribute + indexing: attribute } # May not always match the `semantic_identifier` e.g. 
for Slack docs the # `semantic_identifier` will be the channel name, but the `title` will be empty @@ -35,7 +36,7 @@ schema DANSWER_CHUNK_NAME { } # Title embedding (x1) field title_embedding type tensor(x[VARIABLE_DIM]) { - indexing: attribute + indexing: attribute | index attribute { distance-metric: angular } @@ -43,7 +44,7 @@ schema DANSWER_CHUNK_NAME { # Content embeddings (chunk + optional mini chunks embeddings) # "t" and "x" are arbitrary names, not special keywords field embeddings type tensor(t{},x[VARIABLE_DIM]) { - indexing: attribute + indexing: attribute | index attribute { distance-metric: angular } diff --git a/backend/danswer/document_index/vespa/chunk_retrieval.py b/backend/danswer/document_index/vespa/chunk_retrieval.py index e4b2ad83ce2..6715dc76ae1 100644 --- a/backend/danswer/document_index/vespa/chunk_retrieval.py +++ b/backend/danswer/document_index/vespa/chunk_retrieval.py @@ -7,11 +7,14 @@ from typing import Any from typing import cast -import requests +import httpx from retry import retry from danswer.configs.app_configs import LOG_VESPA_TIMING_INFORMATION +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import InferenceChunkUncleaned from danswer.document_index.interfaces import VespaChunkRequest +from danswer.document_index.vespa.shared_utils.utils import get_vespa_http_client from danswer.document_index.vespa.shared_utils.vespa_request_builders import ( build_vespa_filters, ) @@ -43,8 +46,6 @@ from danswer.document_index.vespa_constants import SOURCE_TYPE from danswer.document_index.vespa_constants import TITLE from danswer.document_index.vespa_constants import YQL_BASE -from danswer.search.models import IndexFilters -from danswer.search.models import InferenceChunkUncleaned from danswer.utils.logger import setup_logger from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel @@ -192,20 +193,21 @@ def _get_chunks_via_visit_api( document_chunks: list[dict] = [] while True: - response = requests.get(url, params=params) try: - response.raise_for_status() - except requests.HTTPError as e: - request_info = f"Headers: {response.request.headers}\nPayload: {params}" - response_info = f"Status Code: {response.status_code}\nResponse Content: {response.text}" - error_base = f"Error occurred getting chunk by Document ID {chunk_request.document_id}" + filtered_params = {k: v for k, v in params.items() if v is not None} + with get_vespa_http_client() as http_client: + response = http_client.get(url, params=filtered_params) + response.raise_for_status() + except httpx.HTTPError as e: + error_base = "Failed to query Vespa" logger.error( f"{error_base}:\n" - f"{request_info}\n" - f"{response_info}\n" - f"Exception: {e}" + f"Request URL: {e.request.url}\n" + f"Request Headers: {e.request.headers}\n" + f"Request Payload: {params}\n" + f"Exception: {str(e)}" ) - raise requests.HTTPError(error_base) from e + raise httpx.HTTPError(error_base) from e # Check if the response contains any documents response_data = response.json() @@ -229,6 +231,7 @@ def _get_chunks_via_visit_api( return document_chunks +@retry(tries=10, delay=1, backoff=2) def get_all_vespa_ids_for_document_id( document_id: str, index_name: str, @@ -293,28 +296,24 @@ def query_vespa( if LOG_VESPA_TIMING_INFORMATION else {}, ) + try: - response = requests.post( - SEARCH_ENDPOINT, - json=params, - ) - response.raise_for_status() - except requests.HTTPError as e: - request_info = f"Headers: {response.request.headers}\nPayload: {params}" - response_info = 
( - f"Status Code: {response.status_code}\n" - f"Response Content: {response.text}" - ) + with get_vespa_http_client() as http_client: + response = http_client.post(SEARCH_ENDPOINT, json=params) + response.raise_for_status() + except httpx.HTTPError as e: error_base = "Failed to query Vespa" logger.error( f"{error_base}:\n" - f"{request_info}\n" - f"{response_info}\n" - f"Exception: {e}" + f"Request URL: {e.request.url}\n" + f"Request Headers: {e.request.headers}\n" + f"Request Payload: {params}\n" + f"Exception: {str(e)}" ) - raise requests.HTTPError(error_base) from e + raise httpx.HTTPError(error_base) from e response_json: dict[str, Any] = response.json() + if LOG_VESPA_TIMING_INFORMATION: logger.debug("Vespa timing info: %s", response_json.get("timing")) hits = response_json["root"].get("children", []) diff --git a/backend/danswer/document_index/vespa/index.py b/backend/danswer/document_index/vespa/index.py index 5d5d63d39eb..ebe6daca1a2 100644 --- a/backend/danswer/document_index/vespa/index.py +++ b/backend/danswer/document_index/vespa/index.py @@ -1,27 +1,34 @@ import concurrent.futures import io +import logging import os import re import time +import urllib import zipfile from dataclasses import dataclass from datetime import datetime from datetime import timedelta from typing import BinaryIO from typing import cast +from typing import List -import httpx -import requests +import httpx # type: ignore +import requests # type: ignore +from danswer.configs.app_configs import DOCUMENT_INDEX_NAME from danswer.configs.chat_configs import DOC_TIME_DECAY from danswer.configs.chat_configs import NUM_RETURNED_HITS from danswer.configs.chat_configs import TITLE_CONTENT_RATIO from danswer.configs.chat_configs import VESPA_SEARCHER_THREADS from danswer.configs.constants import KV_REINDEX_KEY +from danswer.context.search.models import IndexFilters +from danswer.context.search.models import InferenceChunkUncleaned from danswer.document_index.interfaces import DocumentIndex from danswer.document_index.interfaces import DocumentInsertionRecord from danswer.document_index.interfaces import UpdateRequest from danswer.document_index.interfaces import VespaChunkRequest +from danswer.document_index.interfaces import VespaDocumentFields from danswer.document_index.vespa.chunk_retrieval import batch_search_api_retrieval from danswer.document_index.vespa.chunk_retrieval import ( get_all_vespa_ids_for_document_id, @@ -36,6 +43,7 @@ from danswer.document_index.vespa.indexing_utils import ( get_existing_documents_from_chunks, ) +from danswer.document_index.vespa.shared_utils.utils import get_vespa_http_client from danswer.document_index.vespa.shared_utils.utils import ( replace_invalid_doc_id_characters, ) @@ -54,20 +62,26 @@ from danswer.document_index.vespa_constants import HIDDEN from danswer.document_index.vespa_constants import NUM_THREADS from danswer.document_index.vespa_constants import SEARCH_THREAD_NUMBER_PAT +from danswer.document_index.vespa_constants import TENANT_ID_PAT +from danswer.document_index.vespa_constants import TENANT_ID_REPLACEMENT from danswer.document_index.vespa_constants import VESPA_APPLICATION_ENDPOINT from danswer.document_index.vespa_constants import VESPA_DIM_REPLACEMENT_PAT from danswer.document_index.vespa_constants import VESPA_TIMEOUT from danswer.document_index.vespa_constants import YQL_BASE -from danswer.dynamic_configs.factory import get_dynamic_config_store from danswer.indexing.models import DocMetadataAwareIndexChunk -from danswer.search.models import IndexFilters 
-from danswer.search.models import InferenceChunkUncleaned +from danswer.key_value_store.factory import get_kv_store from danswer.utils.batching import batch_generator from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT from shared_configs.model_server_models import Embedding + logger = setup_logger() +# Set the logging level to WARNING to ignore INFO and DEBUG logs +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) + @dataclass class _VespaUpdateRequest: @@ -85,7 +99,7 @@ def in_memory_zip_from_file_bytes(file_contents: dict[str, bytes]) -> BinaryIO: return zip_buffer -def _create_document_xml_lines(doc_names: list[str | None]) -> str: +def _create_document_xml_lines(doc_names: list[str | None] | list[str]) -> str: doc_lines = [ f'' for doc_name in doc_names @@ -110,17 +124,30 @@ def add_ngrams_to_schema(schema_content: str) -> str: class VespaIndex(DocumentIndex): - def __init__(self, index_name: str, secondary_index_name: str | None) -> None: + def __init__( + self, + index_name: str, + secondary_index_name: str | None, + multitenant: bool = False, + ) -> None: self.index_name = index_name self.secondary_index_name = secondary_index_name + self.multitenant = multitenant + self.http_client = get_vespa_http_client() def ensure_indices_exist( self, index_embedding_dim: int, secondary_index_embedding_dim: int | None, ) -> None: + if MULTI_TENANT: + logger.info( + "Skipping Vespa index setup for multitenant (would wipe all indices)" + ) + return None + deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate" - logger.info(f"Deploying Vespa application package to {deploy_url}") + logger.notice(f"Deploying Vespa application package to {deploy_url}") vespa_schema_path = os.path.join( os.getcwd(), "danswer", "document_index", "vespa", "app_config" @@ -140,7 +167,7 @@ def ensure_indices_exist( SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS) ) - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() needs_reindexing = False try: @@ -166,10 +193,14 @@ def ensure_indices_exist( with open(schema_file, "r") as schema_f: schema_template = schema_f.read() + schema_template = schema_template.replace(TENANT_ID_PAT, "") + schema = schema_template.replace( DANSWER_CHUNK_REPLACEMENT_PAT, self.index_name ).replace(VESPA_DIM_REPLACEMENT_PAT, str(index_embedding_dim)) + schema = add_ngrams_to_schema(schema) if needs_reindexing else schema + schema = schema.replace(TENANT_ID_PAT, "") zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8") if self.secondary_index_name: @@ -187,6 +218,91 @@ def ensure_indices_exist( f"Failed to prepare Vespa Danswer Index.
Response: {response.text}" ) + @staticmethod + def register_multitenant_indices( + indices: list[str], + embedding_dims: list[int], + ) -> None: + if not MULTI_TENANT: + raise ValueError("Multi-tenant is not enabled") + + deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate" + logger.info(f"Deploying Vespa application package to {deploy_url}") + + vespa_schema_path = os.path.join( + os.getcwd(), "danswer", "document_index", "vespa", "app_config" + ) + schema_file = os.path.join(vespa_schema_path, "schemas", "danswer_chunk.sd") + services_file = os.path.join(vespa_schema_path, "services.xml") + overrides_file = os.path.join(vespa_schema_path, "validation-overrides.xml") + + with open(services_file, "r") as services_f: + services_template = services_f.read() + + # Generate schema names from index settings + schema_names = [index_name for index_name in indices] + + full_schemas = schema_names + + doc_lines = _create_document_xml_lines(full_schemas) + + services = services_template.replace(DOCUMENT_REPLACEMENT_PAT, doc_lines) + services = services.replace( + SEARCH_THREAD_NUMBER_PAT, str(VESPA_SEARCHER_THREADS) + ) + + kv_store = get_kv_store() + + needs_reindexing = False + try: + needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY)) + except Exception: + logger.debug("Could not load the reindexing flag. Using ngrams") + + with open(overrides_file, "r") as overrides_f: + overrides_template = overrides_f.read() + + # Vespa requires an override to erase data including the indices we're no longer using + # It also has a 30 day cap from current so we set it to 7 dynamically + now = datetime.now() + date_in_7_days = now + timedelta(days=7) + formatted_date = date_in_7_days.strftime("%Y-%m-%d") + + overrides = overrides_template.replace(DATE_REPLACEMENT, formatted_date) + + zip_dict = { + "services.xml": services.encode("utf-8"), + "validation-overrides.xml": overrides.encode("utf-8"), + } + + with open(schema_file, "r") as schema_f: + schema_template = schema_f.read() + + for i, index_name in enumerate(indices): + embedding_dim = embedding_dims[i] + logger.info( + f"Creating index: {index_name} with embedding dimension: {embedding_dim}" + ) + + schema = schema_template.replace( + DANSWER_CHUNK_REPLACEMENT_PAT, index_name + ).replace(VESPA_DIM_REPLACEMENT_PAT, str(embedding_dim)) + schema = schema.replace( + TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else "" + ) + schema = add_ngrams_to_schema(schema) if needs_reindexing else schema + zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8") + + zip_file = in_memory_zip_from_file_bytes(zip_dict) + + headers = {"Content-Type": "application/zip"} + response = requests.post(deploy_url, headers=headers, data=zip_file) + + if response.status_code != 200: + raise RuntimeError( + f"Failed to prepare Vespa Danswer Indexes. Response: {response.text}" + ) + def index( self, chunks: list[DocMetadataAwareIndexChunk], @@ -204,7 +320,7 @@ def index( # indexing / updates / deletes since we have to make a large volume of requests. 
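The index registration above is mostly string templating: danswer_chunk.sd ships with placeholder tokens, and the deploy step swaps in the real index name, the embedding dimension, and (only for multi-tenant installs) a tenant_id field before zipping and POSTing the application package. A minimal standalone sketch of that substitution, assuming the *_PAT constants resolve to the literal tokens that appear in the schema file earlier in this diff (the index name and dimension below are purely illustrative):

```python
# Sketch of the placeholder substitution performed by ensure_indices_exist /
# register_multitenant_indices. The literal tokens mirror danswer_chunk.sd;
# treating them as the values of the *_PAT constants is an assumption.
SCHEMA_TEMPLATE = """\
schema DANSWER_CHUNK_NAME {
    document DANSWER_CHUNK_NAME {
        TENANT_ID_REPLACEMENT
        field embeddings type tensor(t{},x[VARIABLE_DIM]) {
            indexing: attribute | index
        }
    }
}
"""

TENANT_ID_FIELD = (
    "field tenant_id type string {\n"
    "    indexing: summary | attribute\n"
    "    rank: filter\n"
    "    attribute: fast-search\n"
    "}"
)


def render_schema(index_name: str, embedding_dim: int, multi_tenant: bool) -> str:
    """Substitute index name, embedding dimension, and (optionally) the
    tenant_id field into the schema template."""
    schema = SCHEMA_TEMPLATE.replace("DANSWER_CHUNK_NAME", index_name)
    schema = schema.replace("VARIABLE_DIM", str(embedding_dim))
    return schema.replace(
        "TENANT_ID_REPLACEMENT", TENANT_ID_FIELD if multi_tenant else ""
    )


if __name__ == "__main__":
    # Hypothetical index name and dimension, for illustration only
    print(render_schema("danswer_chunk_example_model", 768, multi_tenant=True))
```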
with ( concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor, - httpx.Client(http2=True) as http_client, + get_vespa_http_client() as http_client, ): # Check for existing documents, existing documents need to have all of their chunks deleted # prior to indexing as the document size (num chunks) may have shrunk @@ -232,6 +348,7 @@ def index( chunks=chunk_batch, index_name=self.index_name, http_client=http_client, + multitenant=self.multitenant, executor=executor, ) @@ -266,9 +383,10 @@ def _update_chunk( # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficient for # indexing / updates / deletes since we have to make a large volume of requests. + with ( concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor, - httpx.Client(http2=True) as http_client, + get_vespa_http_client() as http_client, ): for update_batch in batch_generator(updates, batch_size): future_to_document_id = { @@ -377,90 +495,89 @@ def update(self, update_requests: list[UpdateRequest]) -> None: time.monotonic() - update_start, ) - def update_single(self, update_request: UpdateRequest) -> None: + def update_single(self, doc_id: str, fields: VespaDocumentFields) -> int: """Note: if the document id does not exist, the update will be a no-op and the function will complete with no errors or exceptions. Handle other exceptions if you wish to implement retry behavior """ - if len(update_request.document_ids) != 1: - raise ValueError("update_request must contain a single document id") + + total_chunks_updated = 0 # Handle Vespa character limitations # Mutating update_request but it's not used later anyway - update_request.document_ids = [ - replace_invalid_doc_id_characters(doc_id) - for doc_id in update_request.document_ids - ] - - # update_start = time.monotonic() - - # Fetch all chunks for each document ahead of time - index_names = [self.index_name] - if self.secondary_index_name: - index_names.append(self.secondary_index_name) - - chunk_id_start_time = time.monotonic() - all_doc_chunk_ids: list[str] = [] - for index_name in index_names: - for document_id in update_request.document_ids: - # this calls vespa and can raise http exceptions - doc_chunk_ids = get_all_vespa_ids_for_document_id( - document_id=document_id, - index_name=index_name, - filters=None, - get_large_chunks=True, - ) - all_doc_chunk_ids.extend(doc_chunk_ids) - logger.debug( - f"Took {time.monotonic() - chunk_id_start_time:.2f} seconds to fetch all Vespa chunk IDs" - ) + normalized_doc_id = replace_invalid_doc_id_characters(doc_id) # Build the _VespaUpdateRequest objects update_dict: dict[str, dict] = {"fields": {}} - if update_request.boost is not None: - update_dict["fields"][BOOST] = {"assign": update_request.boost} - if update_request.document_sets is not None: + if fields.boost is not None: + update_dict["fields"][BOOST] = {"assign": fields.boost} + if fields.document_sets is not None: update_dict["fields"][DOCUMENT_SETS] = { - "assign": { - document_set: 1 for document_set in update_request.document_sets - } + "assign": {document_set: 1 for document_set in fields.document_sets} } - if update_request.access is not None: + if fields.access is not None: update_dict["fields"][ACCESS_CONTROL_LIST] = { - "assign": {acl_entry: 1 for acl_entry in update_request.access.to_acl()} + "assign": {acl_entry: 1 for acl_entry in fields.access.to_acl()} } - if update_request.hidden is not None: - update_dict["fields"][HIDDEN] = {"assign": update_request.hidden} + if fields.hidden is not None: + 
update_dict["fields"][HIDDEN] = {"assign": fields.hidden} if not update_dict["fields"]: logger.error("Update request received but nothing to update") - return + return 0 - processed_update_requests: list[_VespaUpdateRequest] = [] - for document_id in update_request.document_ids: - for doc_chunk_id in all_doc_chunk_ids: - processed_update_requests.append( - _VespaUpdateRequest( - document_id=document_id, - url=f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}/{doc_chunk_id}", - update_request=update_dict, - ) - ) + index_names = [self.index_name] + if self.secondary_index_name: + index_names.append(self.secondary_index_name) - with httpx.Client(http2=True) as http_client: - for update in processed_update_requests: - http_client.put( - update.url, - headers={"Content-Type": "application/json"}, - json=update.update_request, + with get_vespa_http_client() as http_client: + for index_name in index_names: + params = httpx.QueryParams( + { + "selection": f"{index_name}.document_id=='{normalized_doc_id}'", + "cluster": DOCUMENT_INDEX_NAME, + } ) - # logger.debug( - # "Finished updating Vespa documents in %.2f seconds", - # time.monotonic() - update_start, - # ) + while True: + try: + resp = http_client.put( + f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}", + params=params, + headers={"Content-Type": "application/json"}, + json=update_dict, + ) - return + resp.raise_for_status() + except httpx.HTTPStatusError as e: + logger.error( + f"Failed to update chunks, details: {e.response.text}" + ) + raise + + resp_data = resp.json() + + if "documentCount" in resp_data: + chunks_updated = resp_data["documentCount"] + total_chunks_updated += chunks_updated + + # Check for continuation token to handle pagination + if "continuation" not in resp_data: + break # Exit loop if no continuation token + + if not resp_data["continuation"]: + break # Exit loop if continuation token is empty + + params = params.set("continuation", resp_data["continuation"]) + + logger.debug( + f"VespaIndex.update_single: " + f"index={index_name} " + f"doc={normalized_doc_id} " + f"chunks_updated={total_chunks_updated}" + ) + + return total_chunks_updated def delete(self, doc_ids: list[str]) -> None: logger.info(f"Deleting {len(doc_ids)} documents from Vespa") @@ -469,7 +586,7 @@ def delete(self, doc_ids: list[str]) -> None: # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for # indexing / updates / deletes since we have to make a large volume of requests. - with httpx.Client(http2=True) as http_client: + with get_vespa_http_client() as http_client: index_names = [self.index_name] if self.secondary_index_name: index_names.append(self.secondary_index_name) @@ -478,6 +595,70 @@ def delete(self, doc_ids: list[str]) -> None: delete_vespa_docs( document_ids=doc_ids, index_name=index_name, http_client=http_client ) + return + + def delete_single(self, doc_id: str) -> int: + """Possibly faster overall than the delete method due to using a single + delete call with a selection query.""" + + total_chunks_deleted = 0 + + # Vespa deletion is poorly documented ... luckily we found this + # https://docs.vespa.ai/en/operations/batch-delete.html#example + + doc_id = replace_invalid_doc_id_characters(doc_id) + + # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for + # indexing / updates / deletes since we have to make a large volume of requests. 
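Both update_single above and the delete_single body that follows lean on the same Vespa /document/v1 pattern: address a document's chunks with a selection expression, accumulate the reported documentCount, and resubmit with the returned continuation token until none comes back. A standalone sketch of that loop, with the base URL, index name, and cluster name as illustrative stand-ins for the real constants:

```python
# Minimal sketch of the selection + continuation-token loop used by
# update_single and delete_single. The endpoint shape follows Vespa's
# /document/v1 API; "danswer_index" stands in for DOCUMENT_INDEX_NAME.
import httpx


def delete_all_chunks(base_url: str, index_name: str, doc_id: str) -> int:
    """Delete every chunk whose document_id matches, page by page, and
    return the number of chunks Vespa reports as removed."""
    total_chunks = 0
    params = httpx.QueryParams(
        {
            "selection": f"{index_name}.document_id=='{doc_id}'",
            "cluster": "danswer_index",  # assumption: matches DOCUMENT_INDEX_NAME
        }
    )
    with httpx.Client(http2=True) as client:
        while True:
            resp = client.delete(
                f"{base_url}/document/v1/default/{index_name}/docid",
                params=params,
            )
            resp.raise_for_status()
            data = resp.json()
            total_chunks += data.get("documentCount", 0)
            continuation = data.get("continuation")
            if not continuation:
                break  # Vespa has visited every matching chunk
            params = params.set("continuation", continuation)
    return total_chunks
```

The PUT-based update path is the same loop, except that it sends the partial-update JSON body instead of issuing a delete.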
+ index_names = [self.index_name] + if self.secondary_index_name: + index_names.append(self.secondary_index_name) + + with get_vespa_http_client() as http_client: + for index_name in index_names: + params = httpx.QueryParams( + { + "selection": f"{index_name}.document_id=='{doc_id}'", + "cluster": DOCUMENT_INDEX_NAME, + } + ) + + while True: + try: + resp = http_client.delete( + f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}", + params=params, + ) + resp.raise_for_status() + except httpx.HTTPStatusError as e: + logger.error( + f"Failed to delete chunk, details: {e.response.text}" + ) + raise + + resp_data = resp.json() + + if "documentCount" in resp_data: + chunks_deleted = resp_data["documentCount"] + total_chunks_deleted += chunks_deleted + + # Check for continuation token to handle pagination + if "continuation" not in resp_data: + break # Exit loop if no continuation token + + if not resp_data["continuation"]: + break # Exit loop if continuation token is empty + + params = params.set("continuation", resp_data["continuation"]) + + logger.debug( + f"VespaIndex.delete_single: " + f"index={index_name} " + f"doc={doc_id} " + f"chunks_deleted={total_chunks_deleted}" + ) + + return total_chunks_deleted def id_based_retrieval( self, @@ -573,3 +754,158 @@ def admin_retrieval( } return query_vespa(params) + + @classmethod + def delete_entries_by_tenant_id(cls, tenant_id: str, index_name: str) -> None: + """ + Deletes all entries in the specified index with the given tenant_id. + + Parameters: + tenant_id (str): The tenant ID whose documents are to be deleted. + index_name (str): The name of the index from which to delete documents. + """ + logger.info( + f"Deleting entries with tenant_id: {tenant_id} from index: {index_name}" + ) + + # Step 1: Retrieve all document IDs with the given tenant_id + document_ids = cls._get_all_document_ids_by_tenant_id(tenant_id, index_name) + + if not document_ids: + logger.info( + f"No documents found with tenant_id: {tenant_id} in index: {index_name}" + ) + return + + # Step 2: Delete documents in batches + delete_requests = [ + _VespaDeleteRequest(document_id=doc_id, index_name=index_name) + for doc_id in document_ids + ] + + cls._apply_deletes_batched(delete_requests) + + @classmethod + def _get_all_document_ids_by_tenant_id( + cls, tenant_id: str, index_name: str + ) -> List[str]: + """ + Retrieves all document IDs with the specified tenant_id, handling pagination. + + Parameters: + tenant_id (str): The tenant ID to search for. + index_name (str): The name of the index to search in. + + Returns: + List[str]: A list of document IDs matching the tenant_id. 
+ """ + offset = 0 + limit = 1000 # Vespa's maximum hits per query + document_ids = [] + + logger.debug( + f"Starting document ID retrieval for tenant_id: {tenant_id} in index: {index_name}" + ) + + while True: + # Construct the query to fetch document IDs + query_params = { + "yql": f'select id from sources * where tenant_id contains "{tenant_id}";', + "offset": str(offset), + "hits": str(limit), + "timeout": "10s", + "format": "json", + "summary": "id", + } + + url = f"{VESPA_APPLICATION_ENDPOINT}/search/" + + logger.debug( + f"Querying for document IDs with tenant_id: {tenant_id}, offset: {offset}" + ) + + with get_vespa_http_client(no_timeout=True) as http_client: + response = http_client.get(url, params=query_params) + response.raise_for_status() + + search_result = response.json() + hits = search_result.get("root", {}).get("children", []) + + if not hits: + break + + for hit in hits: + doc_id = hit.get("id") + if doc_id: + document_ids.append(doc_id) + + offset += limit # Move to the next page + + logger.debug( + f"Retrieved {len(document_ids)} document IDs for tenant_id: {tenant_id}" + ) + return document_ids + + @classmethod + def _apply_deletes_batched( + cls, + delete_requests: List["_VespaDeleteRequest"], + batch_size: int = BATCH_SIZE, + ) -> None: + """ + Deletes documents in batches using multiple threads. + + Parameters: + delete_requests (List[_VespaDeleteRequest]): The list of delete requests. + batch_size (int): The number of documents to delete in each batch. + """ + + def _delete_document( + delete_request: "_VespaDeleteRequest", http_client: httpx.Client + ) -> None: + logger.debug(f"Deleting document with ID {delete_request.document_id}") + response = http_client.delete( + delete_request.url, + headers={"Content-Type": "application/json"}, + ) + response.raise_for_status() + + logger.debug(f"Starting batch deletion for {len(delete_requests)} documents") + + with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: + with get_vespa_http_client(no_timeout=True) as http_client: + for batch_start in range(0, len(delete_requests), batch_size): + batch = delete_requests[batch_start : batch_start + batch_size] + + future_to_document_id = { + executor.submit( + _delete_document, + delete_request, + http_client, + ): delete_request.document_id + for delete_request in batch + } + + for future in concurrent.futures.as_completed( + future_to_document_id + ): + doc_id = future_to_document_id[future] + try: + future.result() + logger.debug(f"Successfully deleted document: {doc_id}") + except httpx.HTTPError as e: + logger.error(f"Failed to delete document {doc_id}: {e}") + # Optionally, implement retry logic or error handling here + + logger.info("Batch deletion completed") + + +class _VespaDeleteRequest: + def __init__(self, document_id: str, index_name: str) -> None: + self.document_id = document_id + # Encode the document ID to ensure it's safe for use in the URL + encoded_doc_id = urllib.parse.quote_plus(self.document_id) + self.url = ( + f"{VESPA_APPLICATION_ENDPOINT}/document/v1/" + f"{index_name}/{index_name}/docid/{encoded_doc_id}" + ) diff --git a/backend/danswer/document_index/vespa/indexing_utils.py b/backend/danswer/document_index/vespa/indexing_utils.py index 6b6ba8709d5..e6913cd9976 100644 --- a/backend/danswer/document_index/vespa/indexing_utils.py +++ b/backend/danswer/document_index/vespa/indexing_utils.py @@ -2,6 +2,7 @@ import json from datetime import datetime from datetime import timezone +from http import HTTPStatus import httpx from 
retry import retry @@ -37,6 +38,7 @@ from danswer.document_index.vespa_constants import SKIP_TITLE_EMBEDDING from danswer.document_index.vespa_constants import SOURCE_LINKS from danswer.document_index.vespa_constants import SOURCE_TYPE +from danswer.document_index.vespa_constants import TENANT_ID from danswer.document_index.vespa_constants import TITLE from danswer.document_index.vespa_constants import TITLE_EMBEDDING from danswer.indexing.models import DocMetadataAwareIndexChunk @@ -56,7 +58,6 @@ def _does_document_exist( chunk. This checks for whether the chunk exists already in the index""" doc_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_chunk_id}" doc_fetch_response = http_client.get(doc_url) - if doc_fetch_response.status_code == 404: return False @@ -65,6 +66,8 @@ def _does_document_exist( raise RuntimeError( f"Unexpected fetch document by ID value from Vespa " f"with error {doc_fetch_response.status_code}" + f"Index name: {index_name}" + f"Doc chunk id: {doc_chunk_id}" ) return True @@ -115,9 +118,12 @@ def get_existing_documents_from_chunks( return document_ids -@retry(tries=3, delay=1, backoff=2) +@retry(tries=5, delay=1, backoff=2) def _index_vespa_chunk( - chunk: DocMetadataAwareIndexChunk, index_name: str, http_client: httpx.Client + chunk: DocMetadataAwareIndexChunk, + index_name: str, + http_client: httpx.Client, + multitenant: bool, ) -> None: json_header = { "Content-Type": "application/json", @@ -174,6 +180,10 @@ def _index_vespa_chunk( BOOST: chunk.boost, } + if multitenant: + if chunk.tenant_id: + vespa_document_fields[TENANT_ID] = chunk.tenant_id + vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}" logger.debug(f'Indexing to URL "{vespa_url}"') res = http_client.post( @@ -185,6 +195,14 @@ def _index_vespa_chunk( logger.exception( f"Failed to index document: '{document.id}'. Got response: '{res.text}'" ) + if isinstance(e, httpx.HTTPStatusError): + if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE: + logger.error( + "NOTE: HTTP Status 507 Insufficient Storage usually means " + "you need to allocate more memory or disk space to the " + "Vespa/index container." 
+ ) + raise e @@ -192,6 +210,7 @@ def batch_index_vespa_chunks( chunks: list[DocMetadataAwareIndexChunk], index_name: str, http_client: httpx.Client, + multitenant: bool, executor: concurrent.futures.ThreadPoolExecutor | None = None, ) -> None: external_executor = True @@ -202,7 +221,9 @@ def batch_index_vespa_chunks( try: chunk_index_future = { - executor.submit(_index_vespa_chunk, chunk, index_name, http_client): chunk + executor.submit( + _index_vespa_chunk, chunk, index_name, http_client, multitenant + ): chunk for chunk in chunks } for future in concurrent.futures.as_completed(chunk_index_future): diff --git a/backend/danswer/document_index/vespa/shared_utils/utils.py b/backend/danswer/document_index/vespa/shared_utils/utils.py index c74afc9a629..49fdd680198 100644 --- a/backend/danswer/document_index/vespa/shared_utils/utils.py +++ b/backend/danswer/document_index/vespa/shared_utils/utils.py @@ -1,4 +1,12 @@ import re +from typing import cast + +import httpx + +from danswer.configs.app_configs import MANAGED_VESPA +from danswer.configs.app_configs import VESPA_CLOUD_CERT_PATH +from danswer.configs.app_configs import VESPA_CLOUD_KEY_PATH +from danswer.configs.app_configs import VESPA_REQUEST_TIMEOUT # NOTE: This does not seem to be used in reality despite the Vespa Docs pointing to this code # See here for reference: https://docs.vespa.ai/en/documents.html @@ -45,3 +53,19 @@ def remove_invalid_unicode_chars(text: str) -> str: "[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]" ) return _illegal_xml_chars_RE.sub("", text) + + +def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client: + """ + Configure and return an HTTP client for communicating with Vespa, + including authentication if needed. + """ + + return httpx.Client( + cert=cast(tuple[str, str], (VESPA_CLOUD_CERT_PATH, VESPA_CLOUD_KEY_PATH)) + if MANAGED_VESPA + else None, + verify=False if not MANAGED_VESPA else True, + timeout=None if no_timeout else VESPA_REQUEST_TIMEOUT, + http2=True, + ) diff --git a/backend/danswer/document_index/vespa/shared_utils/vespa_request_builders.py b/backend/danswer/document_index/vespa/shared_utils/vespa_request_builders.py index 65752aa09c1..a3261288dda 100644 --- a/backend/danswer/document_index/vespa/shared_utils/vespa_request_builders.py +++ b/backend/danswer/document_index/vespa/shared_utils/vespa_request_builders.py @@ -3,6 +3,7 @@ from datetime import timezone from danswer.configs.constants import INDEX_SEPARATOR +from danswer.context.search.models import IndexFilters from danswer.document_index.interfaces import VespaChunkRequest from danswer.document_index.vespa_constants import ACCESS_CONTROL_LIST from danswer.document_index.vespa_constants import CHUNK_ID @@ -12,7 +13,7 @@ from danswer.document_index.vespa_constants import HIDDEN from danswer.document_index.vespa_constants import METADATA_LIST from danswer.document_index.vespa_constants import SOURCE_TYPE -from danswer.search.models import IndexFilters +from danswer.document_index.vespa_constants import TENANT_ID from danswer.utils.logger import setup_logger logger = setup_logger() @@ -53,6 +54,9 @@ def _build_time_filter( filter_str = f"!({HIDDEN}=true) and " if not include_hidden else "" + if filters.tenant_id: + filter_str += f'({TENANT_ID} contains "{filters.tenant_id}") and ' + # CAREFUL touching this one, currently there is no second ACL double-check post retrieval if filters.access_control_list is not None: filter_str += _build_or_filters( diff --git a/backend/danswer/document_index/vespa_constants.py 
b/backend/danswer/document_index/vespa_constants.py index 8409efe1dea..30039922f1a 100644 --- a/backend/danswer/document_index/vespa_constants.py +++ b/backend/danswer/document_index/vespa_constants.py @@ -1,3 +1,4 @@ +from danswer.configs.app_configs import VESPA_CLOUD_URL from danswer.configs.app_configs import VESPA_CONFIG_SERVER_HOST from danswer.configs.app_configs import VESPA_HOST from danswer.configs.app_configs import VESPA_PORT @@ -9,17 +10,31 @@ DOCUMENT_REPLACEMENT_PAT = "DOCUMENT_REPLACEMENT" SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER" DATE_REPLACEMENT = "DATE_REPLACEMENT" +SEARCH_THREAD_NUMBER_PAT = "SEARCH_THREAD_NUMBER" +TENANT_ID_PAT = "TENANT_ID_REPLACEMENT" +TENANT_ID_REPLACEMENT = """field tenant_id type string { + indexing: summary | attribute + rank: filter + attribute: fast-search + }""" # config server -VESPA_CONFIG_SERVER_URL = f"http://{VESPA_CONFIG_SERVER_HOST}:{VESPA_TENANT_PORT}" + + +VESPA_CONFIG_SERVER_URL = ( + VESPA_CLOUD_URL or f"http://{VESPA_CONFIG_SERVER_HOST}:{VESPA_TENANT_PORT}" +) VESPA_APPLICATION_ENDPOINT = f"{VESPA_CONFIG_SERVER_URL}/application/v2" # main search application -VESPA_APP_CONTAINER_URL = f"http://{VESPA_HOST}:{VESPA_PORT}" +VESPA_APP_CONTAINER_URL = VESPA_CLOUD_URL or f"http://{VESPA_HOST}:{VESPA_PORT}" + + # danswer_chunk below is defined in vespa/app_configs/schemas/danswer_chunk.sd DOCUMENT_ID_ENDPOINT = ( f"{VESPA_APP_CONTAINER_URL}/document/v1/default/{{index_name}}/docid" ) + SEARCH_ENDPOINT = f"{VESPA_APP_CONTAINER_URL}/search/" NUM_THREADS = ( @@ -35,7 +50,7 @@ VESPA_TIMEOUT = "3s" BATCH_SIZE = 128 # Specific to Vespa - +TENANT_ID = "tenant_id" DOCUMENT_ID = "document_id" CHUNK_ID = "chunk_id" BLURB = "blurb" diff --git a/backend/danswer/dynamic_configs/factory.py b/backend/danswer/dynamic_configs/factory.py deleted file mode 100644 index 44b6e096b6d..00000000000 --- a/backend/danswer/dynamic_configs/factory.py +++ /dev/null @@ -1,15 +0,0 @@ -from danswer.configs.app_configs import DYNAMIC_CONFIG_STORE -from danswer.dynamic_configs.interface import DynamicConfigStore -from danswer.dynamic_configs.store import FileSystemBackedDynamicConfigStore -from danswer.dynamic_configs.store import PostgresBackedDynamicConfigStore - - -def get_dynamic_config_store() -> DynamicConfigStore: - dynamic_config_store_type = DYNAMIC_CONFIG_STORE - if dynamic_config_store_type == FileSystemBackedDynamicConfigStore.__name__: - raise NotImplementedError("File based config store no longer supported") - if dynamic_config_store_type == PostgresBackedDynamicConfigStore.__name__: - return PostgresBackedDynamicConfigStore() - - # TODO: change exception type - raise Exception("Unknown dynamic config store type") diff --git a/backend/danswer/dynamic_configs/store.py b/backend/danswer/dynamic_configs/store.py deleted file mode 100644 index cc53da938ad..00000000000 --- a/backend/danswer/dynamic_configs/store.py +++ /dev/null @@ -1,102 +0,0 @@ -import json -import os -from collections.abc import Iterator -from contextlib import contextmanager -from pathlib import Path -from typing import cast - -from filelock import FileLock -from sqlalchemy.orm import Session - -from danswer.db.engine import get_session_factory -from danswer.db.models import KVStore -from danswer.dynamic_configs.interface import ConfigNotFoundError -from danswer.dynamic_configs.interface import DynamicConfigStore -from danswer.dynamic_configs.interface import JSON_ro - - -FILE_LOCK_TIMEOUT = 10 - - -def _get_file_lock(file_name: Path) -> FileLock: - return 
FileLock(file_name.with_suffix(".lock")) - - -class FileSystemBackedDynamicConfigStore(DynamicConfigStore): - def __init__(self, dir_path: str) -> None: - # TODO (chris): maybe require all possible keys to be passed in - # at app start somehow to prevent key overlaps - self.dir_path = Path(dir_path) - - def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None: - file_path = self.dir_path / key - lock = _get_file_lock(file_path) - with lock.acquire(timeout=FILE_LOCK_TIMEOUT): - with open(file_path, "w+") as f: - json.dump(val, f) - - def load(self, key: str) -> JSON_ro: - file_path = self.dir_path / key - if not file_path.exists(): - raise ConfigNotFoundError - lock = _get_file_lock(file_path) - with lock.acquire(timeout=FILE_LOCK_TIMEOUT): - with open(self.dir_path / key) as f: - return cast(JSON_ro, json.load(f)) - - def delete(self, key: str) -> None: - file_path = self.dir_path / key - if not file_path.exists(): - raise ConfigNotFoundError - lock = _get_file_lock(file_path) - with lock.acquire(timeout=FILE_LOCK_TIMEOUT): - os.remove(file_path) - - -class PostgresBackedDynamicConfigStore(DynamicConfigStore): - @contextmanager - def get_session(self) -> Iterator[Session]: - factory = get_session_factory() - session: Session = factory() - try: - yield session - finally: - session.close() - - def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None: - # The actual encryption/decryption is done in Postgres, we just need to choose - # which field to set - encrypted_val = val if encrypt else None - plain_val = val if not encrypt else None - with self.get_session() as session: - obj = session.query(KVStore).filter_by(key=key).first() - if obj: - obj.value = plain_val - obj.encrypted_value = encrypted_val - else: - obj = KVStore( - key=key, value=plain_val, encrypted_value=encrypted_val - ) # type: ignore - session.query(KVStore).filter_by(key=key).delete() # just in case - session.add(obj) - session.commit() - - def load(self, key: str) -> JSON_ro: - with self.get_session() as session: - obj = session.query(KVStore).filter_by(key=key).first() - if not obj: - raise ConfigNotFoundError - - if obj.value is not None: - return cast(JSON_ro, obj.value) - if obj.encrypted_value is not None: - return cast(JSON_ro, obj.encrypted_value) - - return None - - def delete(self, key: str) -> None: - with self.get_session() as session: - result = session.query(KVStore).filter_by(key=key).delete() # type: ignore - if result == 0: - raise ConfigNotFoundError - session.commit() diff --git a/backend/danswer/file_processing/extract_file_text.py b/backend/danswer/file_processing/extract_file_text.py index 36df08ac465..9effad5b4e0 100644 --- a/backend/danswer/file_processing/extract_file_text.py +++ b/backend/danswer/file_processing/extract_file_text.py @@ -20,6 +20,8 @@ from danswer.configs.constants import DANSWER_METADATA_FILENAME from danswer.file_processing.html_utils import parse_html_page_basic +from danswer.file_processing.unstructured import get_unstructured_api_key +from danswer.file_processing.unstructured import unstructured_to_text from danswer.utils.logger import setup_logger logger = setup_logger() @@ -206,8 +208,9 @@ def read_pdf_file( # By user request, keep files that are unreadable just so they # can be discoverable by title. 
return "", metadata - else: - logger.warning("No Password available to to decrypt pdf") + elif pdf_reader.is_encrypted: + logger.warning("No Password available to decrypt pdf, returning empty") + return "", metadata # Extract metadata from the PDF, removing leading '/' from keys if present # This standardizes the metadata keys for consistency @@ -331,9 +334,10 @@ def file_io_to_text(file: IO[Any]) -> str: def extract_file_text( - file_name: str | None, file: IO[Any], + file_name: str, break_on_unprocessable: bool = True, + extension: str | None = None, ) -> str: extension_to_function: dict[str, Callable[[IO[Any]], str]] = { ".pdf": pdf_to_text, @@ -345,22 +349,29 @@ def extract_file_text( ".html": parse_html_page_basic, } - def _process_file() -> str: - if file_name: - extension = get_file_ext(file_name) - if check_file_ext_is_valid(extension): - return extension_to_function.get(extension, file_io_to_text)(file) + try: + if get_unstructured_api_key(): + return unstructured_to_text(file, file_name) + + if file_name or extension: + if extension is not None: + final_extension = extension + elif file_name is not None: + final_extension = get_file_ext(file_name) - # Either the file somehow has no name or the extension is not one that we are familiar with + if check_file_ext_is_valid(final_extension): + return extension_to_function.get(final_extension, file_io_to_text)(file) + + # Either the file somehow has no name or the extension is not one that we recognize if is_text_file(file): return file_io_to_text(file) raise ValueError("Unknown file extension and unknown text encoding") - try: - return _process_file() except Exception as e: if break_on_unprocessable: - raise RuntimeError(f"Failed to process file: {str(e)}") from e - logger.warning(f"Failed to process file: {str(e)}") + raise RuntimeError( + f"Failed to process file {file_name or 'Unknown'}: {str(e)}" + ) from e + logger.warning(f"Failed to process file {file_name or 'Unknown'}: {str(e)}") return "" diff --git a/backend/danswer/file_processing/html_utils.py b/backend/danswer/file_processing/html_utils.py index 48782981f89..d1948d011f5 100644 --- a/backend/danswer/file_processing/html_utils.py +++ b/backend/danswer/file_processing/html_utils.py @@ -4,11 +4,17 @@ from typing import IO import bs4 +import trafilatura # type: ignore +from trafilatura.settings import use_config # type: ignore from danswer.configs.app_configs import HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY +from danswer.configs.app_configs import PARSE_WITH_TRAFILATURA from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_CLASSES from danswer.configs.app_configs import WEB_CONNECTOR_IGNORED_ELEMENTS from danswer.file_processing.enums import HtmlBasedConnectorTransformLinksStrategy +from danswer.utils.logger import setup_logger + +logger = setup_logger() MINTLIFY_UNWANTED = ["sticky", "hidden"] @@ -47,6 +53,18 @@ def format_element_text(element_text: str, link_href: str | None) -> str: return f"[{element_text_no_newlines}]({link_href})" +def parse_html_with_trafilatura(html_content: str) -> str: + """Parse HTML content using trafilatura.""" + config = use_config() + config.set("DEFAULT", "include_links", "True") + config.set("DEFAULT", "include_tables", "True") + config.set("DEFAULT", "include_images", "True") + config.set("DEFAULT", "include_formatting", "True") + + extracted_text = trafilatura.extract(html_content, config=config) + return strip_excessive_newlines_and_spaces(extracted_text) if extracted_text else "" + + def format_document_soup( document: 
bs4.BeautifulSoup, table_cell_separator: str = "\t" ) -> str: @@ -183,7 +201,21 @@ def web_html_cleanup( for undesired_tag in additional_element_types_to_discard: [tag.extract() for tag in soup.find_all(undesired_tag)] + soup_string = str(soup) + page_text = "" + + if PARSE_WITH_TRAFILATURA: + try: + page_text = parse_html_with_trafilatura(soup_string) + if not page_text: + raise ValueError("Empty content returned by trafilatura.") + except Exception as e: + logger.info(f"Trafilatura parsing failed: {e}. Falling back on bs4.") + page_text = format_document_soup(soup) + else: + page_text = format_document_soup(soup) + # 200B is ZeroWidthSpace which we don't care for - page_text = format_document_soup(soup).replace("\u200B", "") + cleaned_text = page_text.replace("\u200B", "") - return ParsedHTML(title=title, cleaned_text=page_text) + return ParsedHTML(title=title, cleaned_text=cleaned_text) diff --git a/backend/danswer/file_processing/unstructured.py b/backend/danswer/file_processing/unstructured.py new file mode 100644 index 00000000000..dc61869ee9c --- /dev/null +++ b/backend/danswer/file_processing/unstructured.py @@ -0,0 +1,67 @@ +from typing import Any +from typing import cast +from typing import IO + +from unstructured.staging.base import dict_to_elements +from unstructured_client import UnstructuredClient # type: ignore +from unstructured_client.models import operations # type: ignore +from unstructured_client.models import shared + +from danswer.configs.constants import KV_UNSTRUCTURED_API_KEY +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.utils.logger import setup_logger + + +logger = setup_logger() + + +def get_unstructured_api_key() -> str | None: + kv_store = get_kv_store() + try: + return cast(str, kv_store.load(KV_UNSTRUCTURED_API_KEY)) + except KvKeyNotFoundError: + return None + + +def update_unstructured_api_key(api_key: str) -> None: + kv_store = get_kv_store() + kv_store.store(KV_UNSTRUCTURED_API_KEY, api_key) + + +def delete_unstructured_api_key() -> None: + kv_store = get_kv_store() + kv_store.delete(KV_UNSTRUCTURED_API_KEY) + + +def _sdk_partition_request( + file: IO[Any], file_name: str, **kwargs: Any +) -> operations.PartitionRequest: + try: + request = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=shared.Files(content=file.read(), file_name=file_name), + **kwargs, + ), + ) + return request + except Exception as e: + logger.error(f"Error creating partition request for file {file_name}: {str(e)}") + raise + + +def unstructured_to_text(file: IO[Any], file_name: str) -> str: + logger.debug(f"Starting to read file: {file_name}") + req = _sdk_partition_request(file, file_name, strategy="auto") + + unstructured_client = UnstructuredClient(api_key_auth=get_unstructured_api_key()) + + response = unstructured_client.general.partition(req) # type: ignore + elements = dict_to_elements(response.elements) + + if response.status_code != 200: + err = f"Received unexpected status code {response.status_code} from Unstructured API." 
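# --- Editor's aside: illustrative usage sketch only, not part of this patch ---
# How the new unstructured_to_text helper above might be called, assuming an Unstructured
# API key was already saved via update_unstructured_api_key(); "report.pdf" is a
# hypothetical local file used purely for illustration.
from danswer.file_processing.unstructured import unstructured_to_text

with open("report.pdf", "rb") as pdf_file:
    extracted_text = unstructured_to_text(pdf_file, "report.pdf")
    print(extracted_text[:200])
# --- end aside ---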
+ logger.error(err) + raise ValueError(err) + + return "\n\n".join(str(el) for el in elements) diff --git a/backend/danswer/file_store/models.py b/backend/danswer/file_store/models.py index d944a2fd270..5bf964287e3 100644 --- a/backend/danswer/file_store/models.py +++ b/backend/danswer/file_store/models.py @@ -13,6 +13,7 @@ class ChatFileType(str, Enum): DOC = "document" # Plain text only contain the text PLAIN_TEXT = "plain_text" + CSV = "csv" class FileDescriptor(TypedDict): diff --git a/backend/danswer/file_store/utils.py b/backend/danswer/file_store/utils.py index b71d20bbbb4..e9eea2c262d 100644 --- a/backend/danswer/file_store/utils.py +++ b/backend/danswer/file_store/utils.py @@ -8,12 +8,13 @@ from sqlalchemy.orm import Session from danswer.configs.constants import FileOrigin -from danswer.db.engine import get_session_context_manager +from danswer.db.engine import get_session_with_tenant from danswer.db.models import ChatMessage from danswer.file_store.file_store import get_default_file_store from danswer.file_store.models import FileDescriptor from danswer.file_store.models import InMemoryChatFile from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR def load_chat_file( @@ -52,11 +53,11 @@ def load_all_chat_files( return files -def save_file_from_url(url: str) -> str: +def save_file_from_url(url: str, tenant_id: str) -> str: """NOTE: using multiple sessions here, since this is often called using multithreading. In practice, sharing a session has resulted in weird errors.""" - with get_session_context_manager() as db_session: + with get_session_with_tenant(tenant_id) as db_session: response = requests.get(url) response.raise_for_status() @@ -75,7 +76,10 @@ def save_file_from_url(url: str) -> str: def save_files_from_urls(urls: list[str]) -> list[str]: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + funcs: list[tuple[Callable[..., Any], tuple[Any, ...]]] = [ - (save_file_from_url, (url,)) for url in urls + (save_file_from_url, (url, tenant_id)) for url in urls ] + # Must pass in tenant_id here, since this is called by multithreading return run_functions_tuples_in_parallel(funcs) diff --git a/backend/danswer/indexing/chunker.py b/backend/danswer/indexing/chunker.py index 03a03f30f49..287d3ba2d5e 100644 --- a/backend/danswer/indexing/chunker.py +++ b/backend/danswer/indexing/chunker.py @@ -10,11 +10,13 @@ get_metadata_keys_to_ignore, ) from danswer.connectors.models import Document +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface from danswer.indexing.models import DocAwareChunk from danswer.natural_language_processing.utils import BaseTokenizer from danswer.utils.logger import setup_logger +from danswer.utils.text_processing import clean_text from danswer.utils.text_processing import shared_precompare_cleanup - +from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT # Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps # actually help quality at all @@ -26,6 +28,7 @@ MAX_METADATA_PERCENTAGE = 0.25 CHUNK_MIN_CONTENT = 256 + logger = setup_logger() @@ -123,6 +126,7 @@ def __init__( chunk_token_limit: int = DOC_EMBEDDING_CONTEXT_SIZE, chunk_overlap: int = CHUNK_OVERLAP, mini_chunk_size: int = MINI_CHUNK_SIZE, + callback: IndexingHeartbeatInterface | None = None, ) -> None: from llama_index.text_splitter import SentenceSplitter @@ -131,6 +135,7 @@ def __init__( self.enable_multipass = enable_multipass 
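# --- Editor's aside: illustrative sketch only, not part of this patch ---
# Why save_files_from_urls() above reads CURRENT_TENANT_ID_CONTEXTVAR once and passes the
# value into each parallel call: context variables set in the calling thread are not
# automatically visible inside thread-pool workers, so the tenant id must be captured up
# front and handed to every worker explicitly. The names below are hypothetical.
import contextvars
from concurrent.futures import ThreadPoolExecutor

TENANT_ID: contextvars.ContextVar[str] = contextvars.ContextVar("tenant_id", default="public")


def do_tenant_scoped_work(tenant_id: str, url: str) -> str:
    # stand-in for work that needs the tenant id (e.g. opening a tenant-scoped DB session)
    return f"{tenant_id}:{url}"


def fan_out(urls: list[str]) -> list[str]:
    tenant_id = TENANT_ID.get()  # capture in the calling thread, before the pool is involved
    with ThreadPoolExecutor() as pool:
        return list(pool.map(lambda url: do_tenant_scoped_work(tenant_id, url), urls))
# --- end aside ---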
self.enable_large_chunks = enable_large_chunks self.tokenizer = tokenizer + self.callback = callback self.blurb_splitter = SentenceSplitter( tokenizer=tokenizer.tokenize, @@ -154,6 +159,24 @@ def __init__( else None ) + def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]: + """ + Splits the text into smaller chunks based on token count to ensure + no chunk exceeds the content_token_limit. + """ + tokens = self.tokenizer.tokenize(text) + chunks = [] + start = 0 + total_tokens = len(tokens) + while start < total_tokens: + end = min(start + content_token_limit, total_tokens) + token_chunk = tokens[start:end] + # Join the tokens to reconstruct the text + chunk_text = " ".join(token_chunk) + chunks.append(chunk_text) + start = end + return chunks + def _extract_blurb(self, text: str) -> str: texts = self.blurb_splitter.split_text(text) if not texts: @@ -198,9 +221,20 @@ def _create_chunk( mini_chunk_texts=self._get_mini_chunk_texts(text), ) - for section in document.sections: - section_text = section.text + for section_idx, section in enumerate(document.sections): + section_text = clean_text(section.text) section_link_text = section.link or "" + # If there is no useful content, not even the title, just drop it + if not section_text and (not document.title or section_idx > 0): + # If a section is empty and the document has no title, we can just drop it. We return a list of + # DocAwareChunks where each one contains the necessary information needed down the line for indexing. + # There is no concern about dropping whole documents from this list, it should not cause any indexing failures. + logger.warning( + f"Skipping section {section.text} from document " + f"{document.semantic_identifier} due to empty text after cleaning " + f" with link {section_link_text}" + ) + continue section_token_count = len(self.tokenizer.tokenize(section_text)) @@ -214,14 +248,37 @@ def _create_chunk( chunk_text = "" split_texts = self.chunk_splitter.split_text(section_text) + for i, split_text in enumerate(split_texts): - chunks.append( - _create_chunk( - text=split_text, - links={0: section_link_text}, - is_continuation=(i != 0), + if ( + STRICT_CHUNK_TOKEN_LIMIT + and + # Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true + len(self.tokenizer.tokenize(split_text)) > content_token_limit + ): + # If STRICT_CHUNK_TOKEN_LIMIT is true, manually check + # the token count of each split text to ensure it is + # not larger than the content_token_limit + smaller_chunks = self._split_oversized_chunk( + split_text, content_token_limit ) - ) + for i, small_chunk in enumerate(smaller_chunks): + chunks.append( + _create_chunk( + text=small_chunk, + links={0: section_link_text}, + is_continuation=(i != 0), + ) + ) + else: + chunks.append( + _create_chunk( + text=split_text, + links={0: section_link_text}, + is_continuation=(i != 0), + ) + ) + continue current_token_count = len(self.tokenizer.tokenize(chunk_text)) @@ -255,7 +312,7 @@ def _create_chunk( # If the chunk does not have any useable content, it will not be indexed return chunks - def chunk(self, document: Document) -> list[DocAwareChunk]: + def _handle_single_document(self, document: Document) -> list[DocAwareChunk]: # Specifically for reproducing an issue with gmail if document.source == DocumentSource.GMAIL: logger.debug(f"Chunking {document.semantic_identifier}") @@ -302,3 +359,22 @@ def chunk(self, document: Document) -> list[DocAwareChunk]: normal_chunks.extend(large_chunks) return normal_chunks + + def chunk(self, documents: 
list[Document]) -> list[DocAwareChunk]: + """ + Takes in a list of documents and chunks them into smaller chunks for indexing + while persisting the document metadata. + """ + final_chunks: list[DocAwareChunk] = [] + for document in documents: + if self.callback: + if self.callback.should_stop(): + raise RuntimeError("Chunker.chunk: Stop signal detected") + + chunks = self._handle_single_document(document) + final_chunks.extend(chunks) + + if self.callback: + self.callback.progress("Chunker.chunk", len(chunks)) + + return final_chunks diff --git a/backend/danswer/indexing/embedder.py b/backend/danswer/indexing/embedder.py index d25a0659c62..2e975324186 100644 --- a/backend/danswer/indexing/embedder.py +++ b/backend/danswer/indexing/embedder.py @@ -1,12 +1,8 @@ from abc import ABC from abc import abstractmethod -from sqlalchemy.orm import Session - -from danswer.db.models import IndexModelStatus from danswer.db.models import SearchSettings -from danswer.db.search_settings import get_current_search_settings -from danswer.db.search_settings import get_secondary_search_settings +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface from danswer.indexing.models import ChunkEmbedding from danswer.indexing.models import DocAwareChunk from danswer.indexing.models import IndexChunk @@ -24,6 +20,9 @@ class IndexingEmbedder(ABC): + """Converts chunks into chunks with embeddings. Note that one chunk may have + multiple embeddings associated with it.""" + def __init__( self, model_name: str, @@ -33,6 +32,9 @@ def __init__( provider_type: EmbeddingProvider | None, api_key: str | None, api_url: str | None, + api_version: str | None, + deployment_name: str | None, + callback: IndexingHeartbeatInterface | None, ): self.model_name = model_name self.normalize = normalize @@ -41,6 +43,8 @@ def __init__( self.provider_type = provider_type self.api_key = api_key self.api_url = api_url + self.api_version = api_version + self.deployment_name = deployment_name self.embedding_model = EmbeddingModel( model_name=model_name, @@ -50,10 +54,13 @@ def __init__( api_key=api_key, provider_type=provider_type, api_url=api_url, + api_version=api_version, + deployment_name=deployment_name, # The below are globally set, this flow always uses the indexing one server_host=INDEXING_MODEL_SERVER_HOST, server_port=INDEXING_MODEL_SERVER_PORT, retrim_content=True, + callback=callback, ) @abstractmethod @@ -74,6 +81,9 @@ def __init__( provider_type: EmbeddingProvider | None = None, api_key: str | None = None, api_url: str | None = None, + api_version: str | None = None, + deployment_name: str | None = None, + callback: IndexingHeartbeatInterface | None = None, ): super().__init__( model_name, @@ -83,6 +93,9 @@ def __init__( provider_type, api_key, api_url, + api_version, + deployment_name, + callback, ) @log_function_time() @@ -90,6 +103,9 @@ def embed_chunks( self, chunks: list[DocAwareChunk], ) -> list[IndexChunk]: + """Adds embeddings to the chunks, the title and metadata suffixes are added to the chunk as well + if they exist. If there is no space for it, it would have been thrown out at the chunking step. 
+ """ # All chunks at this point must have some non-empty content flat_chunk_texts: list[str] = [] large_chunks_present = False @@ -108,6 +124,11 @@ def embed_chunks( flat_chunk_texts.append(chunk_text) if chunk.mini_chunk_texts: + if chunk.large_chunk_reference_ids: + # A large chunk does not contain mini chunks, if it matches the large chunk + # with a high score, then mini chunks would not be used anyway + # otherwise it should match the normal chunk + raise RuntimeError("Large chunk contains mini chunks") flat_chunk_texts.extend(chunk.mini_chunk_texts) embeddings = self.embedding_model.encode( @@ -166,7 +187,7 @@ def embed_chunks( title_embed_dict[title] = title_embedding new_embedded_chunk = IndexChunk( - **chunk.dict(), + **chunk.model_dump(), embeddings=ChunkEmbedding( full_embedding=chunk_embeddings[0], mini_chunk_embeddings=chunk_embeddings[1:], @@ -180,7 +201,9 @@ def embed_chunks( @classmethod def from_db_search_settings( - cls, search_settings: SearchSettings + cls, + search_settings: SearchSettings, + callback: IndexingHeartbeatInterface | None = None, ) -> "DefaultIndexingEmbedder": return cls( model_name=search_settings.model_name, @@ -190,28 +213,7 @@ def from_db_search_settings( provider_type=search_settings.provider_type, api_key=search_settings.api_key, api_url=search_settings.api_url, + api_version=search_settings.api_version, + deployment_name=search_settings.deployment_name, + callback=callback, ) - - -def get_embedding_model_from_search_settings( - db_session: Session, index_model_status: IndexModelStatus = IndexModelStatus.PRESENT -) -> IndexingEmbedder: - search_settings: SearchSettings | None - if index_model_status == IndexModelStatus.PRESENT: - search_settings = get_current_search_settings(db_session) - elif index_model_status == IndexModelStatus.FUTURE: - search_settings = get_secondary_search_settings(db_session) - if not search_settings: - raise RuntimeError("No secondary index configured") - else: - raise RuntimeError("Not supporting embedding model rollbacks") - - return DefaultIndexingEmbedder( - model_name=search_settings.model_name, - normalize=search_settings.normalize, - query_prefix=search_settings.query_prefix, - passage_prefix=search_settings.passage_prefix, - provider_type=search_settings.provider_type, - api_key=search_settings.api_key, - api_url=search_settings.api_url, - ) diff --git a/backend/danswer/indexing/indexing_heartbeat.py b/backend/danswer/indexing/indexing_heartbeat.py new file mode 100644 index 00000000000..fe5f83d0b86 --- /dev/null +++ b/backend/danswer/indexing/indexing_heartbeat.py @@ -0,0 +1,15 @@ +from abc import ABC +from abc import abstractmethod + + +class IndexingHeartbeatInterface(ABC): + """Defines a callback interface to be passed to + to run_indexing_entrypoint.""" + + @abstractmethod + def should_stop(self) -> bool: + """Signal to stop the looping function in flight.""" + + @abstractmethod + def progress(self, tag: str, amount: int) -> None: + """Send progress updates to the caller.""" diff --git a/backend/danswer/indexing/indexing_pipeline.py b/backend/danswer/indexing/indexing_pipeline.py index c1e66f1fcfb..3c26ffb9b2b 100644 --- a/backend/danswer/indexing/indexing_pipeline.py +++ b/backend/danswer/indexing/indexing_pipeline.py @@ -1,11 +1,9 @@ import traceback from functools import partial +from http import HTTPStatus from typing import Protocol -from pydantic import BaseModel -from pydantic import ConfigDict -from sqlalchemy.orm import Session - +import httpx from danswer.access.access import 
get_access_for_documents from danswer.access.models import DocumentAccess from danswer.configs.app_configs import ENABLE_MULTIPASS_INDEXING @@ -20,7 +18,8 @@ from danswer.db.document import prepare_to_modify_documents from danswer.db.document import update_docs_last_modified__no_commit from danswer.db.document import update_docs_updated_at__no_commit -from danswer.db.document import upsert_documents_complete +from danswer.db.document import upsert_document_by_connector_credential_pair +from danswer.db.document import upsert_documents from danswer.db.document_set import fetch_document_sets_for_documents from danswer.db.index_attempt import create_index_attempt_error from danswer.db.models import Document as DBDocument @@ -31,11 +30,15 @@ from danswer.document_index.interfaces import DocumentMetadata from danswer.indexing.chunker import Chunker from danswer.indexing.embedder import IndexingEmbedder +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface from danswer.indexing.models import DocAwareChunk from danswer.indexing.models import DocMetadataAwareIndexChunk from danswer.utils.logger import setup_logger from danswer.utils.timing import log_function_time +from pydantic import BaseModel +from pydantic import ConfigDict from shared_configs.enums import EmbeddingProvider +from sqlalchemy.orm import Session logger = setup_logger() @@ -55,13 +58,13 @@ def __call__( ... -def upsert_documents_in_db( +def _upsert_documents_in_db( documents: list[Document], index_attempt_metadata: IndexAttemptMetadata, db_session: Session, ) -> None: # Metadata here refers to basic document info, not metadata about the actual content - doc_m_batch: list[DocumentMetadata] = [] + document_metadata_list: list[DocumentMetadata] = [] for doc in documents: first_link = next( (section.link for section in doc.sections if section.link), "" @@ -76,12 +79,9 @@ def upsert_documents_in_db( secondary_owners=get_experts_stores_representations(doc.secondary_owners), from_ingestion_api=doc.from_ingestion_api, ) - doc_m_batch.append(db_doc_metadata) + document_metadata_list.append(db_doc_metadata) - upsert_documents_complete( - db_session=db_session, - document_metadata_batch=doc_m_batch, - ) + upsert_documents(db_session, document_metadata_list) # Insert document content metadata for doc in documents: @@ -94,21 +94,25 @@ def upsert_documents_in_db( document_id=doc.id, db_session=db_session, ) - else: - create_or_add_document_tag( - tag_key=k, - tag_value=v, - source=doc.source, - document_id=doc.id, - db_session=db_session, - ) + continue + + create_or_add_document_tag( + tag_key=k, + tag_value=v, + source=doc.source, + document_id=doc.id, + db_session=db_session, + ) def get_doc_ids_to_update( documents: list[Document], db_docs: list[DBDocument] ) -> list[Document]: """Figures out which documents actually need to be updated. If a document is already present - and the `updated_at` hasn't changed, we shouldn't need to do anything with it.""" + and the `updated_at` hasn't changed, we shouldn't need to do anything with it. 
+ + NB: Still need to associate the document in the DB if multiple connectors are + indexing the same doc.""" id_update_time_map = { doc.id: doc.doc_updated_at for doc in db_docs if doc.doc_updated_at } @@ -136,6 +140,7 @@ def index_doc_batch_with_handler( attempt_id: int | None, db_session: Session, ignore_time_skip: bool = False, + tenant_id: str | None = None, ) -> tuple[int, int]: r = (0, 0) try: @@ -147,8 +152,17 @@ def index_doc_batch_with_handler( index_attempt_metadata=index_attempt_metadata, db_session=db_session, ignore_time_skip=ignore_time_skip, + tenant_id=tenant_id, ) except Exception as e: + if isinstance(e, httpx.HTTPStatusError): + if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE: + logger.error( + "NOTE: HTTP Status 507 Insufficient Storage indicates " + "you need to allocate more memory or disk space to the " + "Vespa/index container." + ) + if INDEXING_EXCEPTION_LIMIT == 0: raise @@ -192,7 +206,9 @@ def index_doc_batch_prepare( db_session: Session, ignore_time_skip: bool = False, ) -> DocumentBatchPrepareContext | None: - documents = [] + """Sets up the documents in the relational DB (source of truth) for permissions, metadata, etc. + This precedes indexing it into the actual document index.""" + documents: list[Document] = [] for document in document_batch: empty_contents = not any(section.text.strip() for section in document.sections) if ( @@ -207,50 +223,65 @@ logger.warning( f"Skipping document with ID {document.id} as it has neither title nor content." ) - elif ( - document.title is not None and not document.title.strip() and empty_contents - ): + continue + + if document.title is not None and not document.title.strip() and empty_contents: # The title is explicitly empty ("" and not None) and the document is empty # so when building the chunk text representation, it will be empty and unuseable logger.warning( f"Skipping document with ID {document.id} as the chunks will be empty."
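# --- Editor's aside: illustrative sketch only, not part of this patch ---
# The "updatable docs" trimming in get_doc_ids_to_update() boils down to a per-document
# timestamp comparison: re-index only when the connector reports a newer updated-at than
# the one already stored in the DB. A simplified sketch of that check (names hypothetical):
from datetime import datetime


def needs_update(connector_updated_at: datetime | None, db_updated_at: datetime | None) -> bool:
    if connector_updated_at is None or db_updated_at is None:
        # no baseline to compare against, so treat the document as changed
        return True
    return connector_updated_at > db_updated_at
# --- end aside ---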
) - else: - documents.append(document) + continue + + documents.append(document) - document_ids = [document.id for document in documents] + # Create a trimmed list of docs that don't have a newer updated at + # Shortcuts the time-consuming flow on connector index retries + document_ids: list[str] = [document.id for document in documents] db_docs: list[DBDocument] = get_documents_by_ids( - document_ids=document_ids, db_session=db_session, + document_ids=document_ids, ) - # Skip indexing docs that don't have a newer updated at - # Shortcuts the time-consuming flow on connector index retries updatable_docs = ( get_doc_ids_to_update(documents=documents, db_docs=db_docs) if not ignore_time_skip else documents ) - # No docs to update either because the batch is empty or every doc was already indexed - if not updatable_docs: - return None + # for all updatable docs, upsert into the DB + # Does not include doc_updated_at which is also used to indicate a successful update + if updatable_docs: + _upsert_documents_in_db( + documents=updatable_docs, + index_attempt_metadata=index_attempt_metadata, + db_session=db_session, + ) - # Create records in the source of truth about these documents, - # does not include doc_updated_at which is also used to indicate a successful update - upsert_documents_in_db( - documents=documents, - index_attempt_metadata=index_attempt_metadata, - db_session=db_session, + logger.info( + f"Upserted {len(updatable_docs)} changed docs out of " + f"{len(documents)} total docs into the DB" + ) + + # for all docs, upsert the document to cc pair relationship + upsert_document_by_connector_credential_pair( + db_session, + index_attempt_metadata.connector_id, + index_attempt_metadata.credential_id, + document_ids, ) + # No docs to process because the batch is empty or every doc was already indexed + if not updatable_docs: + return None + id_to_db_doc_map = {doc.id: doc for doc in db_docs} return DocumentBatchPrepareContext( updatable_docs=updatable_docs, id_to_db_doc_map=id_to_db_doc_map ) -@log_function_time() +@log_function_time(debug_only=True) def index_doc_batch( *, chunker: Chunker, @@ -260,12 +291,22 @@ def index_doc_batch( index_attempt_metadata: IndexAttemptMetadata, db_session: Session, ignore_time_skip: bool = False, + tenant_id: str | None = None, ) -> tuple[int, int]: """Takes different pieces of the indexing pipeline and applies it to a batch of documents Note that the documents should already be batched at this point so that it does not inflate the - memory requirements""" + memory requirements - no_access = DocumentAccess.build(user_ids=[], user_groups=[], is_public=False) + Returns a tuple where the first element is the number of new docs and the + second element is the number of chunks.""" + + no_access = DocumentAccess.build( + user_emails=[], + user_groups=[], + external_user_emails=[], + external_user_group_ids=[], + is_public=False, + ) ctx = index_doc_batch_prepare( document_batch=document_batch, @@ -277,18 +318,10 @@ def index_doc_batch( return 0, 0 logger.debug("Starting chunking") - chunks: list[DocAwareChunk] = [] - for document in ctx.updatable_docs: - chunks.extend(chunker.chunk(document=document)) + chunks: list[DocAwareChunk] = chunker.chunk(ctx.updatable_docs) logger.debug("Starting embedding") - chunks_with_embeddings = ( - embedder.embed_chunks( - chunks=chunks, - ) - if chunks - else [] - ) + chunks_with_embeddings = embedder.embed_chunks(chunks) if chunks else [] updatable_ids = [doc.id for doc in ctx.updatable_docs] @@ -308,9 +341,9 @@ def 
index_doc_batch( # we're concerned about race conditions where multiple simultaneous indexings might result # in one set of metadata overwriting another one in vespa. - # we still write data here for immediate and most likely correct sync, but + # we still write data here for the immediate and most likely correct sync, but # to resolve this, an update of the last modified field at the end of this loop - # always triggers a final metadata sync + # always triggers a final metadata sync via the celery queue access_aware_chunks = [ DocMetadataAwareIndexChunk.from_index_chunk( index_chunk=chunk, @@ -325,6 +358,7 @@ def index_doc_batch( if chunk.source_document.id in ctx.id_to_db_doc_map else DEFAULT_BOOST ), + tenant_id=tenant_id, ) for chunk in chunks_with_embeddings ] @@ -344,10 +378,12 @@ def index_doc_batch( last_modified_ids = [] ids_to_new_updated_at = {} for doc in successful_docs: - last_modified_ids.append(doc.id) - if doc.doc_updated_at is None: + last_modified_ids.append(doc.id) + # doc_updated_at is the source's idea (on the other end of the connector) + # of when the doc was last modified + if doc.doc_updated_at is None: continue - ids_to_new_updated_at[doc.id] = doc.doc_updated_at + ids_to_new_updated_at[doc.id] = doc.doc_updated_at update_docs_updated_at__no_commit( ids_to_new_updated_at=ids_to_new_updated_at, db_session=db_session @@ -359,10 +395,13 @@ def index_doc_batch( db_session.commit() - return len([r for r in insertion_records if r.already_existed is False]), len( - access_aware_chunks + result = ( + len([r for r in insertion_records if r.already_existed is False]), + len(access_aware_chunks), ) + return result + def build_indexing_pipeline( *, @@ -372,6 +411,8 @@ def build_indexing_pipeline( chunker: Chunker | None = None, ignore_time_skip: bool = False, attempt_id: int | None = None, + tenant_id: str | None = None, + callback: IndexingHeartbeatInterface | None = None, ) -> IndexingPipelineProtocol: """Builds a pipeline which takes in a list (batch) of docs and indexes them.""" search_settings = get_current_search_settings(db_session) @@ -398,6 +439,8 @@ def build_indexing_pipeline( tokenizer=embedder.embedding_model.tokenizer, enable_multipass=multipass, enable_large_chunks=enable_large_chunks, + # after every doc, update status in case there are a bunch of really long docs + callback=callback, ) return partial( @@ -408,4 +451,5 @@ def build_indexing_pipeline( ignore_time_skip=ignore_time_skip, attempt_id=attempt_id, db_session=db_session, + tenant_id=tenant_id, ) diff --git a/backend/danswer/indexing/models.py b/backend/danswer/indexing/models.py index c789a2b351b..39cfa2cca0c 100644 --- a/backend/danswer/indexing/models.py +++ b/backend/danswer/indexing/models.py @@ -75,6 +75,7 @@ class DocMetadataAwareIndexChunk(IndexChunk): negative -> ranked lower. 
""" + tenant_id: str | None = None access: "DocumentAccess" document_sets: set[str] boost: int @@ -86,6 +87,7 @@ def from_index_chunk( access: "DocumentAccess", document_sets: set[str], boost: int, + tenant_id: str | None, ) -> "DocMetadataAwareIndexChunk": index_chunk_data = index_chunk.model_dump() return cls( @@ -93,6 +95,7 @@ def from_index_chunk( access=access, document_sets=document_sets, boost=boost, + tenant_id=tenant_id, ) diff --git a/backend/danswer/key_value_store/__init__.py b/backend/danswer/key_value_store/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/danswer/key_value_store/factory.py b/backend/danswer/key_value_store/factory.py new file mode 100644 index 00000000000..142e9031b77 --- /dev/null +++ b/backend/danswer/key_value_store/factory.py @@ -0,0 +1,8 @@ +from danswer.key_value_store.interface import KeyValueStore +from danswer.key_value_store.store import PgRedisKVStore + + +def get_kv_store() -> KeyValueStore: + # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in + # It's read from the global thread level variable + return PgRedisKVStore() diff --git a/backend/danswer/dynamic_configs/interface.py b/backend/danswer/key_value_store/interface.py similarity index 56% rename from backend/danswer/dynamic_configs/interface.py rename to backend/danswer/key_value_store/interface.py index 999ad939615..39c10047692 100644 --- a/backend/danswer/dynamic_configs/interface.py +++ b/backend/danswer/key_value_store/interface.py @@ -1,19 +1,15 @@ import abc -from collections.abc import Mapping -from collections.abc import Sequence -from typing import TypeAlias +from danswer.utils.special_types import JSON_ro -JSON_ro: TypeAlias = ( - Mapping[str, "JSON_ro"] | Sequence["JSON_ro"] | str | int | float | bool | None -) - -class ConfigNotFoundError(Exception): +class KvKeyNotFoundError(Exception): pass -class DynamicConfigStore: +class KeyValueStore: + # In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in + # It's read from the global thread level variable @abc.abstractmethod def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None: raise NotImplementedError diff --git a/backend/danswer/key_value_store/store.py b/backend/danswer/key_value_store/store.py new file mode 100644 index 00000000000..cd1df75af7c --- /dev/null +++ b/backend/danswer/key_value_store/store.py @@ -0,0 +1,120 @@ +import json +from collections.abc import Iterator +from contextlib import contextmanager +from typing import cast + +from fastapi import HTTPException +from redis.client import Redis +from sqlalchemy import text +from sqlalchemy.orm import Session + +from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import is_valid_schema_name +from danswer.db.models import KVStore +from danswer.key_value_store.interface import KeyValueStore +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.redis.redis_pool import get_redis_client +from danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR + +logger = setup_logger() + + +REDIS_KEY_PREFIX = "danswer_kv_store:" +KV_REDIS_KEY_EXPIRATION = 60 * 60 * 24 # 1 Day + + +class PgRedisKVStore(KeyValueStore): + def __init__( + self, redis_client: Redis | None = None, 
tenant_id: str | None = None + ) -> None: + # If no redis_client is provided, fall back to the context var + if redis_client is not None: + self.redis_client = redis_client + else: + tenant_id = tenant_id or CURRENT_TENANT_ID_CONTEXTVAR.get() + self.redis_client = get_redis_client(tenant_id=tenant_id) + + @contextmanager + def get_session(self) -> Iterator[Session]: + engine = get_sqlalchemy_engine() + with Session(engine, expire_on_commit=False) as session: + if MULTI_TENANT: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + if tenant_id == POSTGRES_DEFAULT_SCHEMA: + raise HTTPException( + status_code=401, detail="User must authenticate" + ) + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID") + # Set the search_path to the tenant's schema + session.execute(text(f'SET search_path = "{tenant_id}"')) + yield session + + def store(self, key: str, val: JSON_ro, encrypt: bool = False) -> None: + # Not encrypted in Redis, but encrypted in Postgres + try: + self.redis_client.set( + REDIS_KEY_PREFIX + key, json.dumps(val), ex=KV_REDIS_KEY_EXPIRATION + ) + except Exception as e: + # Fallback gracefully to Postgres if Redis fails + logger.error(f"Failed to set value in Redis for key '{key}': {str(e)}") + + encrypted_val = val if encrypt else None + plain_val = val if not encrypt else None + with self.get_session() as session: + obj = session.query(KVStore).filter_by(key=key).first() + if obj: + obj.value = plain_val + obj.encrypted_value = encrypted_val + else: + obj = KVStore( + key=key, value=plain_val, encrypted_value=encrypted_val + ) # type: ignore + session.query(KVStore).filter_by(key=key).delete() # just in case + session.add(obj) + session.commit() + + def load(self, key: str) -> JSON_ro: + try: + redis_value = self.redis_client.get(REDIS_KEY_PREFIX + key) + if redis_value: + assert isinstance(redis_value, bytes) + return json.loads(redis_value.decode("utf-8")) + except Exception as e: + logger.error(f"Failed to get value from Redis for key '{key}': {str(e)}") + + with self.get_session() as session: + obj = session.query(KVStore).filter_by(key=key).first() + if not obj: + raise KvKeyNotFoundError + + if obj.value is not None: + value = obj.value + elif obj.encrypted_value is not None: + value = obj.encrypted_value + else: + value = None + + try: + self.redis_client.set(REDIS_KEY_PREFIX + key, json.dumps(value)) + except Exception as e: + logger.error(f"Failed to set value in Redis for key '{key}': {str(e)}") + + return cast(JSON_ro, value) + + def delete(self, key: str) -> None: + try: + self.redis_client.delete(REDIS_KEY_PREFIX + key) + except Exception as e: + logger.error(f"Failed to delete value from Redis for key '{key}': {str(e)}") + + with self.get_session() as session: + result = session.query(KVStore).filter_by(key=key).delete() # type: ignore + if result == 0: + raise KvKeyNotFoundError + session.commit() diff --git a/backend/danswer/llm/answering/answer.py b/backend/danswer/llm/answering/answer.py index 3964accd6a8..4c1e556d7fa 100644 --- a/backend/danswer/llm/answering/answer.py +++ b/backend/danswer/llm/answering/answer.py @@ -1,99 +1,45 @@ from collections.abc import Callable from collections.abc import Iterator -from typing import Any -from typing import cast from uuid import uuid4 -from danswer.chat.chat_utils import llm_doc_from_inference_section from danswer.chat.models import AnswerQuestionPossibleReturn from danswer.chat.models import CitationInfo from danswer.chat.models import DanswerAnswerPiece -from 
danswer.chat.models import LlmDoc -from danswer.chat.models import StreamStopInfo -from danswer.chat.models import StreamStopReason -from danswer.configs.chat_configs import QA_PROMPT_OVERRIDE from danswer.file_store.utils import InMemoryChatFile +from danswer.llm.answering.llm_response_handler import LLMCall +from danswer.llm.answering.llm_response_handler import LLMResponseHandlerManager from danswer.llm.answering.models import AnswerStyleConfig from danswer.llm.answering.models import PreviousMessage from danswer.llm.answering.models import PromptConfig -from danswer.llm.answering.models import StreamProcessor from danswer.llm.answering.prompts.build import AnswerPromptBuilder from danswer.llm.answering.prompts.build import default_build_system_message from danswer.llm.answering.prompts.build import default_build_user_message -from danswer.llm.answering.prompts.citations_prompt import ( - build_citations_system_message, -) -from danswer.llm.answering.prompts.citations_prompt import build_citations_user_message -from danswer.llm.answering.prompts.quotes_prompt import build_quotes_user_message -from danswer.llm.answering.stream_processing.citation_processing import ( - build_citation_processor, -) -from danswer.llm.answering.stream_processing.quotes_processing import ( - build_quotes_processor, -) -from danswer.llm.answering.stream_processing.utils import DocumentIdOrderMapping +from danswer.llm.answering.stream_processing.answer_response_handler import AnswerResponseHandler +from danswer.llm.answering.stream_processing.answer_response_handler import CitationResponseHandler +from danswer.llm.answering.stream_processing.answer_response_handler import DummyAnswerResponseHandler +from danswer.llm.answering.stream_processing.answer_response_handler import QuotesResponseHandler from danswer.llm.answering.stream_processing.utils import map_document_id_order +from danswer.llm.answering.tool.tool_response_handler import ToolResponseHandler from danswer.llm.interfaces import LLM -from danswer.llm.interfaces import ToolChoiceOptions from danswer.natural_language_processing.utils import get_tokenizer -from danswer.tools.custom.custom_tool_prompt_builder import ( - build_user_message_for_custom_tool_for_non_tool_calling_llm, -) -from danswer.tools.force import filter_tools_for_force_tool_use from danswer.tools.force import ForceUseTool -from danswer.tools.images.image_generation_tool import IMAGE_GENERATION_RESPONSE_ID -from danswer.tools.images.image_generation_tool import ImageGenerationResponse -from danswer.tools.images.image_generation_tool import ImageGenerationTool -from danswer.tools.images.prompt import build_image_generation_user_prompt -from danswer.tools.internet_search.internet_search_tool import InternetSearchTool -from danswer.tools.message import build_tool_message -from danswer.tools.message import ToolCallSummary -from danswer.tools.search.search_tool import FINAL_CONTEXT_DOCUMENTS_ID -from danswer.tools.search.search_tool import SEARCH_DOC_CONTENT_ID -from danswer.tools.search.search_tool import SEARCH_RESPONSE_SUMMARY_ID -from danswer.tools.search.search_tool import SearchResponseSummary -from danswer.tools.search.search_tool import SearchTool +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse -from danswer.tools.tool_runner import ( - check_which_tools_should_run_for_non_tool_calling_llm, -) -from danswer.tools.tool_runner import ToolCallFinalResult +from 
danswer.tools.tool_implementations.search.search_tool import SearchTool from danswer.tools.tool_runner import ToolCallKickoff -from danswer.tools.tool_runner import ToolRunner -from danswer.tools.tool_selection import select_single_tool_for_non_tool_calling_llm from danswer.tools.utils import explicit_tool_calling_supported from danswer.utils.logger import setup_logger from langchain.schema.messages import BaseMessage from langchain_core.messages import AIMessageChunk from langchain_core.messages import HumanMessage +from langchain_core.messages import ToolCall logger = setup_logger() -def _get_answer_stream_processor( - context_docs: list[LlmDoc], - doc_id_to_rank_map: DocumentIdOrderMapping, - answer_style_configs: AnswerStyleConfig, -) -> StreamProcessor: - if answer_style_configs.citation_config: - return build_citation_processor( - context_docs=context_docs, doc_id_to_rank_map=doc_id_to_rank_map - ) - if answer_style_configs.quotes_config: - return build_quotes_processor( - context_docs=context_docs, is_json_prompt=not (QA_PROMPT_OVERRIDE == "weak") - ) - - raise RuntimeError("Not implemented yet") - - AnswerStream = Iterator[AnswerQuestionPossibleReturn | ToolCallKickoff | ToolResponse] -logger = setup_logger() - - class Answer: def __init__( self, @@ -134,8 +80,6 @@ def __init__( self.tools = tools or [] self.force_use_tool = force_use_tool - self.skip_explicit_tool_calling = skip_explicit_tool_calling - self.message_history = message_history or [] # used for QA flow where we only want to send a single message self.single_message_history = single_message_history @@ -160,325 +104,143 @@ def __init__( self.skip_gen_ai_answer_generation = skip_gen_ai_answer_generation self._is_cancelled = False - def _update_prompt_builder_for_search_tool( - self, prompt_builder: AnswerPromptBuilder, final_context_documents: list[LlmDoc] - ) -> None: - if self.answer_style_config.citation_config: - prompt_builder.update_system_prompt( - build_citations_system_message(self.prompt_config, self.user_email) - ) - prompt_builder.update_user_prompt( - build_citations_user_message( - question=self.question, - prompt_config=self.prompt_config, - context_docs=final_context_documents, - files=self.latest_query_files, - all_doc_useful=( - self.answer_style_config.citation_config.all_docs_useful - if self.answer_style_config.citation_config - else False - ), - ) - ) - elif self.answer_style_config.quotes_config: - prompt_builder.update_user_prompt( - build_quotes_user_message( - question=self.question, - context_docs=final_context_documents, - history_str=self.single_message_history or "", - prompt=self.prompt_config, - user_email=self.user_email, - ) + self.using_tool_calling_llm = ( + explicit_tool_calling_supported( + self.llm.config.model_provider, self.llm.config.model_name ) + and not skip_explicit_tool_calling + ) - def _raw_output_for_explicit_tool_calling_llms( - self, - ) -> Iterator[ - str | StreamStopInfo | ToolCallKickoff | ToolResponse | ToolCallFinalResult - ]: - prompt_builder = AnswerPromptBuilder(self.message_history, self.llm.config) - - tool_call_chunk: AIMessageChunk | None = None - if self.force_use_tool.force_use and self.force_use_tool.args is not None: - # if we are forcing a tool WITH args specified, we don't need to check which tools to run - # / need to generate the args - tool_call_chunk = AIMessageChunk( - content="", - ) - tool_call_chunk.tool_calls = [ - { - "name": self.force_use_tool.tool_name, - "args": self.force_use_tool.args, - "id": str(uuid4()), - } - ] - else: - # if tool 
calling is supported, first try the raw message - # to see if we don't need to use any tools - prompt_builder.update_system_prompt( - default_build_system_message(self.prompt_config) - ) - prompt_builder.update_user_prompt( - default_build_user_message( - self.question, self.prompt_config, self.latest_query_files - ) - ) - prompt = prompt_builder.build() - final_tool_definitions = [ - tool.tool_definition() - for tool in filter_tools_for_force_tool_use( - self.tools, self.force_use_tool - ) - ] - - for message in self.llm.stream( - prompt=prompt, - tools=final_tool_definitions if final_tool_definitions else None, - tool_choice="required" if self.force_use_tool.force_use else None, - ): - if isinstance(message, AIMessageChunk) and ( - message.tool_call_chunks or message.tool_calls - ): - if tool_call_chunk is None: - tool_call_chunk = message - else: - tool_call_chunk += message # type: ignore - else: - if message.content: - if self.is_cancelled: - return - yield cast(str, message.content) - if ( - message.additional_kwargs.get("usage_metadata", {}).get("stop") - == "length" - ): - yield StreamStopInfo( - stop_reason=StreamStopReason.CONTEXT_LENGTH - ) - - if not tool_call_chunk: - return # no tool call needed - - # if we have a tool call, we need to call the tool - tool_call_requests = tool_call_chunk.tool_calls - for tool_call_request in tool_call_requests: - known_tools_by_name = [ - tool for tool in self.tools if tool.name == tool_call_request["name"] - ] - - if not known_tools_by_name: - logger.error( - "Tool call requested with unknown name field. \n" - f"self.tools: {self.tools}" - f"tool_call_request: {tool_call_request}" - ) - if self.tools: - tool = self.tools[0] - else: - continue - else: - tool = known_tools_by_name[0] - tool_args = ( - self.force_use_tool.args - if self.force_use_tool.tool_name == tool.name - and self.force_use_tool.args - else tool_call_request["args"] - ) + def _get_tools_list(self) -> list[Tool]: + if not self.force_use_tool.force_use: + return self.tools - tool_runner = ToolRunner(tool, tool_args) - yield tool_runner.kickoff() - yield from tool_runner.tool_responses() + tool = next( + (t for t in self.tools if t.name == self.force_use_tool.tool_name), None + ) + if tool is None: + raise RuntimeError(f"Tool '{self.force_use_tool.tool_name}' not found") - tool_call_summary = ToolCallSummary( - tool_call_request=tool_call_chunk, - tool_call_result=build_tool_message( - tool_call_request, tool_runner.tool_message_content() - ), + logger.info( + f"Forcefully using tool='{tool.name}'" + + ( + f" with args='{self.force_use_tool.args}'" + if self.force_use_tool.args is not None + else "" ) + ) + return [tool] - if tool.name in {SearchTool._NAME, InternetSearchTool._NAME}: - self._update_prompt_builder_for_search_tool(prompt_builder, []) - elif tool.name == ImageGenerationTool._NAME: - img_urls = [ - img_generation_result["url"] - for img_generation_result in tool_runner.tool_final_result().tool_result - ] - prompt_builder.update_user_prompt( - build_image_generation_user_prompt( - query=self.question, img_urls=img_urls - ) - ) - yield tool_runner.tool_final_result() - - prompt = prompt_builder.build(tool_call_summary=tool_call_summary) - - yield from self._process_llm_stream( - prompt=prompt, - tools=[tool.tool_definition() for tool in self.tools], - ) + def _handle_specified_tool_call( + self, llm_calls: list[LLMCall], tool: Tool, tool_args: dict + ) -> AnswerStream: + current_llm_call = llm_calls[-1] - return + # make a dummy tool handler + tool_handler = 
ToolResponseHandler([tool]) - # This method processes the LLM stream and yields the content or stop information - def _process_llm_stream( - self, - prompt: Any, - tools: list[dict] | None = None, - tool_choice: ToolChoiceOptions | None = None, - ) -> Iterator[str | StreamStopInfo]: - for message in self.llm.stream( - prompt=prompt, tools=tools, tool_choice=tool_choice - ): - if isinstance(message, AIMessageChunk): - if message.content: - if self.is_cancelled: - return StreamStopInfo(stop_reason=StreamStopReason.CANCELLED) - yield cast(str, message.content) - - if ( - message.additional_kwargs.get("usage_metadata", {}).get("stop") - == "length" - ): - yield StreamStopInfo(stop_reason=StreamStopReason.CONTEXT_LENGTH) - - def _raw_output_for_non_explicit_tool_calling_llms( - self, - ) -> Iterator[ - str | StreamStopInfo | ToolCallKickoff | ToolResponse | ToolCallFinalResult - ]: - prompt_builder = AnswerPromptBuilder(self.message_history, self.llm.config) - chosen_tool_and_args: tuple[Tool, dict] | None = None - - if self.force_use_tool.force_use: - # if we are forcing a tool, we don't need to check which tools to run - tool = next( - iter( - [ - tool - for tool in self.tools - if tool.name == self.force_use_tool.tool_name - ] - ), - None, - ) - if not tool: - raise RuntimeError(f"Tool '{self.force_use_tool.tool_name}' not found") + dummy_tool_call_chunk = AIMessageChunk(content="") + dummy_tool_call_chunk.tool_calls = [ + ToolCall(name=tool.name, args=tool_args, id=str(uuid4())) + ] - tool_args = ( - self.force_use_tool.args - if self.force_use_tool.args is not None - else tool.get_args_for_non_tool_calling_llm( - query=self.question, - history=self.message_history, - llm=self.llm, - force_run=True, - ) - ) - - if tool_args is None: - raise RuntimeError(f"Tool '{tool.name}' did not return args") + response_handler_manager = LLMResponseHandlerManager( + tool_handler, DummyAnswerResponseHandler(), self.is_cancelled + ) + yield from response_handler_manager.handle_llm_response( + iter([dummy_tool_call_chunk]) + ) - chosen_tool_and_args = (tool, tool_args) + new_llm_call = response_handler_manager.next_llm_call(current_llm_call) + if new_llm_call: + yield from self._get_response(llm_calls + [new_llm_call]) else: - tool_options = check_which_tools_should_run_for_non_tool_calling_llm( - tools=self.tools, - query=self.question, - history=self.message_history, - llm=self.llm, - ) + raise RuntimeError("Tool call handler did not return a new LLM call") - available_tools_and_args = [ - (self.tools[ind], args) - for ind, args in enumerate(tool_options) - if args is not None - ] + def _get_response(self, llm_calls: list[LLMCall]) -> AnswerStream: + current_llm_call = llm_calls[-1] - logger.info( - f"Selecting single tool from tools: {[(tool.name, args) for tool, args in available_tools_and_args]}" + # handle the case where no decision has to be made; we simply run the tool + if ( + current_llm_call.force_use_tool.force_use + and current_llm_call.force_use_tool.args is not None + ): + tool_name, tool_args = ( + current_llm_call.force_use_tool.tool_name, + current_llm_call.force_use_tool.args, ) - - chosen_tool_and_args = ( - select_single_tool_for_non_tool_calling_llm( - tools_and_args=available_tools_and_args, - history=self.message_history, - query=self.question, - llm=self.llm, - ) - if available_tools_and_args - else None + tool = next( + (t for t in current_llm_call.tools if t.name == tool_name), None ) + if not tool: + raise RuntimeError(f"Tool '{tool_name}' not found") - logger.notice(f"Chosen 
tool: {chosen_tool_and_args}") + yield from self._handle_specified_tool_call(llm_calls, tool, tool_args) + return - if not chosen_tool_and_args: - prompt_builder.update_system_prompt( - default_build_system_message(self.prompt_config) - ) - prompt_builder.update_user_prompt( - default_build_user_message( - self.question, self.prompt_config, self.latest_query_files + # special pre-logic for non-tool calling LLM case + if not self.using_tool_calling_llm and current_llm_call.tools: + chosen_tool_and_args = ( + ToolResponseHandler.get_tool_call_for_non_tool_calling_llm( + current_llm_call, self.llm ) ) - prompt = prompt_builder.build() - yield from self._process_llm_stream( - prompt=prompt, - tools=None, - ) + if chosen_tool_and_args: + tool, tool_args = chosen_tool_and_args + yield from self._handle_specified_tool_call(llm_calls, tool, tool_args) + return + + # if we're skipping gen ai answer generation, we should break + # out unless we're forcing a tool call. If we don't, we might generate an + # answer, which is a no-no! + if ( + self.skip_gen_ai_answer_generation + and not current_llm_call.force_use_tool.force_use + ): return - tool, tool_args = chosen_tool_and_args - tool_runner = ToolRunner(tool, tool_args) - yield tool_runner.kickoff() + # set up "handlers" to listen to the LLM response stream and + # feed back the processed results + handle tool call requests + # + figure out what the next LLM call should be + tool_call_handler = ToolResponseHandler(current_llm_call.tools) - if tool.name in {SearchTool._NAME, InternetSearchTool._NAME}: - final_context_documents = None - for response in tool_runner.tool_responses(): - if response.id == FINAL_CONTEXT_DOCUMENTS_ID: - final_context_documents = cast(list[LlmDoc], response.response) - yield response + search_result = SearchTool.get_search_result(current_llm_call) or [] - if final_context_documents is None: - raise RuntimeError( - f"{tool.name} did not return final context documents" - ) - - self._update_prompt_builder_for_search_tool( - prompt_builder, final_context_documents + answer_handler: AnswerResponseHandler + if self.answer_style_config.citation_config: + answer_handler = CitationResponseHandler( + context_docs=search_result, + doc_id_to_rank_map=map_document_id_order(search_result), ) - elif tool.name == ImageGenerationTool._NAME: - img_urls = [] - for response in tool_runner.tool_responses(): - if response.id == IMAGE_GENERATION_RESPONSE_ID: - img_generation_response = cast( - list[ImageGenerationResponse], response.response - ) - img_urls = [img.url for img in img_generation_response] - - yield response - - prompt_builder.update_user_prompt( - build_image_generation_user_prompt( - query=self.question, - img_urls=img_urls, - ) + elif self.answer_style_config.quotes_config: + answer_handler = QuotesResponseHandler( + context_docs=search_result, ) else: - prompt_builder.update_user_prompt( - HumanMessage( - content=build_user_message_for_custom_tool_for_non_tool_calling_llm( - self.question, - tool.name, - *tool_runner.tool_responses(), - ) - ) - ) - final = tool_runner.tool_final_result() + raise ValueError("No answer style config provided") - yield final + response_handler_manager = LLMResponseHandlerManager( + tool_call_handler, answer_handler, self.is_cancelled + ) - prompt = prompt_builder.build() + # DEBUG: good breakpoint + stream = self.llm.stream( + # For tool calling LLMs, we want to insert the task prompt as part of this flow, this is because the LLM + # may choose to not call any tools and just generate the answer, in 
which case the task prompt is needed. + prompt=current_llm_call.prompt_builder.build(), + tools=[tool.tool_definition() for tool in current_llm_call.tools] or None, + tool_choice=( + "required" + if current_llm_call.tools and current_llm_call.force_use_tool.force_use + else None + ), + structured_response_format=self.answer_style_config.structured_response_format, + ) + yield from response_handler_manager.handle_llm_response(stream) - yield from self._process_llm_stream(prompt=prompt, tools=None) + new_llm_call = response_handler_manager.next_llm_call(current_llm_call) + if new_llm_call: + yield from self._get_response(llm_calls + [new_llm_call]) @property def processed_streamed_output(self) -> AnswerStream: @@ -486,95 +248,31 @@ def processed_streamed_output(self) -> AnswerStream: yield from self._processed_stream return - output_generator = ( - self._raw_output_for_explicit_tool_calling_llms() - if explicit_tool_calling_supported( - self.llm.config.model_provider, self.llm.config.model_name - ) - and not self.skip_explicit_tool_calling - else self._raw_output_for_non_explicit_tool_calling_llms() + prompt_builder = AnswerPromptBuilder( + user_message=default_build_user_message( + user_query=self.question, + prompt_config=self.prompt_config, + files=self.latest_query_files, + ), + message_history=self.message_history, + llm_config=self.llm.config, + single_message_history=self.single_message_history, + raw_user_text=self.question, + ) + prompt_builder.update_system_prompt( + default_build_system_message(self.prompt_config) + ) + llm_call = LLMCall( + prompt_builder=prompt_builder, + tools=self._get_tools_list(), + force_use_tool=self.force_use_tool, + files=self.latest_query_files, + tool_call_info=[], + using_tool_calling_llm=self.using_tool_calling_llm, ) - - def _process_stream( - stream: Iterator[ToolCallKickoff | ToolResponse | str | StreamStopInfo], - ) -> AnswerStream: - message = None - - # special things we need to keep track of for the SearchTool - # raw results that will be displayed to the user - search_results: list[LlmDoc] | None = None - # processed docs to feed into the LLM - final_context_docs: list[LlmDoc] | None = None - - for message in stream: - if isinstance(message, ToolCallKickoff) or isinstance( - message, ToolCallFinalResult - ): - yield message - elif isinstance(message, ToolResponse): - if message.id == SEARCH_RESPONSE_SUMMARY_ID: - # We don't need to run section merging in this flow, this variable is only used - # below to specify the ordering of the documents for the purpose of matching - # citations to the right search documents. 
The deduplication logic is more lightweight - # there and we don't need to do it twice - search_results = [ - llm_doc_from_inference_section(section) - for section in cast( - SearchResponseSummary, message.response - ).top_sections - ] - elif message.id == FINAL_CONTEXT_DOCUMENTS_ID: - final_context_docs = cast(list[LlmDoc], message.response) - yield message - - elif ( - message.id == SEARCH_DOC_CONTENT_ID - and not self._return_contexts - ): - continue - - yield message - else: - # assumes all tool responses will come first, then the final answer - break - - if not self.skip_gen_ai_answer_generation: - process_answer_stream_fn = _get_answer_stream_processor( - context_docs=final_context_docs or [], - # if doc selection is enabled, then search_results will be None, - # so we need to use the final_context_docs - doc_id_to_rank_map=map_document_id_order( - search_results or final_context_docs or [] - ), - answer_style_configs=self.answer_style_config, - ) - - stream_stop_info = None - - def _stream() -> Iterator[str]: - nonlocal stream_stop_info - yield cast(str, message) - for item in stream: - if isinstance(item, StreamStopInfo): - stream_stop_info = item - return - - # this should never happen, but we're seeing weird behavior here so handling for now - if not isinstance(item, str): - logger.error( - f"Received non-string item in answer stream: {item}. Skipping." - ) - continue - - yield item - - yield from process_answer_stream_fn(_stream()) - - if stream_stop_info: - yield stream_stop_info processed_stream = [] - for processed_packet in _process_stream(output_generator): + for processed_packet in self._get_response([llm_call]): processed_stream.append(processed_packet) yield processed_packet @@ -598,7 +296,6 @@ def citations(self) -> list[CitationInfo]: return citations - @property def is_cancelled(self) -> bool: if self._is_cancelled: return True diff --git a/backend/danswer/llm/answering/llm_response_handler.py b/backend/danswer/llm/answering/llm_response_handler.py new file mode 100644 index 00000000000..f8426844244 --- /dev/null +++ b/backend/danswer/llm/answering/llm_response_handler.py @@ -0,0 +1,84 @@ +from collections.abc import Callable +from collections.abc import Generator +from collections.abc import Iterator +from typing import TYPE_CHECKING + +from langchain_core.messages import BaseMessage +from pydantic.v1 import BaseModel as BaseModel__v1 + +from danswer.chat.models import CitationInfo +from danswer.chat.models import DanswerAnswerPiece +from danswer.chat.models import DanswerQuotes +from danswer.chat.models import StreamStopInfo +from danswer.chat.models import StreamStopReason +from danswer.file_store.models import InMemoryChatFile +from danswer.llm.answering.prompts.build import AnswerPromptBuilder +from danswer.tools.force import ForceUseTool +from danswer.tools.models import ToolCallFinalResult +from danswer.tools.models import ToolCallKickoff +from danswer.tools.models import ToolResponse +from danswer.tools.tool import Tool + + +if TYPE_CHECKING: + from danswer.llm.answering.stream_processing.answer_response_handler import ( + AnswerResponseHandler, + ) + from danswer.llm.answering.tool.tool_response_handler import ToolResponseHandler + + +ResponsePart = ( + DanswerAnswerPiece + | CitationInfo + | DanswerQuotes + | ToolCallKickoff + | ToolResponse + | ToolCallFinalResult + | StreamStopInfo +) + + +class LLMCall(BaseModel__v1): + prompt_builder: AnswerPromptBuilder + tools: list[Tool] + force_use_tool: ForceUseTool + files: list[InMemoryChatFile] + tool_call_info: 
list[ToolCallKickoff | ToolResponse | ToolCallFinalResult] + using_tool_calling_llm: bool + + class Config: + arbitrary_types_allowed = True + + +class LLMResponseHandlerManager: + def __init__( + self, + tool_handler: "ToolResponseHandler", + answer_handler: "AnswerResponseHandler", + is_cancelled: Callable[[], bool], + ): + self.tool_handler = tool_handler + self.answer_handler = answer_handler + self.is_cancelled = is_cancelled + + def handle_llm_response( + self, + stream: Iterator[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + all_messages: list[BaseMessage] = [] + for message in stream: + if self.is_cancelled(): + yield StreamStopInfo(stop_reason=StreamStopReason.CANCELLED) + return + # tool handler doesn't do anything until the full message is received + # NOTE: still need to run list() to get this to run + list(self.tool_handler.handle_response_part(message, all_messages)) + yield from self.answer_handler.handle_response_part(message, all_messages) + all_messages.append(message) + + # potentially give back all info on the selected tool call + its result + yield from self.tool_handler.handle_response_part(None, all_messages) + yield from self.answer_handler.handle_response_part(None, all_messages) + + def next_llm_call(self, llm_call: LLMCall) -> LLMCall | None: + return self.tool_handler.next_llm_call(llm_call) diff --git a/backend/danswer/llm/answering/models.py b/backend/danswer/llm/answering/models.py index fb5fa9c313e..03f72a0968c 100644 --- a/backend/danswer/llm/answering/models.py +++ b/backend/danswer/llm/answering/models.py @@ -33,7 +33,7 @@ class PreviousMessage(BaseModel): token_count: int message_type: MessageType files: list[InMemoryChatFile] - tool_calls: list[ToolCallFinalResult] + tool_call: ToolCallFinalResult | None @classmethod def from_chat_message( @@ -51,14 +51,13 @@ def from_chat_message( for file in available_files if str(file.file_id) in message_file_ids ], - tool_calls=[ - ToolCallFinalResult( - tool_name=tool_call.tool_name, - tool_args=tool_call.tool_arguments, - tool_result=tool_call.tool_result, - ) - for tool_call in chat_message.tool_calls - ], + tool_call=ToolCallFinalResult( + tool_name=chat_message.tool_call.tool_name, + tool_args=chat_message.tool_call.tool_arguments, + tool_result=chat_message.tool_call.tool_result, + ) + if chat_message.tool_call + else None, ) def to_langchain_msg(self) -> BaseMessage: @@ -116,6 +115,10 @@ class AnswerStyleConfig(BaseModel): document_pruning_config: DocumentPruningConfig = Field( default_factory=DocumentPruningConfig ) + # forces the LLM to return a structured response, see + # https://platform.openai.com/docs/guides/structured-outputs/introduction + # right now, only used by the simple chat API + structured_response_format: dict | None = None @model_validator(mode="after") def check_quotes_and_citation(self) -> "AnswerStyleConfig": diff --git a/backend/danswer/llm/answering/prompts/build.py b/backend/danswer/llm/answering/prompts/build.py index f53d4481f6e..fd44adbe381 100644 --- a/backend/danswer/llm/answering/prompts/build.py +++ b/backend/danswer/llm/answering/prompts/build.py @@ -12,12 +12,12 @@ from danswer.llm.interfaces import LLMConfig from danswer.llm.utils import build_content_with_imgs from danswer.llm.utils import check_message_tokens +from danswer.llm.utils import message_to_prompt_and_imgs from danswer.llm.utils import translate_history_to_basemessages from danswer.natural_language_processing.utils import get_tokenizer from danswer.prompts.chat_prompts import 
CHAT_USER_CONTEXT_FREE_PROMPT from danswer.prompts.prompt_utils import add_date_time_to_prompt from danswer.prompts.prompt_utils import drop_messages_history_overflow -from danswer.tools.message import ToolCallSummary def default_build_system_message( @@ -54,18 +54,15 @@ def default_build_user_message( class AnswerPromptBuilder: def __init__( - self, message_history: list[PreviousMessage], llm_config: LLMConfig + self, + user_message: HumanMessage, + message_history: list[PreviousMessage], + llm_config: LLMConfig, + raw_user_text: str, + single_message_history: str | None = None, ) -> None: self.max_tokens = compute_max_llm_input_tokens(llm_config) - ( - self.message_history, - self.history_token_cnts, - ) = translate_history_to_basemessages(message_history) - - self.system_message_and_token_cnt: tuple[SystemMessage, int] | None = None - self.user_message_and_token_cnt: tuple[HumanMessage, int] | None = None - llm_tokenizer = get_tokenizer( provider_type=llm_config.model_provider, model_name=llm_config.model_name, @@ -74,6 +71,26 @@ def __init__( Callable[[str], list[int]], llm_tokenizer.encode ) + self.raw_message_history = message_history + ( + self.message_history, + self.history_token_cnts, + ) = translate_history_to_basemessages(message_history) + + # for cases where like the QA flow where we want to condense the chat history + # into a single message rather than a sequence of User / Assistant messages + self.single_message_history = single_message_history + + self.system_message_and_token_cnt: tuple[SystemMessage, int] | None = None + self.user_message_and_token_cnt = ( + user_message, + check_message_tokens(user_message, self.llm_tokenizer_encode_func), + ) + + self.new_messages_and_token_cnts: list[tuple[BaseMessage, int]] = [] + + self.raw_user_message = raw_user_text + def update_system_prompt(self, system_message: SystemMessage | None) -> None: if not system_message: self.system_message_and_token_cnt = None @@ -85,18 +102,21 @@ def update_system_prompt(self, system_message: SystemMessage | None) -> None: ) def update_user_prompt(self, user_message: HumanMessage) -> None: - if not user_message: - self.user_message_and_token_cnt = None - return - self.user_message_and_token_cnt = ( user_message, check_message_tokens(user_message, self.llm_tokenizer_encode_func), ) - def build( - self, tool_call_summary: ToolCallSummary | None = None - ) -> list[BaseMessage]: + def append_message(self, message: BaseMessage) -> None: + """Append a new message to the message history.""" + token_count = check_message_tokens(message, self.llm_tokenizer_encode_func) + self.new_messages_and_token_cnts.append((message, token_count)) + + def get_user_message_content(self) -> str: + query, _ = message_to_prompt_and_imgs(self.user_message_and_token_cnt[0]) + return query + + def build(self) -> list[BaseMessage]: if not self.user_message_and_token_cnt: raise ValueError("User message must be set before building prompt") @@ -113,25 +133,8 @@ def build( final_messages_with_tokens.append(self.user_message_and_token_cnt) - if tool_call_summary: - final_messages_with_tokens.append( - ( - tool_call_summary.tool_call_request, - check_message_tokens( - tool_call_summary.tool_call_request, - self.llm_tokenizer_encode_func, - ), - ) - ) - final_messages_with_tokens.append( - ( - tool_call_summary.tool_call_result, - check_message_tokens( - tool_call_summary.tool_call_result, - self.llm_tokenizer_encode_func, - ), - ) - ) + if self.new_messages_and_token_cnts: + 
final_messages_with_tokens.extend(self.new_messages_and_token_cnts) return drop_messages_history_overflow( final_messages_with_tokens, self.max_tokens diff --git a/backend/danswer/llm/answering/prompts/citations_prompt.py b/backend/danswer/llm/answering/prompts/citations_prompt.py index 705ea1407d7..826e230369e 100644 --- a/backend/danswer/llm/answering/prompts/citations_prompt.py +++ b/backend/danswer/llm/answering/prompts/citations_prompt.py @@ -1,35 +1,40 @@ from danswer.chat.models import LlmDoc from danswer.configs.model_configs import GEN_AI_SINGLE_USER_MESSAGE_EXPECTED_MAX_TOKENS +from danswer.context.search.models import InferenceChunk from danswer.db.models import Persona from danswer.db.persona import get_default_prompt__read_only from danswer.db.search_settings import get_multilingual_expansion -from danswer.file_store.utils import InMemoryChatFile from danswer.llm.answering.models import PromptConfig -from danswer.llm.factory import get_llms_for_persona -from danswer.llm.factory import get_main_llm_from_tuple +from danswer.llm.factory import get_llms_for_persona, get_main_llm_from_tuple from danswer.llm.interfaces import LLMConfig -from danswer.llm.utils import build_content_with_imgs -from danswer.llm.utils import check_number_of_tokens -from danswer.llm.utils import get_max_input_tokens +from danswer.llm.utils import ( + build_content_with_imgs, + check_number_of_tokens, + get_max_input_tokens, + message_to_prompt_and_imgs, +) from danswer.prompts.chat_prompts import REQUIRE_CITATION_STATEMENT from danswer.prompts.constants import DEFAULT_IGNORE_STATEMENT -from danswer.prompts.direct_qa_prompts import CITATIONS_PROMPT -from danswer.prompts.direct_qa_prompts import CITATIONS_PROMPT_FOR_TOOL_CALLING -from danswer.prompts.prompt_utils import add_date_time_to_prompt -from danswer.prompts.prompt_utils import add_employee_context_to_prompt -from danswer.prompts.prompt_utils import build_complete_context_str -from danswer.prompts.prompt_utils import build_task_prompt_reminders -from danswer.prompts.token_counts import ADDITIONAL_INFO_TOKEN_CNT +from danswer.prompts.direct_qa_prompts import ( + CITATIONS_PROMPT, + CITATIONS_PROMPT_FOR_TOOL_CALLING, + HISTORY_BLOCK, +) +from danswer.prompts.prompt_utils import ( + add_date_time_to_prompt, + add_employee_context_to_prompt, + build_complete_context_str, + build_task_prompt_reminders, +) from danswer.prompts.token_counts import ( + ADDITIONAL_INFO_TOKEN_CNT, CHAT_USER_PROMPT_WITH_CONTEXT_OVERHEAD_TOKEN_CNT, + CITATION_REMINDER_TOKEN_CNT, + CITATION_STATEMENT_TOKEN_CNT, + LANGUAGE_HINT_TOKEN_CNT, ) -from danswer.prompts.token_counts import CITATION_REMINDER_TOKEN_CNT -from danswer.prompts.token_counts import CITATION_STATEMENT_TOKEN_CNT -from danswer.prompts.token_counts import LANGUAGE_HINT_TOKEN_CNT -from danswer.search.models import InferenceChunk from danswer.utils.logger import setup_logger -from langchain.schema.messages import HumanMessage -from langchain.schema.messages import SystemMessage +from langchain.schema.messages import HumanMessage, SystemMessage logger = setup_logger() @@ -138,10 +143,9 @@ def build_citations_system_message( def build_citations_user_message( - question: str, + message: HumanMessage, prompt_config: PromptConfig, context_docs: list[LlmDoc] | list[InferenceChunk], - files: list[InMemoryChatFile], all_doc_useful: bool, history_message: str = "", ) -> HumanMessage: @@ -150,6 +154,13 @@ def build_citations_user_message( prompt=prompt_config, use_language_hint=bool(multilingual_expansion) ) + history_block = ( 
+ HISTORY_BLOCK.format(history_str=history_message) + "\n" + if history_message + else "" + ) + query, img_urls = message_to_prompt_and_imgs(message) + if context_docs: context_docs_str = build_complete_context_str(context_docs) optional_ignore = "" if all_doc_useful else DEFAULT_IGNORE_STATEMENT @@ -158,19 +169,24 @@ def build_citations_user_message( optional_ignore_statement=optional_ignore, context_docs_str=context_docs_str, task_prompt=task_prompt_with_reminder, - user_query=question, - history_block=history_message, + user_query=query, + history_block=history_block, ) else: # if no context docs provided, assume we're in the tool calling flow user_prompt = CITATIONS_PROMPT_FOR_TOOL_CALLING.format( task_prompt=task_prompt_with_reminder, - user_query=question, + user_query=query, + history_block=history_block, ) user_prompt = user_prompt.strip() user_msg = HumanMessage( - content=build_content_with_imgs(user_prompt, files) if files else user_prompt + content=( + build_content_with_imgs(user_prompt, img_urls=img_urls) + if img_urls + else user_prompt + ) ) return user_msg diff --git a/backend/danswer/llm/answering/prompts/quotes_prompt.py b/backend/danswer/llm/answering/prompts/quotes_prompt.py index 35a8059810c..f7f28d4b9ea 100644 --- a/backend/danswer/llm/answering/prompts/quotes_prompt.py +++ b/backend/danswer/llm/answering/prompts/quotes_prompt.py @@ -1,54 +1,17 @@ from danswer.chat.models import LlmDoc from danswer.configs.chat_configs import LANGUAGE_HINT -from danswer.configs.chat_configs import QA_PROMPT_OVERRIDE +from danswer.context.search.models import InferenceChunk from danswer.db.search_settings import get_multilingual_expansion from danswer.llm.answering.models import PromptConfig +from danswer.llm.utils import message_to_prompt_and_imgs from danswer.prompts.direct_qa_prompts import CONTEXT_BLOCK from danswer.prompts.direct_qa_prompts import HISTORY_BLOCK from danswer.prompts.direct_qa_prompts import JSON_PROMPT -from danswer.prompts.direct_qa_prompts import WEAK_LLM_PROMPT from danswer.prompts.prompt_utils import add_date_time_to_prompt from danswer.prompts.prompt_utils import add_employee_context_to_prompt from danswer.prompts.prompt_utils import build_complete_context_str -from danswer.search.models import InferenceChunk -from danswer.utils.logger import setup_logger from langchain.schema.messages import HumanMessage -logger = setup_logger() - - -def _build_weak_llm_quotes_prompt( - question: str, - context_docs: list[LlmDoc] | list[InferenceChunk], - history_str: str, - prompt: PromptConfig, - user_email: str | None = None, -) -> HumanMessage: - """Since Danswer supports a variety of LLMs, this less demanding prompt is provided - as an option to use with weaker LLMs such as small version, low float precision, quantized, - or distilled models. It only uses one context document and has very weak requirements of - output format. 
- """ - context_block = "" - if context_docs: - context_block = CONTEXT_BLOCK.format(context_docs_str=context_docs[0].content) - - prompt_str = WEAK_LLM_PROMPT.format( - system_prompt=prompt.system_prompt, - context_block=context_block, - task_prompt=prompt.task_prompt, - user_query=question, - ) - - if prompt.datetime_aware: - prompt_str = add_date_time_to_prompt(prompt_str=prompt_str) - - if user_email: - prompt_str = add_employee_context_to_prompt( - prompt_str=prompt_str, user_email=user_email - ) - return HumanMessage(content=prompt_str) - def _build_strong_llm_quotes_prompt( question: str, @@ -102,42 +65,18 @@ def _build_strong_llm_quotes_prompt( def build_quotes_user_message( - question: str, + message: HumanMessage, context_docs: list[LlmDoc] | list[InferenceChunk], history_str: str, prompt: PromptConfig, user_email: str, ) -> HumanMessage: - prompt_builder = ( - _build_weak_llm_quotes_prompt - if QA_PROMPT_OVERRIDE == "weak" - else _build_strong_llm_quotes_prompt - ) + query, _ = message_to_prompt_and_imgs(message) - return prompt_builder( - question=question, + return _build_strong_llm_quotes_prompt( + question=query, context_docs=context_docs, history_str=history_str, prompt=prompt, user_email=user_email, ) - - -def build_quotes_prompt( - question: str, - context_docs: list[LlmDoc] | list[InferenceChunk], - history_str: str, - prompt: PromptConfig, -) -> HumanMessage: - prompt_builder = ( - _build_weak_llm_quotes_prompt - if QA_PROMPT_OVERRIDE == "weak" - else _build_strong_llm_quotes_prompt - ) - - return prompt_builder( - question=question, - context_docs=context_docs, - history_str=history_str, - prompt=prompt, - ) diff --git a/backend/danswer/llm/answering/prune_and_merge.py b/backend/danswer/llm/answering/prune_and_merge.py index 0193de1f2aa..21ea2226d97 100644 --- a/backend/danswer/llm/answering/prune_and_merge.py +++ b/backend/danswer/llm/answering/prune_and_merge.py @@ -10,6 +10,8 @@ ) from danswer.configs.constants import IGNORE_FOR_QA from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE +from danswer.context.search.models import InferenceChunk +from danswer.context.search.models import InferenceSection from danswer.llm.answering.models import ContextualPruningConfig from danswer.llm.answering.models import PromptConfig from danswer.llm.answering.prompts.citations_prompt import compute_max_document_tokens @@ -17,9 +19,7 @@ from danswer.natural_language_processing.utils import get_tokenizer from danswer.natural_language_processing.utils import tokenizer_trim_content from danswer.prompts.prompt_utils import build_doc_context_str -from danswer.search.models import InferenceChunk -from danswer.search.models import InferenceSection -from danswer.tools.search.search_utils import section_to_dict +from danswer.tools.tool_implementations.search.search_utils import section_to_dict from danswer.utils.logger import setup_logger diff --git a/backend/danswer/llm/answering/stream_processing/answer_response_handler.py b/backend/danswer/llm/answering/stream_processing/answer_response_handler.py new file mode 100644 index 00000000000..edb0c500a28 --- /dev/null +++ b/backend/danswer/llm/answering/stream_processing/answer_response_handler.py @@ -0,0 +1,97 @@ +import abc +from collections.abc import Generator + +from langchain_core.messages import BaseMessage + +from danswer.chat.models import CitationInfo +from danswer.chat.models import LlmDoc +from danswer.llm.answering.llm_response_handler import ResponsePart +from 
danswer.llm.answering.stream_processing.citation_processing import ( + CitationProcessor, +) +from danswer.llm.answering.stream_processing.quotes_processing import ( + QuotesProcessor, +) +from danswer.llm.answering.stream_processing.utils import DocumentIdOrderMapping +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +class AnswerResponseHandler(abc.ABC): + @abc.abstractmethod + def handle_response_part( + self, + response_item: BaseMessage | None, + previous_response_items: list[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + raise NotImplementedError + + +class DummyAnswerResponseHandler(AnswerResponseHandler): + def handle_response_part( + self, + response_item: BaseMessage | None, + previous_response_items: list[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + # This is a dummy handler that returns nothing + yield from [] + + +class CitationResponseHandler(AnswerResponseHandler): + def __init__( + self, context_docs: list[LlmDoc], doc_id_to_rank_map: DocumentIdOrderMapping + ): + self.context_docs = context_docs + self.doc_id_to_rank_map = doc_id_to_rank_map + self.citation_processor = CitationProcessor( + context_docs=self.context_docs, + doc_id_to_rank_map=self.doc_id_to_rank_map, + ) + self.processed_text = "" + self.citations: list[CitationInfo] = [] + + # TODO remove this after citation issue is resolved + logger.debug(f"Document to ranking map {self.doc_id_to_rank_map}") + + def handle_response_part( + self, + response_item: BaseMessage | None, + previous_response_items: list[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + if response_item is None: + return + + content = ( + response_item.content if isinstance(response_item.content, str) else "" + ) + + # Process the new content through the citation processor + yield from self.citation_processor.process_token(content) + + +class QuotesResponseHandler(AnswerResponseHandler): + def __init__( + self, + context_docs: list[LlmDoc], + is_json_prompt: bool = True, + ): + self.quotes_processor = QuotesProcessor( + context_docs=context_docs, + is_json_prompt=is_json_prompt, + ) + + def handle_response_part( + self, + response_item: BaseMessage | None, + previous_response_items: list[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + if response_item is None: + yield from self.quotes_processor.process_token(None) + return + + content = ( + response_item.content if isinstance(response_item.content, str) else "" + ) + + yield from self.quotes_processor.process_token(content) diff --git a/backend/danswer/llm/answering/stream_processing/citation_processing.py b/backend/danswer/llm/answering/stream_processing/citation_processing.py index a72fc70a8ff..950ad207878 100644 --- a/backend/danswer/llm/answering/stream_processing/citation_processing.py +++ b/backend/danswer/llm/answering/stream_processing/citation_processing.py @@ -1,12 +1,10 @@ import re -from collections.abc import Iterator +from collections.abc import Generator -from danswer.chat.models import AnswerQuestionStreamReturn from danswer.chat.models import CitationInfo from danswer.chat.models import DanswerAnswerPiece from danswer.chat.models import LlmDoc from danswer.configs.chat_configs import STOP_STREAM_PAT -from danswer.llm.answering.models import StreamProcessor from danswer.llm.answering.stream_processing.utils import DocumentIdOrderMapping from danswer.prompts.constants import TRIPLE_BACKTICK from danswer.utils.logger import setup_logger @@ -19,119 +17,104 @@ def in_code_block(llm_text: str) -> bool: 
return count % 2 != 0 -def extract_citations_from_stream( - tokens: Iterator[str], - context_docs: list[LlmDoc], - doc_id_to_rank_map: DocumentIdOrderMapping, - stop_stream: str | None = STOP_STREAM_PAT, -) -> Iterator[DanswerAnswerPiece | CitationInfo]: - """ - Key aspects: - - 1. Stream Processing: - - Processes tokens one by one, allowing for real-time handling of large texts. - - 2. Citation Detection: - - Uses regex to find citations in the format [number]. - - Example: [1], [2], etc. - - 3. Citation Mapping: - - Maps detected citation numbers to actual document ranks using doc_id_to_rank_map. - - Example: [1] might become [3] if doc_id_to_rank_map maps it to 3. - - 4. Citation Formatting: - - Replaces citations with properly formatted versions. - - Adds links if available: [[1]](https://example.com) - - Handles cases where links are not available: [[1]]() - - 5. Duplicate Handling: - - Skips consecutive citations of the same document to avoid redundancy. - - 6. Output Generation: - - Yields DanswerAnswerPiece objects for regular text. - - Yields CitationInfo objects for each unique citation encountered. - - 7. Context Awareness: - - Uses context_docs to access document information for citations. - - This function effectively processes a stream of text, identifies and reformats citations, - and provides both the processed text and citation information as output. - """ - order_mapping = doc_id_to_rank_map.order_mapping - llm_out = "" - max_citation_num = len(context_docs) - citation_order = [] - curr_segment = "" - cited_inds = set() - hold = "" - - raw_out = "" - current_citations: list[int] = [] - past_cite_count = 0 - for raw_token in tokens: - raw_out += raw_token - if stop_stream: - next_hold = hold + raw_token - if stop_stream in next_hold: - break - if next_hold == stop_stream[: len(next_hold)]: - hold = next_hold - continue +class CitationProcessor: + def __init__( + self, + context_docs: list[LlmDoc], + doc_id_to_rank_map: DocumentIdOrderMapping, + stop_stream: str | None = STOP_STREAM_PAT, + ): + self.context_docs = context_docs + self.doc_id_to_rank_map = doc_id_to_rank_map + self.stop_stream = stop_stream + self.order_mapping = doc_id_to_rank_map.order_mapping + self.llm_out = "" + self.max_citation_num = len(context_docs) + self.citation_order: list[int] = [] + self.curr_segment = "" + self.cited_inds: set[int] = set() + self.hold = "" + self.current_citations: list[int] = [] + self.past_cite_count = 0 + + def process_token( + self, token: str | None + ) -> Generator[DanswerAnswerPiece | CitationInfo, None, None]: + # None -> end of stream + if token is None: + yield DanswerAnswerPiece(answer_piece=self.curr_segment) + return + + if self.stop_stream: + next_hold = self.hold + token + if self.stop_stream in next_hold: + return + if next_hold == self.stop_stream[: len(next_hold)]: + self.hold = next_hold + return token = next_hold - hold = "" - else: - token = raw_token + self.hold = "" - curr_segment += token - llm_out += token + self.curr_segment += token + self.llm_out += token - citation_pattern = r"\[(\d+)\]" + # Handle code blocks without language tags + if "`" in self.curr_segment: + if self.curr_segment.endswith("`"): + return + elif "```" in self.curr_segment: + piece_that_comes_after = self.curr_segment.split("```")[1][0] + if piece_that_comes_after == "\n" and in_code_block(self.llm_out): + self.curr_segment = self.curr_segment.replace("```", "```plaintext") - citations_found = list(re.finditer(citation_pattern, curr_segment)) + citation_pattern = r"\[(\d+)\]" + 
citations_found = list(re.finditer(citation_pattern, self.curr_segment)) possible_citation_pattern = r"(\[\d*$)" # [1, [, etc - possible_citation_found = re.search(possible_citation_pattern, curr_segment) + possible_citation_found = re.search( + possible_citation_pattern, self.curr_segment + ) - # `past_cite_count`: number of characters since past citation - # 5 to ensure a citation hasn't occured - if len(citations_found) == 0 and len(llm_out) - past_cite_count > 5: - current_citations = [] + if len(citations_found) == 0 and len(self.llm_out) - self.past_cite_count > 5: + self.current_citations = [] - if citations_found and not in_code_block(llm_out): + result = "" # Initialize result here + if citations_found and not in_code_block(self.llm_out): last_citation_end = 0 length_to_add = 0 while len(citations_found) > 0: citation = citations_found.pop(0) numerical_value = int(citation.group(1)) - if 1 <= numerical_value <= max_citation_num: - context_llm_doc = context_docs[numerical_value - 1] - real_citation_num = order_mapping[context_llm_doc.document_id] + if 1 <= numerical_value <= self.max_citation_num: + context_llm_doc = self.context_docs[numerical_value - 1] + real_citation_num = self.order_mapping[context_llm_doc.document_id] - if real_citation_num not in citation_order: - citation_order.append(real_citation_num) + if real_citation_num not in self.citation_order: + self.citation_order.append(real_citation_num) - target_citation_num = citation_order.index(real_citation_num) + 1 + target_citation_num = ( + self.citation_order.index(real_citation_num) + 1 + ) # Skip consecutive citations of the same work - if target_citation_num in current_citations: + if target_citation_num in self.current_citations: start, end = citation.span() real_start = length_to_add + start diff = end - start - curr_segment = ( - curr_segment[: length_to_add + start] - + curr_segment[real_start + diff :] + self.curr_segment = ( + self.curr_segment[: length_to_add + start] + + self.curr_segment[real_start + diff :] ) length_to_add -= diff continue # Handle edge case where LLM outputs citation itself - # by allowing it to generate citations on its own. 
- if curr_segment.startswith("[["): - match = re.match(r"\[\[(\d+)\]\]", curr_segment) + if self.curr_segment.startswith("[["): + match = re.match(r"\[\[(\d+)\]\]", self.curr_segment) if match: try: doc_id = int(match.group(1)) - context_llm_doc = context_docs[doc_id - 1] + context_llm_doc = self.context_docs[doc_id - 1] yield CitationInfo( citation_num=target_citation_num, document_id=context_llm_doc.document_id, @@ -141,75 +124,57 @@ def extract_citations_from_stream( f"Manual LLM citation didn't properly cite documents {e}" ) else: - # Will continue attempt on next loops logger.warning( "Manual LLM citation wasn't able to close brackets" ) - continue link = context_llm_doc.link # Replace the citation in the current segment start, end = citation.span() - curr_segment = ( - curr_segment[: start + length_to_add] + self.curr_segment = ( + self.curr_segment[: start + length_to_add] + f"[{target_citation_num}]" - + curr_segment[end + length_to_add :] + + self.curr_segment[end + length_to_add :] ) - past_cite_count = len(llm_out) - current_citations.append(target_citation_num) + self.past_cite_count = len(self.llm_out) + self.current_citations.append(target_citation_num) - if target_citation_num not in cited_inds: - cited_inds.add(target_citation_num) + if target_citation_num not in self.cited_inds: + self.cited_inds.add(target_citation_num) yield CitationInfo( citation_num=target_citation_num, document_id=context_llm_doc.document_id, ) if link: - prev_length = len(curr_segment) - curr_segment = ( - curr_segment[: start + length_to_add] + prev_length = len(self.curr_segment) + self.curr_segment = ( + self.curr_segment[: start + length_to_add] + f"[[{target_citation_num}]]({link})" - + curr_segment[end + length_to_add :] + + self.curr_segment[end + length_to_add :] ) - length_to_add += len(curr_segment) - prev_length - + length_to_add += len(self.curr_segment) - prev_length else: - prev_length = len(curr_segment) - curr_segment = ( - curr_segment[: start + length_to_add] + prev_length = len(self.curr_segment) + self.curr_segment = ( + self.curr_segment[: start + length_to_add] + f"[[{target_citation_num}]]()" - + curr_segment[end + length_to_add :] + + self.curr_segment[end + length_to_add :] ) - length_to_add += len(curr_segment) - prev_length + length_to_add += len(self.curr_segment) - prev_length last_citation_end = end + length_to_add if last_citation_end > 0: - yield DanswerAnswerPiece(answer_piece=curr_segment[:last_citation_end]) - curr_segment = curr_segment[last_citation_end:] - if possible_citation_found: - continue - yield DanswerAnswerPiece(answer_piece=curr_segment) - curr_segment = "" - - if curr_segment: - yield DanswerAnswerPiece(answer_piece=curr_segment) - - -def build_citation_processor( - context_docs: list[LlmDoc], doc_id_to_rank_map: DocumentIdOrderMapping -) -> StreamProcessor: - def stream_processor( - tokens: Iterator[str], - ) -> AnswerQuestionStreamReturn: - yield from extract_citations_from_stream( - tokens=tokens, - context_docs=context_docs, - doc_id_to_rank_map=doc_id_to_rank_map, - ) + result += self.curr_segment[:last_citation_end] + self.curr_segment = self.curr_segment[last_citation_end:] + + if not possible_citation_found: + result += self.curr_segment + self.curr_segment = "" - return stream_processor + if result: + yield DanswerAnswerPiece(answer_piece=result) diff --git a/backend/danswer/llm/answering/stream_processing/quotes_processing.py b/backend/danswer/llm/answering/stream_processing/quotes_processing.py index 501a56b5aa7..1f1afc1aaba 100644 --- 
a/backend/danswer/llm/answering/stream_processing/quotes_processing.py +++ b/backend/danswer/llm/answering/stream_processing/quotes_processing.py @@ -1,23 +1,20 @@ import math import re -from collections.abc import Callable from collections.abc import Generator -from collections.abc import Iterator from json import JSONDecodeError from typing import Optional import regex -from danswer.chat.models import AnswerQuestionStreamReturn from danswer.chat.models import DanswerAnswer from danswer.chat.models import DanswerAnswerPiece from danswer.chat.models import DanswerQuote from danswer.chat.models import DanswerQuotes from danswer.chat.models import LlmDoc from danswer.configs.chat_configs import QUOTE_ALLOWED_ERROR_PERCENT +from danswer.context.search.models import InferenceChunk from danswer.prompts.constants import ANSWER_PAT from danswer.prompts.constants import QUOTE_PAT -from danswer.search.models import InferenceChunk from danswer.utils.logger import setup_logger from danswer.utils.text_processing import clean_model_quote from danswer.utils.text_processing import clean_up_code_blocks @@ -157,7 +154,7 @@ def separate_answer_quotes( return _extract_answer_quotes_freeform(clean_up_code_blocks(answer_raw)) -def process_answer( +def _process_answer( answer_raw: str, docs: list[LlmDoc], is_json_prompt: bool = True, @@ -195,7 +192,7 @@ def _stream_json_answer_end(answer_so_far: str, next_token: str) -> bool: def _extract_quotes_from_completed_token_stream( model_output: str, context_docs: list[LlmDoc], is_json_prompt: bool = True ) -> DanswerQuotes: - answer, quotes = process_answer(model_output, context_docs, is_json_prompt) + answer, quotes = _process_answer(model_output, context_docs, is_json_prompt) if answer: logger.notice(answer) elif model_output: @@ -204,94 +201,101 @@ def _extract_quotes_from_completed_token_stream( return quotes -def process_model_tokens( - tokens: Iterator[str], - context_docs: list[LlmDoc], - is_json_prompt: bool = True, -) -> Generator[DanswerAnswerPiece | DanswerQuotes, None, None]: - """Used in the streaming case to process the model output - into an Answer and Quotes - - Yields Answer tokens back out in a dict for streaming to frontend - When Answer section ends, yields dict with answer_finished key - Collects all the tokens at the end to form the complete model output""" - quote_pat = f"\n{QUOTE_PAT}" - # Sometimes worse model outputs new line instead of : - quote_loose = f"\n{quote_pat[:-1]}\n" - # Sometime model outputs two newlines before quote section - quote_pat_full = f"\n{quote_pat}" - model_output: str = "" - found_answer_start = False if is_json_prompt else True - found_answer_end = False - hold_quote = "" - - for token in tokens: - model_previous = model_output - model_output += token - - if not found_answer_start: - m = answer_pattern.search(model_output) +class QuotesProcessor: + def __init__( + self, + context_docs: list[LlmDoc], + is_json_prompt: bool = True, + ): + self.context_docs = context_docs + self.is_json_prompt = is_json_prompt + + self.found_answer_start = False if is_json_prompt else True + self.found_answer_end = False + self.hold_quote = "" + self.model_output = "" + self.hold = "" + + def process_token( + self, token: str | None + ) -> Generator[DanswerAnswerPiece | DanswerQuotes, None, None]: + # None -> end of stream + if token is None: + if self.model_output: + yield _extract_quotes_from_completed_token_stream( + model_output=self.model_output, + context_docs=self.context_docs, + is_json_prompt=self.is_json_prompt, + ) + return + 
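+ # Live token path: extend the running transcript, then detect where the answer section starts and ends so that only answer text is streamed back.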
+ model_previous = self.model_output + self.model_output += token + if not self.found_answer_start: + m = answer_pattern.search(self.model_output) if m: - found_answer_start = True + self.found_answer_start = True - # Prevent heavy cases of hallucinations where model is never providing a JSON - # We want to quickly update the user - not stream forever - if is_json_prompt and len(model_output) > 70: + # Prevent heavy cases of hallucinations + if self.is_json_prompt and len(self.model_output) > 400: + self.found_answer_end = True logger.warning("LLM did not produce json as prompted") - found_answer_end = True - continue - - remaining = model_output[m.end() :] + logger.debug("Model output thus far:", self.model_output) + return + + remaining = self.model_output[m.end() :] + + # Look for an unescaped quote, which means the answer is entirely contained + # in this token e.g. if the token is `{"answer": "blah", "qu` + quote_indices = [i for i, char in enumerate(remaining) if char == '"'] + for quote_idx in quote_indices: + # Check if quote is escaped by counting backslashes before it + num_backslashes = 0 + pos = quote_idx - 1 + while pos >= 0 and remaining[pos] == "\\": + num_backslashes += 1 + pos -= 1 + # If even number of backslashes, quote is not escaped + if num_backslashes % 2 == 0: + yield DanswerAnswerPiece(answer_piece=remaining[:quote_idx]) + return + + # If no unescaped quote found, yield the remaining string if len(remaining) > 0: yield DanswerAnswerPiece(answer_piece=remaining) - continue + return - if found_answer_start and not found_answer_end: - if is_json_prompt and _stream_json_answer_end(model_previous, token): - found_answer_end = True + if self.found_answer_start and not self.found_answer_end: + if self.is_json_prompt and _stream_json_answer_end(model_previous, token): + self.found_answer_end = True - # return the remaining part of the answer e.g. token might be 'd.", ' and we should yield 'd.' 
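+ # The token that closes the answer may also contain the start of the next JSON field (e.g. 'd.", '), so yield only the text before the closing quotation mark.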
if token: try: answer_token_section = token.index('"') yield DanswerAnswerPiece( - answer_piece=hold_quote + token[:answer_token_section] + answer_piece=self.hold_quote + token[:answer_token_section] ) except ValueError: logger.error("Quotation mark not found in token") - yield DanswerAnswerPiece(answer_piece=hold_quote + token) + yield DanswerAnswerPiece(answer_piece=self.hold_quote + token) yield DanswerAnswerPiece(answer_piece=None) - continue - elif not is_json_prompt: - if quote_pat in hold_quote + token or quote_loose in hold_quote + token: - found_answer_end = True + return + + elif not self.is_json_prompt: + quote_pat = f"\n{QUOTE_PAT}" + quote_loose = f"\n{quote_pat[:-1]}\n" + quote_pat_full = f"\n{quote_pat}" + + if ( + quote_pat in self.hold_quote + token + or quote_loose in self.hold_quote + token + ): + self.found_answer_end = True yield DanswerAnswerPiece(answer_piece=None) - continue - if hold_quote + token in quote_pat_full: - hold_quote += token - continue - yield DanswerAnswerPiece(answer_piece=hold_quote + token) - hold_quote = "" - - logger.debug(f"Raw Model QnA Output: {model_output}") - - yield _extract_quotes_from_completed_token_stream( - model_output=model_output, - context_docs=context_docs, - is_json_prompt=is_json_prompt, - ) - - -def build_quotes_processor( - context_docs: list[LlmDoc], is_json_prompt: bool -) -> Callable[[Iterator[str]], AnswerQuestionStreamReturn]: - def stream_processor( - tokens: Iterator[str], - ) -> AnswerQuestionStreamReturn: - yield from process_model_tokens( - tokens=tokens, - context_docs=context_docs, - is_json_prompt=is_json_prompt, - ) - - return stream_processor + return + if self.hold_quote + token in quote_pat_full: + self.hold_quote += token + return + + yield DanswerAnswerPiece(answer_piece=self.hold_quote + token) + self.hold_quote = "" diff --git a/backend/danswer/llm/answering/stream_processing/utils.py b/backend/danswer/llm/answering/stream_processing/utils.py index b4fb83747de..4f328fc7240 100644 --- a/backend/danswer/llm/answering/stream_processing/utils.py +++ b/backend/danswer/llm/answering/stream_processing/utils.py @@ -3,7 +3,7 @@ from pydantic import BaseModel from danswer.chat.models import LlmDoc -from danswer.search.models import InferenceChunk +from danswer.context.search.models import InferenceChunk class DocumentIdOrderMapping(BaseModel): diff --git a/backend/danswer/llm/answering/tool/tool_response_handler.py b/backend/danswer/llm/answering/tool/tool_response_handler.py new file mode 100644 index 00000000000..d9ea1d5a936 --- /dev/null +++ b/backend/danswer/llm/answering/tool/tool_response_handler.py @@ -0,0 +1,199 @@ +from collections.abc import Generator + +from danswer.llm.answering.llm_response_handler import LLMCall, ResponsePart +from danswer.llm.interfaces import LLM +from danswer.tools.force import ForceUseTool +from danswer.tools.message import ToolCallSummary, build_tool_message +from danswer.tools.models import ToolCallFinalResult, ToolCallKickoff, ToolResponse +from danswer.tools.tool import Tool +from danswer.tools.tool_runner import ( + ToolRunner, + check_which_tools_should_run_for_non_tool_calling_llm, +) +from danswer.tools.tool_selection import select_single_tool_for_non_tool_calling_llm +from danswer.utils.logger import setup_logger +from langchain_core.messages import AIMessageChunk, BaseMessage, ToolCall + +logger = setup_logger() + + +class ToolResponseHandler: + def __init__(self, tools: list[Tool]): + self.tools = tools + + self.tool_call_chunk: AIMessageChunk | None = None + 
self.tool_call_requests: list[ToolCall] = [] + + self.tool_runner: ToolRunner | None = None + self.tool_call_summary: ToolCallSummary | None = None + + self.tool_kickoff: ToolCallKickoff | None = None + self.tool_responses: list[ToolResponse] = [] + self.tool_final_result: ToolCallFinalResult | None = None + + @classmethod + def get_tool_call_for_non_tool_calling_llm( + cls, llm_call: LLMCall, llm: LLM + ) -> tuple[Tool, dict] | None: + if llm_call.force_use_tool.force_use: + # if we are forcing a tool, we don't need to check which tools to run + tool = next( + ( + t + for t in llm_call.tools + if t.name == llm_call.force_use_tool.tool_name + ), + None, + ) + if not tool: + raise RuntimeError( + f"Tool '{llm_call.force_use_tool.tool_name}' not found" + ) + + tool_args = ( + llm_call.force_use_tool.args + if llm_call.force_use_tool.args is not None + else tool.get_args_for_non_tool_calling_llm( + query=llm_call.prompt_builder.raw_user_message, + history=llm_call.prompt_builder.raw_message_history, + llm=llm, + force_run=True, + ) + ) + + if tool_args is None: + raise RuntimeError(f"Tool '{tool.name}' did not return args") + + return (tool, tool_args) + else: + tool_options = check_which_tools_should_run_for_non_tool_calling_llm( + tools=llm_call.tools, + query=llm_call.prompt_builder.raw_user_message, + history=llm_call.prompt_builder.raw_message_history, + llm=llm, + ) + + available_tools_and_args = [ + (llm_call.tools[ind], args) + for ind, args in enumerate(tool_options) + if args is not None + ] + + logger.info( + f"Selecting single tool from tools: {[(tool.name, args) for tool, args in available_tools_and_args]}" + ) + + chosen_tool_and_args = ( + select_single_tool_for_non_tool_calling_llm( + tools_and_args=available_tools_and_args, + history=llm_call.prompt_builder.raw_message_history, + query=llm_call.prompt_builder.raw_user_message, + llm=llm, + ) + if available_tools_and_args + else None + ) + + logger.notice(f"Chosen tool: {chosen_tool_and_args}") + return chosen_tool_and_args + + def _handle_tool_call(self) -> Generator[ResponsePart, None, None]: + if not self.tool_call_chunk or not self.tool_call_chunk.tool_calls: + return + + self.tool_call_requests = self.tool_call_chunk.tool_calls + + selected_tool: Tool | None = None + selected_tool_call_request: ToolCall | None = None + for tool_call_request in self.tool_call_requests: + known_tools_by_name = [ + tool for tool in self.tools if tool.name == tool_call_request["name"] + ] + + if not known_tools_by_name: + logger.error( + "Tool call requested with unknown name field. 
\n" + f"self.tools: {self.tools}" + f"tool_call_request: {tool_call_request}" + ) + continue + else: + selected_tool = known_tools_by_name[0] + selected_tool_call_request = tool_call_request + + if selected_tool and selected_tool_call_request: + break + + if not selected_tool or not selected_tool_call_request: + return + + logger.info(f"Selected tool: {selected_tool.name}") + logger.debug(f"Selected tool call request: {selected_tool_call_request}") + self.tool_runner = ToolRunner(selected_tool, selected_tool_call_request["args"]) + self.tool_kickoff = self.tool_runner.kickoff() + yield self.tool_kickoff + + for response in self.tool_runner.tool_responses(): + self.tool_responses.append(response) + yield response + + self.tool_final_result = self.tool_runner.tool_final_result() + yield self.tool_final_result + + self.tool_call_summary = ToolCallSummary( + tool_call_request=self.tool_call_chunk, + tool_call_result=build_tool_message( + selected_tool_call_request, self.tool_runner.tool_message_content() + ), + ) + + def handle_response_part( + self, + response_item: BaseMessage | None, + previous_response_items: list[BaseMessage], + ) -> Generator[ResponsePart, None, None]: + if response_item is None: + yield from self._handle_tool_call() + + if isinstance(response_item, AIMessageChunk) and ( + response_item.tool_call_chunks or response_item.tool_calls + ): + if self.tool_call_chunk is None: + self.tool_call_chunk = response_item + else: + self.tool_call_chunk += response_item # type: ignore + + return + + def next_llm_call(self, current_llm_call: LLMCall) -> LLMCall | None: + if ( + self.tool_runner is None + or self.tool_call_summary is None + or self.tool_kickoff is None + or self.tool_final_result is None + ): + return None + + tool_runner = self.tool_runner + new_prompt_builder = tool_runner.tool.build_next_prompt( + prompt_builder=current_llm_call.prompt_builder, + tool_call_summary=self.tool_call_summary, + tool_responses=self.tool_responses, + using_tool_calling_llm=current_llm_call.using_tool_calling_llm, + ) + return LLMCall( + prompt_builder=new_prompt_builder, + tools=[], # for now, only allow one tool call per response + force_use_tool=ForceUseTool( + force_use=False, + tool_name="", + args=None, + ), + files=current_llm_call.files, + using_tool_calling_llm=current_llm_call.using_tool_calling_llm, + tool_call_info=[ + self.tool_kickoff, + *self.tool_responses, + self.tool_final_result, + ], + ) diff --git a/backend/danswer/llm/chat_llm.py b/backend/danswer/llm/chat_llm.py index 08131f581a4..031fcd7163a 100644 --- a/backend/danswer/llm/chat_llm.py +++ b/backend/danswer/llm/chat_llm.py @@ -1,6 +1,8 @@ import json import os +import traceback from collections.abc import Iterator +from collections.abc import Sequence from typing import Any from typing import cast @@ -21,15 +23,18 @@ from langchain_core.messages import SystemMessageChunk from langchain_core.messages.tool import ToolCallChunk from langchain_core.messages.tool import ToolMessage +from langchain_core.prompt_values import PromptValue -from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS from danswer.configs.app_configs import LOG_DANSWER_MODEL_INTERACTIONS from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING from danswer.configs.model_configs import GEN_AI_TEMPERATURE +from danswer.configs.model_configs import LITELLM_EXTRA_BODY from danswer.llm.interfaces import LLM from danswer.llm.interfaces import LLMConfig from danswer.llm.interfaces import ToolChoiceOptions +from danswer.server.utils 
import mask_string from danswer.utils.logger import setup_logger +from danswer.utils.long_term_log import LongTermLogger logger = setup_logger() @@ -39,7 +44,7 @@ litellm.drop_params = True litellm.telemetry = False -litellm.set_verbose = LOG_ALL_MODEL_INTERACTIONS +_LLM_PROMPT_LONG_TERM_LOG_CATEGORY = "llm_prompt" def _base_msg_to_role(msg: BaseMessage) -> str: @@ -83,8 +88,10 @@ def _convert_litellm_message_to_langchain_message( "args": json.loads(tool_call.function.arguments), "id": tool_call.id, } - for tool_call in (tool_calls if tool_calls else []) - ], + for tool_call in tool_calls + ] + if tool_calls + else [], ) elif role == "system": return SystemMessage(content=content) @@ -109,7 +116,7 @@ def _convert_message_to_dict(message: BaseMessage) -> dict: "arguments": json.dumps(tool_call["args"]), }, "type": "function", - "index": 0, # only support a single tool call atm + "index": tool_call.get("index", 0), } for tool_call in message.tool_calls ] @@ -158,12 +165,13 @@ def _convert_delta_to_message_chunk( if tool_calls: tool_call = tool_calls[0] tool_name = tool_call.function.name or (curr_msg and curr_msg.name) or "" + idx = tool_call.index tool_call_chunk = ToolCallChunk( name=tool_name, id=tool_call.id, args=tool_call.function.arguments, - index=0, # only support a single tool call atm + index=idx, ) return AIMessageChunk( @@ -192,6 +200,23 @@ def _convert_delta_to_message_chunk( raise ValueError(f"Unknown role: {role}") +def _prompt_to_dict( + prompt: LanguageModelInput, +) -> Sequence[str | list[str] | dict[str, Any] | tuple[str, str]]: + # NOTE: this must go first, since it is also a Sequence + if isinstance(prompt, str): + return [_convert_message_to_dict(HumanMessage(content=prompt))] + + if isinstance(prompt, (list, Sequence)): + return [ + _convert_message_to_dict(msg) if isinstance(msg, BaseMessage) else msg + for msg in prompt + ] + + if isinstance(prompt, PromptValue): + return [_convert_message_to_dict(message) for message in prompt.to_messages()] + + class DefaultMultiLLM(LLM): """Uses Litellm library to allow easy configuration to use a multitude of LLMs See https://python.langchain.com/docs/integrations/chat/litellm""" @@ -204,20 +229,25 @@ def __init__( model_name: str, api_base: str | None = None, api_version: str | None = None, + deployment_name: str | None = None, max_output_tokens: int | None = None, custom_llm_provider: str | None = None, temperature: float = GEN_AI_TEMPERATURE, custom_config: dict[str, str] | None = None, extra_headers: dict[str, str] | None = None, + extra_body: dict | None = LITELLM_EXTRA_BODY, + long_term_logger: LongTermLogger | None = None, ): self._timeout = timeout self._model_provider = model_provider self._model_version = model_name self._temperature = temperature self._api_key = api_key + self._deployment_name = deployment_name self._api_base = api_base self._api_version = api_version self._custom_llm_provider = custom_llm_provider + self._long_term_logger = long_term_logger # This can be used to store the maximum output tokens for this model. 
# self._max_output_tokens = ( @@ -241,12 +271,60 @@ def __init__( model_kwargs: dict[str, Any] = {} if extra_headers: model_kwargs.update({"extra_headers": extra_headers}) + if extra_body: + model_kwargs.update({"extra_body": extra_body}) self._model_kwargs = model_kwargs def log_model_configs(self) -> None: logger.debug(f"Config: {self.config}") + def _safe_model_config(self) -> dict: + dump = self.config.model_dump() + dump["api_key"] = mask_string(dump.get("api_key", "")) + return dump + + def _record_call(self, prompt: LanguageModelInput) -> None: + if self._long_term_logger: + self._long_term_logger.record( + {"prompt": _prompt_to_dict(prompt), "model": self._safe_model_config()}, + category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY, + ) + + def _record_result( + self, prompt: LanguageModelInput, model_output: BaseMessage + ) -> None: + if self._long_term_logger: + self._long_term_logger.record( + { + "prompt": _prompt_to_dict(prompt), + "content": model_output.content, + "tool_calls": ( + model_output.tool_calls + if hasattr(model_output, "tool_calls") + else [] + ), + "model": self._safe_model_config(), + }, + category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY, + ) + + def _record_error(self, prompt: LanguageModelInput, error: Exception) -> None: + if self._long_term_logger: + self._long_term_logger.record( + { + "prompt": _prompt_to_dict(prompt), + "error": str(error), + "traceback": "".join( + traceback.format_exception( + type(error), error, error.__traceback__ + ) + ), + "model": self._safe_model_config(), + }, + category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY, + ) + # def _calculate_max_output_tokens(self, prompt: LanguageModelInput) -> int: # # NOTE: This method can be used for calculating the maximum tokens for the stream, # # but it isn't used in practice due to the computational cost of counting tokens @@ -277,25 +355,25 @@ def _completion( tools: list[dict] | None, tool_choice: ToolChoiceOptions | None, stream: bool, + structured_response_format: dict | None = None, ) -> litellm.ModelResponse | litellm.CustomStreamWrapper: - if isinstance(prompt, list): - prompt = [ - _convert_message_to_dict(msg) if isinstance(msg, BaseMessage) else msg - for msg in prompt - ] - elif isinstance(prompt, str): - prompt = [_convert_message_to_dict(HumanMessage(content=prompt))] + # litellm doesn't accept LangChain BaseMessage objects, so we need to convert them + # to a dict representation + processed_prompt = _prompt_to_dict(prompt) + self._record_call(processed_prompt) try: return litellm.completion( # model choice - model=f"{self.config.model_provider}/{self.config.model_name}", - api_key=self._api_key, - base_url=self._api_base, - api_version=self._api_version, - custom_llm_provider=self._custom_llm_provider, + model=f"{self.config.model_provider}/{self.config.deployment_name or self.config.model_name}", + # NOTE: have to pass in None instead of empty string for these + # otherwise litellm can have some issues with bedrock + api_key=self._api_key or None, + base_url=self._api_base or None, + api_version=self._api_version or None, + custom_llm_provider=self._custom_llm_provider or None, # actual input - messages=prompt, + messages=processed_prompt, tools=tools, tool_choice=tool_choice if tools else None, # streaming choice @@ -307,9 +385,15 @@ def _completion( # NOTE: we can't pass this in if tools are not specified # or else OpenAI throws an error **({"parallel_tool_calls": False} if tools else {}), + **( + {"response_format": structured_response_format} + if structured_response_format + else {} + ), 
**self._model_kwargs, ) except Exception as e: + self._record_error(processed_prompt, e) # for break pointing raise e @@ -322,6 +406,7 @@ def config(self) -> LLMConfig: api_key=self._api_key, api_base=self._api_base, api_version=self._api_version, + deployment_name=self._deployment_name, ) def _invoke_implementation( @@ -329,16 +414,23 @@ def _invoke_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> BaseMessage: if LOG_DANSWER_MODEL_INTERACTIONS: self.log_model_configs() response = cast( - litellm.ModelResponse, self._completion(prompt, tools, tool_choice, False) + litellm.ModelResponse, + self._completion( + prompt, tools, tool_choice, False, structured_response_format + ), ) choice = response.choices[0] if hasattr(choice, "message"): - return _convert_litellm_message_to_langchain_message(choice.message) + output = _convert_litellm_message_to_langchain_message(choice.message) + if output: + self._record_result(prompt, output) + return output else: raise ValueError("Unexpected response choice type") @@ -347,18 +439,21 @@ def _stream_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> Iterator[BaseMessage]: if LOG_DANSWER_MODEL_INTERACTIONS: self.log_model_configs() if DISABLE_LITELLM_STREAMING: - yield self.invoke(prompt) + yield self.invoke(prompt, tools, tool_choice, structured_response_format) return output = None response = cast( litellm.CustomStreamWrapper, - self._completion(prompt, tools, tool_choice, True), + self._completion( + prompt, tools, tool_choice, True, structured_response_format + ), ) try: for part in response: @@ -384,6 +479,9 @@ def _stream_implementation( "The AI model failed partway through generation, please try again." 
) + if output: + self._record_result(prompt, output) + if LOG_DANSWER_MODEL_INTERACTIONS and output: content = output.content or "" if isinstance(output, AIMessage): diff --git a/backend/danswer/llm/custom_llm.py b/backend/danswer/llm/custom_llm.py index 4a5ba7857c3..6b80406cf2f 100644 --- a/backend/danswer/llm/custom_llm.py +++ b/backend/danswer/llm/custom_llm.py @@ -80,6 +80,7 @@ def _invoke_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> BaseMessage: return self._execute(prompt) @@ -88,5 +89,6 @@ def _stream_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> Iterator[BaseMessage]: yield self._execute(prompt) diff --git a/backend/danswer/llm/factory.py b/backend/danswer/llm/factory.py index f57bfb524b9..9a2ae66d396 100644 --- a/backend/danswer/llm/factory.py +++ b/backend/danswer/llm/factory.py @@ -7,9 +7,10 @@ from danswer.db.models import Persona from danswer.llm.chat_llm import DefaultMultiLLM from danswer.llm.exceptions import GenAIDisabledException -from danswer.llm.headers import build_llm_extra_headers from danswer.llm.interfaces import LLM from danswer.llm.override_models import LLMOverride +from danswer.utils.headers import build_llm_extra_headers +from danswer.utils.long_term_log import LongTermLogger def get_main_llm_from_tuple( @@ -22,6 +23,7 @@ def get_llms_for_persona( persona: Persona, llm_override: LLMOverride | None = None, additional_headers: dict[str, str] | None = None, + long_term_logger: LongTermLogger | None = None, ) -> tuple[LLM, LLM]: model_provider_override = llm_override.model_provider if llm_override else None model_version_override = llm_override.model_version if llm_override else None @@ -32,6 +34,7 @@ def get_llms_for_persona( return get_default_llms( temperature=temperature_override or GEN_AI_TEMPERATURE, additional_headers=additional_headers, + long_term_logger=long_term_logger, ) with get_session_context_manager() as db_session: @@ -51,11 +54,13 @@ def _create_llm(model: str) -> LLM: return get_llm( provider=llm_provider.provider, model=model, + deployment_name=llm_provider.deployment_name, api_key=llm_provider.api_key, api_base=llm_provider.api_base, api_version=llm_provider.api_version, custom_config=llm_provider.custom_config, additional_headers=additional_headers, + long_term_logger=long_term_logger, ) return _create_llm(model), _create_llm(fast_model) @@ -65,6 +70,7 @@ def get_default_llms( timeout: int = QA_TIMEOUT, temperature: float = GEN_AI_TEMPERATURE, additional_headers: dict[str, str] | None = None, + long_term_logger: LongTermLogger | None = None, ) -> tuple[LLM, LLM]: if DISABLE_GENERATIVE_AI: raise GenAIDisabledException() @@ -88,6 +94,7 @@ def _create_llm(model: str) -> LLM: return get_llm( provider=llm_provider.provider, model=model, + deployment_name=llm_provider.deployment_name, api_key=llm_provider.api_key, api_base=llm_provider.api_base, api_version=llm_provider.api_version, @@ -95,6 +102,7 @@ def _create_llm(model: str) -> LLM: timeout=timeout, temperature=temperature, additional_headers=additional_headers, + long_term_logger=long_term_logger, ) return _create_llm(model_name), _create_llm(fast_model_name) @@ -103,6 +111,7 @@ def _create_llm(model: str) -> LLM: def get_llm( provider: str, model: str, + deployment_name: str | None, api_key: str | None = None, api_base: str | None = 
None, api_version: str | None = None, @@ -110,10 +119,12 @@ def get_llm( temperature: float = GEN_AI_TEMPERATURE, timeout: int = QA_TIMEOUT, additional_headers: dict[str, str] | None = None, + long_term_logger: LongTermLogger | None = None, ) -> LLM: return DefaultMultiLLM( model_provider=provider, model_name=model, + deployment_name=deployment_name, api_key=api_key, api_base=api_base, api_version=api_version, @@ -121,4 +132,5 @@ def get_llm( temperature=temperature, custom_config=custom_config, extra_headers=build_llm_extra_headers(additional_headers), + long_term_logger=long_term_logger, ) diff --git a/backend/danswer/llm/headers.py b/backend/danswer/llm/headers.py deleted file mode 100644 index b43c83e141e..00000000000 --- a/backend/danswer/llm/headers.py +++ /dev/null @@ -1,34 +0,0 @@ -from fastapi.datastructures import Headers - -from danswer.configs.model_configs import LITELLM_EXTRA_HEADERS -from danswer.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS - - -def get_litellm_additional_request_headers( - headers: dict[str, str] | Headers -) -> dict[str, str]: - if not LITELLM_PASS_THROUGH_HEADERS: - return {} - - pass_through_headers: dict[str, str] = {} - for key in LITELLM_PASS_THROUGH_HEADERS: - if key in headers: - pass_through_headers[key] = headers[key] - else: - # fastapi makes all header keys lowercase, handling that here - lowercase_key = key.lower() - if lowercase_key in headers: - pass_through_headers[lowercase_key] = headers[lowercase_key] - - return pass_through_headers - - -def build_llm_extra_headers( - additional_headers: dict[str, str] | None = None -) -> dict[str, str]: - extra_headers: dict[str, str] = {} - if additional_headers: - extra_headers.update(additional_headers) - if LITELLM_EXTRA_HEADERS: - extra_headers.update(LITELLM_EXTRA_HEADERS) - return extra_headers diff --git a/backend/danswer/llm/interfaces.py b/backend/danswer/llm/interfaces.py index f3775c2c61b..bbbaafebb9a 100644 --- a/backend/danswer/llm/interfaces.py +++ b/backend/danswer/llm/interfaces.py @@ -2,16 +2,16 @@ from collections.abc import Iterator from typing import Literal +from danswer.configs.app_configs import ( + DISABLE_GENERATIVE_AI, + LOG_DANSWER_MODEL_INTERACTIONS, + LOG_INDIVIDUAL_MODEL_TOKENS, +) +from danswer.utils.logger import setup_logger from langchain.schema.language_model import LanguageModelInput -from langchain_core.messages import AIMessageChunk -from langchain_core.messages import BaseMessage +from langchain_core.messages import AIMessageChunk, BaseMessage from pydantic import BaseModel -from danswer.configs.app_configs import DISABLE_GENERATIVE_AI -from danswer.configs.app_configs import LOG_DANSWER_MODEL_INTERACTIONS -from danswer.utils.logger import setup_logger - - logger = setup_logger() ToolChoiceOptions = Literal["required"] | Literal["auto"] | Literal["none"] @@ -24,7 +24,7 @@ class LLMConfig(BaseModel): api_key: str | None = None api_base: str | None = None api_version: str | None = None - + deployment_name: str | None = None # This disables the "model_" protected namespace for pydantic model_config = {"protected_namespaces": ()} @@ -55,12 +55,6 @@ def log_prompt(prompt: LanguageModelInput) -> None: logger.debug(f"Prompt:\n{prompt}") -class LLMConfig(BaseModel): - model_provider: str - model_name: str - temperature: float - - class LLM(abc.ABC): """Mimics the LangChain LLM / BaseChatModel interfaces to make it easy to use these implementations to connect to a variety of LLM providers.""" @@ -94,11 +88,14 @@ def invoke( prompt: LanguageModelInput, tools: 
list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> BaseMessage: self._precall(prompt) # TODO add a postcall to log model outputs independent of concrete class # implementation - return self._invoke_implementation(prompt, tools, tool_choice) + return self._invoke_implementation( + prompt, tools, tool_choice, structured_response_format + ) @abc.abstractmethod def _invoke_implementation( @@ -106,6 +103,7 @@ def _invoke_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> BaseMessage: raise NotImplementedError @@ -114,11 +112,23 @@ def stream( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> Iterator[BaseMessage]: self._precall(prompt) # TODO add a postcall to log model outputs independent of concrete class # implementation - return self._stream_implementation(prompt, tools, tool_choice) + messages = self._stream_implementation( + prompt, tools, tool_choice, structured_response_format + ) + + tokens = [] + for message in messages: + if LOG_INDIVIDUAL_MODEL_TOKENS: + tokens.append(message.content) + yield message + + if LOG_INDIVIDUAL_MODEL_TOKENS and tokens: + logger.debug(f"Model Tokens: {tokens}") @abc.abstractmethod def _stream_implementation( @@ -126,5 +136,6 @@ def _stream_implementation( prompt: LanguageModelInput, tools: list[dict] | None = None, tool_choice: ToolChoiceOptions | None = None, + structured_response_format: dict | None = None, ) -> Iterator[BaseMessage]: raise NotImplementedError diff --git a/backend/danswer/llm/llm_provider_options.py b/backend/danswer/llm/llm_provider_options.py index 1bcfdf7e506..cf562ee5a27 100644 --- a/backend/danswer/llm/llm_provider_options.py +++ b/backend/danswer/llm/llm_provider_options.py @@ -16,14 +16,19 @@ class WellKnownLLMProviderDescriptor(BaseModel): api_base_required: bool api_version_required: bool custom_config_keys: list[CustomConfigKey] | None = None - llm_names: list[str] default_model: str | None = None default_fast_model: str | None = None + # set for providers like Azure, which require a deployment name. + deployment_name_required: bool = False + # set for providers like Azure, which support a single model per deployment. 
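Reviewer note: `structured_response_format` is now threaded from `LLM.invoke`/`LLM.stream` down to litellm's `response_format`. A hedged usage sketch, assuming `llm` is an already-configured `DefaultMultiLLM` backed by a provider that honors OpenAI-style JSON mode; the prompts are illustrative only:

```python
# Hedged usage sketch of the new structured_response_format parameter.
json_mode = {"type": "json_object"}

message = llm.invoke(
    "Summarize the outage in JSON with keys 'cause' and 'impact'.",
    structured_response_format=json_mode,
)
print(message.content)  # expected to be a JSON object string

# streaming accepts the same parameter and still yields chunks incrementally
for chunk in llm.stream(
    "List three follow-up actions as a JSON array inside a JSON object.",
    structured_response_format=json_mode,
):
    print(chunk.content, end="")
```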
+ single_model_supported: bool = False OPENAI_PROVIDER_NAME = "openai" OPEN_AI_MODEL_NAMES = [ + "o1-mini", + "o1-preview", "gpt-4", "gpt-4o", "gpt-4o-mini", @@ -56,6 +61,7 @@ class WellKnownLLMProviderDescriptor(BaseModel): IGNORABLE_ANTHROPIC_MODELS = [ "claude-2", "claude-instant-1", + "anthropic/claude-3-5-sonnet-20241022", ] ANTHROPIC_PROVIDER_NAME = "anthropic" ANTHROPIC_MODEL_NAMES = [ @@ -95,8 +101,8 @@ def fetch_available_well_known_llms() -> list[WellKnownLLMProviderDescriptor]: api_version_required=False, custom_config_keys=[], llm_names=fetch_models_for_provider(ANTHROPIC_PROVIDER_NAME), - default_model="claude-3-5-sonnet-20240620", - default_fast_model="claude-3-5-sonnet-20240620", + default_model="claude-3-5-sonnet-20241022", + default_fast_model="claude-3-5-sonnet-20241022", ), WellKnownLLMProviderDescriptor( name=AZURE_PROVIDER_NAME, @@ -106,6 +112,8 @@ def fetch_available_well_known_llms() -> list[WellKnownLLMProviderDescriptor]: api_version_required=True, custom_config_keys=[], llm_names=fetch_models_for_provider(AZURE_PROVIDER_NAME), + deployment_name_required=True, + single_model_supported=True, ), WellKnownLLMProviderDescriptor( name=BEDROCK_PROVIDER_NAME, @@ -128,8 +136,8 @@ def fetch_available_well_known_llms() -> list[WellKnownLLMProviderDescriptor]: ), ], llm_names=fetch_models_for_provider(BEDROCK_PROVIDER_NAME), - default_model="anthropic.claude-3-5-sonnet-20240620-v1:0", - default_fast_model="anthropic.claude-3-5-sonnet-20240620-v1:0", + default_model="anthropic.claude-3-5-sonnet-20241022-v2:0", + default_fast_model="anthropic.claude-3-5-sonnet-20241022-v2:0", ), ] diff --git a/backend/danswer/llm/utils.py b/backend/danswer/llm/utils.py index c367f0aa522..343f93147d8 100644 --- a/backend/danswer/llm/utils.py +++ b/backend/danswer/llm/utils.py @@ -1,3 +1,4 @@ +import io import json from collections.abc import Callable from collections.abc import Iterator @@ -7,6 +8,7 @@ from typing import Union import litellm # type: ignore +import pandas as pd import tiktoken from langchain.prompts.base import StringPromptValue from langchain.prompts.chat import ChatPromptValue @@ -47,7 +49,9 @@ logger = setup_logger() -def litellm_exception_to_error_msg(e: Exception, llm: LLM) -> str: +def litellm_exception_to_error_msg( + e: Exception, llm: LLM, fallback_to_error_msg: bool = False +) -> str: error_msg = str(e) if isinstance(e, BadRequestError): @@ -94,7 +98,7 @@ def litellm_exception_to_error_msg(e: Exception, llm: LLM) -> str: error_msg = "Request timed out: The operation took too long to complete. Please try again." elif isinstance(e, APIError): error_msg = f"API error: An error occurred while communicating with the API. Details: {str(e)}" - else: + elif not fallback_to_error_msg: error_msg = "An unexpected error occurred while processing your request. Please try again later." return error_msg @@ -105,11 +109,10 @@ def translate_danswer_msg_to_langchain( files: list[InMemoryChatFile] = [] # If the message is a `ChatMessage`, it doesn't have the downloaded files - # attached. Just ignore them for now. Also, OpenAI doesn't allow files to - # be attached to AI messages, so we must remove them - if not isinstance(msg, ChatMessage) and msg.message_type != MessageType.ASSISTANT: + # attached. Just ignore them for now. 
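Reviewer note: the new `fallback_to_error_msg` flag on `litellm_exception_to_error_msg` only changes behavior for exception types the helper does not explicitly recognize. A small illustrative sketch (assumes `llm` is any configured `LLM` instance):

```python
# Illustration only: how fallback_to_error_msg affects unrecognized exceptions.
from danswer.llm.utils import litellm_exception_to_error_msg

try:
    llm.invoke("hello")
except Exception as e:
    # default: unrecognized errors collapse into the generic
    # "An unexpected error occurred..." message
    generic_msg = litellm_exception_to_error_msg(e, llm)
    # with the flag set, unrecognized errors keep their original str(e),
    # so the raw provider error can surface to the caller
    raw_msg = litellm_exception_to_error_msg(e, llm, fallback_to_error_msg=True)
```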
+ if not isinstance(msg, ChatMessage): files = msg.files - content = build_content_with_imgs(msg.message, files) + content = build_content_with_imgs(msg.message, files, message_type=msg.message_type) if msg.message_type == MessageType.SYSTEM: raise ValueError("System messages are not currently part of history") @@ -133,6 +136,20 @@ def translate_history_to_basemessages( return history_basemessages, history_token_counts +# Processes CSV files to show the first 5 rows and max_columns (default 40) columns +def _process_csv_file(file: InMemoryChatFile, max_columns: int = 40) -> str: + df = pd.read_csv(io.StringIO(file.content.decode("utf-8"))) + + csv_preview = df.head().to_string(max_cols=max_columns) + + file_name_section = ( + f"CSV FILE NAME: {file.filename}\n" + if file.filename + else "CSV FILE (NO NAME PROVIDED):\n" + ) + return f"{file_name_section}{CODE_BLOCK_PAT.format(csv_preview)}\n\n\n" + + def _build_content( message: str, files: list[InMemoryChatFile] | None = None, @@ -143,16 +160,26 @@ def _build_content( if files else None ) - if not text_files: + + csv_files = ( + [file for file in files if file.file_type == ChatFileType.CSV] + if files + else None + ) + + if not text_files and not csv_files: return message final_message_with_files = "FILES:\n\n" - for file in text_files: + for file in text_files or []: file_content = file.content.decode("utf-8") file_name_section = f"DOCUMENT: {file.filename}\n" if file.filename else "" final_message_with_files += ( f"{file_name_section}{CODE_BLOCK_PAT.format(file_content.strip())}\n\n\n" ) + for file in csv_files or []: + final_message_with_files += _process_csv_file(file) + final_message_with_files += message return final_message_with_files @@ -162,10 +189,19 @@ def build_content_with_imgs( message: str, files: list[InMemoryChatFile] | None = None, img_urls: list[str] | None = None, + message_type: MessageType = MessageType.USER, ) -> str | list[str | dict[str, Any]]: # matching Langchain's BaseMessage content type files = files or [] - img_files = [file for file in files if file.file_type == ChatFileType.IMAGE] + + # Only include image files for user messages + img_files = ( + [file for file in files if file.file_type == ChatFileType.IMAGE] + if message_type == MessageType.USER + else [] + ) + img_urls = img_urls or [] + message_main_content = _build_content(message, files) if not img_files and not img_urls: @@ -201,6 +237,28 @@ def build_content_with_imgs( ) +def message_to_prompt_and_imgs(message: BaseMessage) -> tuple[str, list[str]]: + if isinstance(message.content, str): + return message.content, [] + + imgs = [] + texts = [] + for part in message.content: + if isinstance(part, dict): + if part.get("type") == "image_url": + img_url = part.get("image_url", {}).get("url") + if img_url: + imgs.append(img_url) + elif part.get("type") == "text": + text = part.get("text") + if text: + texts.append(text) + else: + texts.append(part) + + return "".join(texts), imgs + + def dict_based_prompt_to_langchain_prompt( messages: list[dict[str, str]] ) -> list[BaseMessage]: @@ -340,12 +398,26 @@ def get_llm_max_tokens( try: model_obj = model_map.get(f"{model_provider}/{model_name}") - if not model_obj: - model_obj = model_map[model_name] - logger.debug(f"Using model object for {model_name}") - else: + if model_obj: logger.debug(f"Using model object for {model_provider}/{model_name}") + if not model_obj: + model_obj = model_map.get(model_name) + if model_obj: + logger.debug(f"Using model object for {model_name}") + + if not model_obj: + 
model_name_split = model_name.split("/") + if len(model_name_split) > 1: + model_obj = model_map.get(model_name_split[1]) + if model_obj: + logger.debug(f"Using model object for {model_name_split[1]}") + + if not model_obj: + raise RuntimeError( + f"No litellm entry found for {model_provider}/{model_name}" + ) + if "max_input_tokens" in model_obj: max_tokens = model_obj["max_input_tokens"] logger.info( diff --git a/backend/danswer/main.py b/backend/danswer/main.py index 3306dd12d83..da8b22787ca 100644 --- a/backend/danswer/main.py +++ b/backend/danswer/main.py @@ -1,78 +1,38 @@ -import time +import sys import traceback from collections.abc import AsyncGenerator from contextlib import asynccontextmanager from typing import Any from typing import cast +import sentry_sdk import uvicorn -from fastapi import APIRouter -from fastapi import FastAPI -from fastapi import HTTPException -from fastapi import Request -from fastapi import status -from fastapi.exceptions import RequestValidationError -from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from httpx_oauth.clients.google import GoogleOAuth2 -from sqlalchemy.orm import Session - from danswer import __version__ from danswer.auth.schemas import UserCreate from danswer.auth.schemas import UserRead from danswer.auth.schemas import UserUpdate from danswer.auth.users import auth_backend +from danswer.auth.users import BasicAuthenticationError +from danswer.auth.users import create_danswer_oauth_router from danswer.auth.users import fastapi_users -from danswer.chat.load_yamls import load_chat_yamls from danswer.configs.app_configs import APP_API_PREFIX from danswer.configs.app_configs import APP_HOST from danswer.configs.app_configs import APP_PORT from danswer.configs.app_configs import AUTH_TYPE from danswer.configs.app_configs import DISABLE_GENERATIVE_AI -from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP from danswer.configs.app_configs import LOG_ENDPOINT_LATENCY from danswer.configs.app_configs import OAUTH_CLIENT_ID from danswer.configs.app_configs import OAUTH_CLIENT_SECRET +from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_OVERFLOW +from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_SIZE +from danswer.configs.app_configs import SYSTEM_RECURSION_LIMIT from danswer.configs.app_configs import USER_AUTH_SECRET from danswer.configs.app_configs import WEB_DOMAIN from danswer.configs.constants import AuthType -from danswer.configs.constants import KV_REINDEX_KEY -from danswer.configs.constants import KV_SEARCH_SETTINGS from danswer.configs.constants import POSTGRES_WEB_APP_NAME -from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION -from danswer.configs.model_configs import GEN_AI_API_KEY -from danswer.configs.model_configs import GEN_AI_MODEL_VERSION -from danswer.db.connector import check_connectors_exist -from danswer.db.connector import create_initial_default_connector -from danswer.db.connector_credential_pair import associate_default_cc_pair -from danswer.db.connector_credential_pair import get_connector_credential_pairs -from danswer.db.connector_credential_pair import resync_cc_pair -from danswer.db.credentials import create_initial_public_credential -from danswer.db.document import check_docs_exist -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.engine import init_sqlalchemy_engine +from danswer.db.engine import SqlEngine from danswer.db.engine import warm_up_connections -from danswer.db.index_attempt import 
cancel_indexing_attempts_past_model -from danswer.db.index_attempt import expire_index_attempts -from danswer.db.llm import fetch_default_provider -from danswer.db.llm import update_default_provider -from danswer.db.llm import upsert_llm_provider -from danswer.db.persona import delete_old_default_personas -from danswer.db.search_settings import get_current_search_settings -from danswer.db.search_settings import get_secondary_search_settings -from danswer.db.search_settings import update_current_search_settings -from danswer.db.search_settings import update_secondary_search_settings -from danswer.db.swap_index import check_index_swap -from danswer.document_index.factory import get_default_document_index -from danswer.document_index.interfaces import DocumentIndex -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError -from danswer.indexing.models import IndexingSetting -from danswer.natural_language_processing.search_nlp_models import EmbeddingModel -from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder -from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder -from danswer.search.models import SavedSearchSettings -from danswer.search.retrieval.search_runner import download_nltk_data +from danswer.server.api_key.api import router as api_key_router from danswer.server.auth_check import check_router_auth from danswer.server.danswer_api.ingestion import router as danswer_api_router from danswer.server.documents.cc_pair import router as cc_pair_router @@ -86,23 +46,30 @@ admin_router as admin_input_prompt_router, ) from danswer.server.features.input_prompt.api import basic_router as input_prompt_router +from danswer.server.features.notifications.api import router as notification_router from danswer.server.features.persona.api import admin_router as admin_persona_router from danswer.server.features.persona.api import basic_router as persona_router from danswer.server.features.prompt.api import basic_router as prompt_router from danswer.server.features.tool.api import admin_router as admin_tool_router from danswer.server.features.tool.api import router as tool_router from danswer.server.gpts.api import router as gpts_router +from danswer.server.long_term_logs.long_term_logs_api import ( + router as long_term_logs_router, +) from danswer.server.manage.administrative import router as admin_router from danswer.server.manage.embedding.api import admin_router as embedding_admin_router from danswer.server.manage.embedding.api import basic_router as embedding_router from danswer.server.manage.get_state import router as state_router from danswer.server.manage.llm.api import admin_router as llm_admin_router from danswer.server.manage.llm.api import basic_router as llm_router -from danswer.server.manage.llm.models import LLMProviderUpsertRequest from danswer.server.manage.search_settings import router as search_settings_router from danswer.server.manage.slack_bot import router as slack_bot_management_router +from danswer.server.manage.standard_answer import router as standard_answer_router from danswer.server.manage.users import router as user_router from danswer.server.middleware.latency_logging import add_latency_logging_middleware +from danswer.server.openai_assistants_api.full_openai_assistants_api import ( + get_full_openai_assistants_api_router, +) from danswer.server.query_and_chat.chat_backend import router as chat_router from 
danswer.server.query_and_chat.query_backend import ( admin_router as admin_query_router, @@ -110,12 +77,11 @@ from danswer.server.query_and_chat.query_backend import basic_router as query_router from danswer.server.settings.api import admin_router as settings_admin_router from danswer.server.settings.api import basic_router as settings_router -from danswer.server.settings.store import load_settings -from danswer.server.settings.store import store_settings from danswer.server.token_rate_limits.api import ( router as token_rate_limit_settings_router, ) -from danswer.server.manage.standard_answer import router as standard_answer_router +from danswer.setup import setup_danswer +from danswer.setup import setup_multitenant_danswer from danswer.tools.built_in_tools import auto_add_search_tool_to_personas from danswer.tools.built_in_tools import load_builtin_tools from danswer.tools.built_in_tools import refresh_built_in_tools_cache @@ -127,9 +93,21 @@ from danswer.utils.variable_functionality import fetch_versioned_implementation from danswer.utils.variable_functionality import global_version from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable +from fastapi import APIRouter +from fastapi import FastAPI +from fastapi import HTTPException +from fastapi import Request +from fastapi import status +from fastapi.exceptions import RequestValidationError +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +from httpx_oauth.clients.google import GoogleOAuth2 +from sentry_sdk.integrations.fastapi import FastApiIntegration +from sentry_sdk.integrations.starlette import StarletteIntegration from shared_configs.configs import CORS_ALLOWED_ORIGIN -from shared_configs.configs import MODEL_SERVER_HOST -from shared_configs.configs import MODEL_SERVER_PORT +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import SENTRY_DSN +from sqlalchemy.orm import Session logger = setup_logger() @@ -182,184 +160,24 @@ def include_router_with_global_prefix_prepended( application.include_router(router, **final_kwargs) -def setup_postgres(db_session: Session) -> None: - logger.notice("Verifying default connector/credential exist.") - create_initial_public_credential(db_session) - create_initial_default_connector(db_session) - associate_default_cc_pair(db_session) - - logger.notice("Loading default Prompts and Personas") - delete_old_default_personas(db_session) - load_chat_yamls() - - logger.notice("Loading built-in tools") - load_builtin_tools(db_session) - refresh_built_in_tools_cache(db_session) - auto_add_search_tool_to_personas(db_session) - - if GEN_AI_API_KEY and fetch_default_provider(db_session) is None: - # Only for dev flows - logger.notice("Setting up default OpenAI LLM for dev.") - llm_model = GEN_AI_MODEL_VERSION or "gpt-4o-mini" - fast_model = FAST_GEN_AI_MODEL_VERSION or "gpt-4o-mini" - model_req = LLMProviderUpsertRequest( - name="DevEnvPresetOpenAI", - provider="openai", - api_key=GEN_AI_API_KEY, - api_base=None, - api_version=None, - custom_config=None, - default_model_name=llm_model, - fast_default_model_name=fast_model, - is_public=True, - groups=[], - display_model_names=[llm_model, fast_model], - model_names=[llm_model, fast_model], - ) - new_llm_provider = upsert_llm_provider( - llm_provider=model_req, db_session=db_session - ) - update_default_provider(provider_id=new_llm_provider.id, db_session=db_session) - - -def update_default_multipass_indexing(db_session: Session) -> None: - docs_exist = 
check_docs_exist(db_session) - connectors_exist = check_connectors_exist(db_session) - logger.debug(f"Docs exist: {docs_exist}, Connectors exist: {connectors_exist}") - - if not docs_exist and not connectors_exist: - logger.info( - "No existing docs or connectors found. Checking GPU availability for multipass indexing." - ) - gpu_available = gpu_status_request() - logger.info(f"GPU available: {gpu_available}") - - current_settings = get_current_search_settings(db_session) - - logger.notice(f"Updating multipass indexing setting to: {gpu_available}") - updated_settings = SavedSearchSettings.from_db_model(current_settings) - # Enable multipass indexing if GPU is available or if using a cloud provider - updated_settings.multipass_indexing = ( - gpu_available or current_settings.cloud_provider is not None - ) - update_current_search_settings(db_session, updated_settings) - - # Update settings with GPU availability - settings = load_settings() - settings.gpu_enabled = gpu_available - store_settings(settings) - logger.notice(f"Updated settings with GPU availability: {gpu_available}") - - else: - logger.debug( - "Existing docs or connectors found. Skipping multipass indexing update." - ) - - -def translate_saved_search_settings(db_session: Session) -> None: - kv_store = get_dynamic_config_store() - - try: - search_settings_dict = kv_store.load(KV_SEARCH_SETTINGS) - if isinstance(search_settings_dict, dict): - # Update current search settings - current_settings = get_current_search_settings(db_session) - - # Update non-preserved fields - if current_settings: - current_settings_dict = SavedSearchSettings.from_db_model( - current_settings - ).dict() - - new_current_settings = SavedSearchSettings( - **{**current_settings_dict, **search_settings_dict} - ) - update_current_search_settings(db_session, new_current_settings) - - # Update secondary search settings - secondary_settings = get_secondary_search_settings(db_session) - if secondary_settings: - secondary_settings_dict = SavedSearchSettings.from_db_model( - secondary_settings - ).dict() - - new_secondary_settings = SavedSearchSettings( - **{**secondary_settings_dict, **search_settings_dict} - ) - update_secondary_search_settings( - db_session, - new_secondary_settings, - ) - # Delete the KV store entry after successful update - kv_store.delete(KV_SEARCH_SETTINGS) - logger.notice("Search settings updated and KV store entry deleted.") - else: - logger.notice("KV store search settings is empty.") - except ConfigNotFoundError: - logger.notice("No search config found in KV store.") - - -def mark_reindex_flag(db_session: Session) -> None: - kv_store = get_dynamic_config_store() - try: - value = kv_store.load(KV_REINDEX_KEY) - logger.debug(f"Re-indexing flag has value {value}") - return - except ConfigNotFoundError: - # Only need to update the flag if it hasn't been set - pass - - # If their first deployment is after the changes, it will - # enable this when the other changes go in, need to avoid - # this being set to False, then the user indexes things on the old version - docs_exist = check_docs_exist(db_session) - connectors_exist = check_connectors_exist(db_session) - if docs_exist or connectors_exist: - kv_store.store(KV_REINDEX_KEY, True) - else: - kv_store.store(KV_REINDEX_KEY, False) - - -def setup_vespa( - document_index: DocumentIndex, - index_setting: IndexingSetting, - secondary_index_setting: IndexingSetting | None, -) -> bool: - # Vespa startup is a bit slow, so give it a few seconds - WAIT_SECONDS = 5 - VESPA_ATTEMPTS = 5 - for x in 
range(VESPA_ATTEMPTS): - try: - logger.notice(f"Setting up Vespa (attempt {x+1}/{VESPA_ATTEMPTS})...") - document_index.ensure_indices_exist( - index_embedding_dim=index_setting.model_dim, - secondary_index_embedding_dim=secondary_index_setting.model_dim - if secondary_index_setting - else None, - ) - - logger.notice("Vespa setup complete.") - return True - except Exception: - logger.notice( - f"Vespa setup did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds." - ) - time.sleep(WAIT_SECONDS) - - logger.error( - f"Vespa setup did not succeed. Attempt limit reached. ({VESPA_ATTEMPTS})" - ) - return False - - @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncGenerator: - init_sqlalchemy_engine(POSTGRES_WEB_APP_NAME) - engine = get_sqlalchemy_engine() + # Set recursion limit + if SYSTEM_RECURSION_LIMIT is not None: + sys.setrecursionlimit(SYSTEM_RECURSION_LIMIT) + logger.notice(f"System recursion limit set to {SYSTEM_RECURSION_LIMIT}") + + SqlEngine.set_app_name(POSTGRES_WEB_APP_NAME) + SqlEngine.init_engine( + pool_size=POSTGRES_API_SERVER_POOL_SIZE, + max_overflow=POSTGRES_API_SERVER_POOL_OVERFLOW, + ) + engine = SqlEngine.get_engine() verify_auth = fetch_versioned_implementation( "danswer.auth.users", "verify_auth_setting" ) + # Will throw exception if an issue is found verify_auth() @@ -372,93 +190,15 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: # fill up Postgres connection pools await warm_up_connections() - # We cache this at the beginning so there is no delay in the first telemetry - get_or_generate_uuid() - - with Session(engine) as db_session: - check_index_swap(db_session=db_session) - search_settings = get_current_search_settings(db_session) - secondary_search_settings = get_secondary_search_settings(db_session) - - # Break bad state for thrashing indexes - if secondary_search_settings and DISABLE_INDEX_UPDATE_ON_SWAP: - expire_index_attempts( - search_settings_id=search_settings.id, db_session=db_session - ) - - for cc_pair in get_connector_credential_pairs(db_session): - resync_cc_pair(cc_pair, db_session=db_session) - - # Expire all old embedding models indexing attempts, technically redundant - cancel_indexing_attempts_past_model(db_session) - - logger.notice(f'Using Embedding model: "{search_settings.model_name}"') - if search_settings.query_prefix or search_settings.passage_prefix: - logger.notice(f'Query embedding prefix: "{search_settings.query_prefix}"') - logger.notice( - f'Passage embedding prefix: "{search_settings.passage_prefix}"' - ) - - if search_settings: - if not search_settings.disable_rerank_for_streaming: - logger.notice("Reranking is enabled.") - - if search_settings.multilingual_expansion: - logger.notice( - f"Multilingual query expansion is enabled with {search_settings.multilingual_expansion}." - ) - if ( - search_settings.rerank_model_name - and not search_settings.provider_type - and not search_settings.rerank_provider_type - ): - warm_up_cross_encoder(search_settings.rerank_model_name) - - logger.notice("Verifying query preprocessing (NLTK) data is downloaded") - download_nltk_data() - - # setup Postgres with default credential, llm providers, etc. 
- setup_postgres(db_session) - - translate_saved_search_settings(db_session) - - # Does the user need to trigger a reindexing to bring the document index - # into a good state, marked in the kv store - mark_reindex_flag(db_session) - - # ensure Vespa is setup correctly - logger.notice("Verifying Document Index(s) is/are available.") - document_index = get_default_document_index( - primary_index_name=search_settings.index_name, - secondary_index_name=secondary_search_settings.index_name - if secondary_search_settings - else None, - ) + if not MULTI_TENANT: + # We cache this at the beginning so there is no delay in the first telemetry + get_or_generate_uuid() - success = setup_vespa( - document_index, - IndexingSetting.from_db_model(search_settings), - IndexingSetting.from_db_model(secondary_search_settings) - if secondary_search_settings - else None, - ) - if not success: - raise RuntimeError( - "Could not connect to Vespa within the specified timeout." - ) - - logger.notice(f"Model Server: http://{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}") - if search_settings.provider_type is None: - warm_up_bi_encoder( - embedding_model=EmbeddingModel.from_db_model( - search_settings=search_settings, - server_host=MODEL_SERVER_HOST, - server_port=MODEL_SERVER_PORT, - ), - ) - - # update multipass indexing setting based on GPU availability - update_default_multipass_indexing(db_session) + # If we are multi-tenant, we need to only set up initial public tables + with Session(engine) as db_session: + setup_danswer(db_session, None) + else: + setup_multitenant_danswer() optional_telemetry(record_type=RecordType.VERSION, data={"version": __version__}) yield @@ -466,7 +206,12 @@ async def lifespan(app: FastAPI) -> AsyncGenerator: def log_http_error(_: Request, exc: Exception) -> JSONResponse: status_code = getattr(exc, "status_code", 500) - if status_code >= 400: + + if isinstance(exc, BasicAuthenticationError): + # For BasicAuthenticationError, just log a brief message without stack trace (almost always spam) + logger.error(f"Authentication failed: {str(exc)}") + + elif status_code >= 400: error_msg = f"{str(exc)}\n" error_msg += "".join(traceback.format_tb(exc.__traceback__)) logger.error(error_msg) @@ -482,8 +227,16 @@ def get_application() -> FastAPI: application = FastAPI( title="Danswer Backend", version=__version__, lifespan=lifespan ) + if SENTRY_DSN: + sentry_sdk.init( + dsn=SENTRY_DSN, + integrations=[StarletteIntegration(), FastApiIntegration()], + traces_sample_rate=0.1, + ) + logger.info("Sentry initialized") + else: + logger.debug("Sentry DSN not provided, skipping Sentry initialization") - # Add the custom exception handler application.add_exception_handler(status.HTTP_400_BAD_REQUEST, log_http_error) application.add_exception_handler(status.HTTP_401_UNAUTHORIZED, log_http_error) application.add_exception_handler(status.HTTP_403_FORBIDDEN, log_http_error) @@ -512,6 +265,7 @@ def get_application() -> FastAPI: include_router_with_global_prefix_prepended(application, admin_persona_router) include_router_with_global_prefix_prepended(application, input_prompt_router) include_router_with_global_prefix_prepended(application, admin_input_prompt_router) + include_router_with_global_prefix_prepended(application, notification_router) include_router_with_global_prefix_prepended(application, prompt_router) include_router_with_global_prefix_prepended(application, tool_router) include_router_with_global_prefix_prepended(application, admin_tool_router) @@ -528,24 +282,31 @@ def get_application() -> FastAPI: 
application, token_rate_limit_settings_router ) include_router_with_global_prefix_prepended(application, indexing_router) + include_router_with_global_prefix_prepended( + application, get_full_openai_assistants_api_router() + ) + include_router_with_global_prefix_prepended(application, long_term_logs_router) + include_router_with_global_prefix_prepended(application, api_key_router) if AUTH_TYPE == AuthType.DISABLED: # Server logs this during auth setup verification step pass - elif AUTH_TYPE == AuthType.BASIC: + if AUTH_TYPE == AuthType.BASIC or AUTH_TYPE == AuthType.CLOUD: include_router_with_global_prefix_prepended( application, fastapi_users.get_auth_router(auth_backend), prefix="/auth", tags=["auth"], ) + include_router_with_global_prefix_prepended( application, fastapi_users.get_register_router(UserRead, UserCreate), prefix="/auth", tags=["auth"], ) + include_router_with_global_prefix_prepended( application, fastapi_users.get_reset_password_router(), @@ -565,11 +326,11 @@ def get_application() -> FastAPI: tags=["users"], ) - elif AUTH_TYPE == AuthType.GOOGLE_OAUTH: + if AUTH_TYPE == AuthType.GOOGLE_OAUTH: oauth_client = GoogleOAuth2(OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET) include_router_with_global_prefix_prepended( application, - fastapi_users.get_oauth_router( + create_danswer_oauth_router( oauth_client, auth_backend, USER_AUTH_SECRET, @@ -581,6 +342,7 @@ def get_application() -> FastAPI: prefix="/auth/oauth", tags=["auth"], ) + # Need basic auth router for `logout` endpoint include_router_with_global_prefix_prepended( application, @@ -625,7 +387,7 @@ def get_application() -> FastAPI: f"Starting Danswer Backend version {__version__} on http://{APP_HOST}:{str(APP_PORT)}/" ) - if global_version.get_is_ee_version(): + if global_version.is_ee_version(): logger.notice("Running Enterprise Edition") uvicorn.run(app, host=APP_HOST, port=APP_PORT) diff --git a/backend/danswer/natural_language_processing/exceptions.py b/backend/danswer/natural_language_processing/exceptions.py new file mode 100644 index 00000000000..5ca112f64ea --- /dev/null +++ b/backend/danswer/natural_language_processing/exceptions.py @@ -0,0 +1,4 @@ +class ModelServerRateLimitError(Exception): + """ + Exception raised for rate limiting errors from the model server. 
+ """ diff --git a/backend/danswer/natural_language_processing/search_nlp_models.py b/backend/danswer/natural_language_processing/search_nlp_models.py index 6dcec724345..9fed0d489e7 100644 --- a/backend/danswer/natural_language_processing/search_nlp_models.py +++ b/backend/danswer/natural_language_processing/search_nlp_models.py @@ -1,4 +1,3 @@ -import re import threading import time from collections.abc import Callable @@ -7,6 +6,9 @@ import requests from httpx import HTTPError +from requests import JSONDecodeError +from requests import RequestException +from requests import Response from retry import retry from danswer.configs.app_configs import LARGE_CHUNK_RATIO @@ -16,6 +18,10 @@ ) from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE from danswer.db.models import SearchSettings +from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface +from danswer.natural_language_processing.exceptions import ( + ModelServerRateLimitError, +) from danswer.natural_language_processing.utils import get_tokenizer from danswer.natural_language_processing.utils import tokenizer_trim_content from danswer.utils.logger import setup_logger @@ -49,25 +55,6 @@ def clean_model_name(model_str: str) -> str: return model_str.replace("/", "_").replace("-", "_").replace(".", "_") -_WHITELIST = set( - " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\n\t" -) -_INITIAL_FILTER = re.compile( - "[" - "\U00000080-\U0000FFFF" # All Unicode characters beyond ASCII - "\U00010000-\U0010FFFF" # All Unicode characters in supplementary planes - "]+", - flags=re.UNICODE, -) - - -def clean_openai_text(text: str) -> str: - # First, remove all weird characters - cleaned = _INITIAL_FILTER.sub("", text) - # Then, keep only whitelisted characters - return "".join(char for char in cleaned if char in _WHITELIST) - - def build_model_server_url( model_server_host: str, model_server_port: int, @@ -95,6 +82,9 @@ def __init__( api_url: str | None, provider_type: EmbeddingProvider | None, retrim_content: bool = False, + callback: IndexingHeartbeatInterface | None = None, + api_version: str | None = None, + deployment_name: str | None = None, ) -> None: self.api_key = api_key self.provider_type = provider_type @@ -104,36 +94,54 @@ def __init__( self.model_name = model_name self.retrim_content = retrim_content self.api_url = api_url + self.api_version = api_version + self.deployment_name = deployment_name self.tokenizer = get_tokenizer( model_name=model_name, provider_type=provider_type ) + self.callback = callback model_server_url = build_model_server_url(server_host, server_port) self.embed_server_endpoint = f"{model_server_url}/encoder/bi-encoder-embed" def _make_model_server_request(self, embed_request: EmbedRequest) -> EmbedResponse: - def _make_request() -> EmbedResponse: + def _make_request() -> Response: response = requests.post( self.embed_server_endpoint, json=embed_request.model_dump() ) - try: - response.raise_for_status() - except requests.HTTPError as e: - try: - error_detail = response.json().get("detail", str(e)) - except Exception: - error_detail = response.text - raise HTTPError(f"HTTP error occurred: {error_detail}") from e - except requests.RequestException as e: - raise HTTPError(f"Request failed: {str(e)}") from e + # signify that this is a rate limit error + if response.status_code == 429: + raise ModelServerRateLimitError(response.text) - return EmbedResponse(**response.json()) + response.raise_for_status() + return response - # only perform 
retries for the non-realtime embedding of passages (e.g. for indexing) + final_make_request_func = _make_request + + # if the text type is a passage, add some default + # retries + handling for rate limiting if embed_request.text_type == EmbedTextType.PASSAGE: - return retry(tries=3, delay=5)(_make_request)() - else: - return _make_request() + final_make_request_func = retry( + tries=3, + delay=5, + exceptions=(RequestException, ValueError, JSONDecodeError), + )(final_make_request_func) + # use 10 second delay as per Azure suggestion + final_make_request_func = retry( + tries=10, delay=10, exceptions=ModelServerRateLimitError + )(final_make_request_func) + + try: + response = final_make_request_func() + return EmbedResponse(**response.json()) + except requests.HTTPError as e: + try: + error_detail = response.json().get("detail", str(e)) + except Exception: + error_detail = response.text + raise HTTPError(f"HTTP error occurred: {error_detail}") from e + except requests.RequestException as e: + raise HTTPError(f"Request failed: {str(e)}") from e def _batch_encode_texts( self, @@ -150,10 +158,16 @@ def _batch_encode_texts( embeddings: list[Embedding] = [] for idx, text_batch in enumerate(text_batches, start=1): + if self.callback: + if self.callback.should_stop(): + raise RuntimeError("_batch_encode_texts detected stop signal") + logger.debug(f"Encoding batch {idx} of {len(text_batches)}") embed_request = EmbedRequest( model_name=self.model_name, texts=text_batch, + api_version=self.api_version, + deployment_name=self.deployment_name, max_context_length=max_seq_length, normalize_embeddings=self.normalize, api_key=self.api_key, @@ -166,6 +180,9 @@ def _batch_encode_texts( response = self._make_model_server_request(embed_request) embeddings.extend(response.embeddings) + + if self.callback: + self.callback.progress("_batch_encode_texts", 1) return embeddings def encode( @@ -196,11 +213,6 @@ def encode( for text in texts ] - if self.provider_type == EmbeddingProvider.OPENAI: - # If the provider is openai, we need to clean the text - # as a temporary workaround for the openai API - texts = [clean_openai_text(text) for text in texts] - batch_size = ( api_embedding_batch_size if self.provider_type @@ -233,6 +245,8 @@ def from_db_model( provider_type=search_settings.provider_type, api_url=search_settings.api_url, retrim_content=retrim_content, + api_version=search_settings.api_version, + deployment_name=search_settings.deployment_name, ) diff --git a/backend/danswer/natural_language_processing/utils.py b/backend/danswer/natural_language_processing/utils.py index d2b9a7d7f1e..35f5629e06f 100644 --- a/backend/danswer/natural_language_processing/utils.py +++ b/backend/danswer/natural_language_processing/utils.py @@ -7,7 +7,7 @@ from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE from danswer.configs.model_configs import DOCUMENT_ENCODER_MODEL -from danswer.search.models import InferenceChunk +from danswer.context.search.models import InferenceChunk from danswer.utils.logger import setup_logger from shared_configs.enums import EmbeddingProvider @@ -35,23 +35,31 @@ def decode(self, tokens: list[int]) -> str: class TiktokenTokenizer(BaseTokenizer): _instances: dict[str, "TiktokenTokenizer"] = {} - def __new__(cls, encoding_name: str = "cl100k_base") -> "TiktokenTokenizer": - if encoding_name not in cls._instances: - cls._instances[encoding_name] = super(TiktokenTokenizer, cls).__new__(cls) - return cls._instances[encoding_name] + def __new__(cls, model_name: str) -> "TiktokenTokenizer": + 
if model_name not in cls._instances: + cls._instances[model_name] = super(TiktokenTokenizer, cls).__new__(cls) + return cls._instances[model_name] - def __init__(self, encoding_name: str = "cl100k_base"): + def __init__(self, model_name: str): if not hasattr(self, "encoder"): import tiktoken - self.encoder = tiktoken.get_encoding(encoding_name) + self.encoder = tiktoken.encoding_for_model(model_name) def encode(self, string: str) -> list[int]: - # this returns no special tokens + # this ignores special tokens that the model is trained on, see encode_ordinary for details return self.encoder.encode_ordinary(string) def tokenize(self, string: str) -> list[str]: - return [self.encoder.decode([token]) for token in self.encode(string)] + encoded = self.encode(string) + decoded = [self.encoder.decode([token]) for token in encoded] + + if len(decoded) != len(encoded): + logger.warning( + f"OpenAI tokenized length {len(decoded)} does not match encoded length {len(encoded)} for string: {string}" + ) + + return decoded def decode(self, tokens: list[int]) -> str: return self.encoder.decode(tokens) @@ -74,42 +82,60 @@ def decode(self, tokens: list[int]) -> str: return self.encoder.decode(tokens) -_TOKENIZER_CACHE: dict[str, BaseTokenizer] = {} +_TOKENIZER_CACHE: dict[tuple[EmbeddingProvider | None, str | None], BaseTokenizer] = {} -def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer: +def _check_tokenizer_cache( + model_provider: EmbeddingProvider | None, model_name: str | None +) -> BaseTokenizer: global _TOKENIZER_CACHE + id_tuple = (model_provider, model_name) + + if id_tuple not in _TOKENIZER_CACHE: + tokenizer = None + + if model_name: + tokenizer = _try_initialize_tokenizer(model_name, model_provider) - if tokenizer_name not in _TOKENIZER_CACHE: - if tokenizer_name == "openai": - _TOKENIZER_CACHE[tokenizer_name] = TiktokenTokenizer("cl100k_base") - return _TOKENIZER_CACHE[tokenizer_name] + if not tokenizer: + logger.info( + f"Falling back to default embedding model: {DOCUMENT_ENCODER_MODEL}" + ) + tokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL) + + _TOKENIZER_CACHE[id_tuple] = tokenizer + + return _TOKENIZER_CACHE[id_tuple] + + +def _try_initialize_tokenizer( + model_name: str, model_provider: EmbeddingProvider | None +) -> BaseTokenizer | None: + tokenizer: BaseTokenizer | None = None + + if model_provider is not None: + # Try using TiktokenTokenizer first if model_provider exists try: - logger.debug(f"Initializing HuggingFaceTokenizer for: {tokenizer_name}") - _TOKENIZER_CACHE[tokenizer_name] = HuggingFaceTokenizer(tokenizer_name) - except Exception as primary_error: - logger.error( - f"Error initializing HuggingFaceTokenizer for {tokenizer_name}: {primary_error}" + tokenizer = TiktokenTokenizer(model_name) + logger.info(f"Initialized TiktokenTokenizer for: {model_name}") + return tokenizer + except Exception as tiktoken_error: + logger.debug( + f"TiktokenTokenizer not available for model {model_name}: {tiktoken_error}" ) + else: + # If no provider specified, try HuggingFaceTokenizer + try: + tokenizer = HuggingFaceTokenizer(model_name) + logger.info(f"Initialized HuggingFaceTokenizer for: {model_name}") + return tokenizer + except Exception as hf_error: logger.warning( - f"Falling back to default embedding model: {DOCUMENT_ENCODER_MODEL}" + f"Failed to initialize HuggingFaceTokenizer for {model_name}: {hf_error}" ) - try: - # Cache this tokenizer name to the default so we don't have to try to load it again - # and fail again - _TOKENIZER_CACHE[tokenizer_name] = 
HuggingFaceTokenizer( - DOCUMENT_ENCODER_MODEL - ) - except Exception as fallback_error: - logger.error( - f"Error initializing fallback HuggingFaceTokenizer: {fallback_error}" - ) - raise ValueError( - f"Failed to initialize tokenizer for {tokenizer_name} and fallback model" - ) from fallback_error - - return _TOKENIZER_CACHE[tokenizer_name] + # If both initializations fail, return None + return None _DEFAULT_TOKENIZER: BaseTokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL) @@ -118,12 +144,15 @@ def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer: def get_tokenizer( model_name: str | None, provider_type: EmbeddingProvider | str | None ) -> BaseTokenizer: - # Currently all of the viable models use the same sentencepiece tokenizer - # OpenAI uses a different one but currently it's not supported due to quality issues - # the inconsistent chunking makes using the sentencepiece tokenizer default better for now - # LLM tokenizers are specified by strings - global _DEFAULT_TOKENIZER - return _DEFAULT_TOKENIZER + if isinstance(provider_type, str): + try: + provider_type = EmbeddingProvider(provider_type) + except ValueError: + logger.debug( + f"Invalid provider_type '{provider_type}'. Falling back to default tokenizer." + ) + return _DEFAULT_TOKENIZER + return _check_tokenizer_cache(provider_type, model_name) def tokenizer_trim_content( diff --git a/backend/danswer/one_shot_answer/answer_question.py b/backend/danswer/one_shot_answer/answer_question.py index 76aa9907d19..206ecc2d4c5 100644 --- a/backend/danswer/one_shot_answer/answer_question.py +++ b/backend/danswer/one_shot_answer/answer_question.py @@ -16,6 +16,11 @@ from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT from danswer.configs.chat_configs import QA_TIMEOUT from danswer.configs.constants import MessageType +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.models import RerankMetricsContainer +from danswer.context.search.models import RetrievalMetricsContainer +from danswer.context.search.utils import chunks_or_sections_to_search_docs +from danswer.context.search.utils import dedupe_documents from danswer.db.chat import create_chat_session from danswer.db.chat import create_db_search_doc from danswer.db.chat import create_new_chat_message @@ -40,26 +45,27 @@ from danswer.one_shot_answer.models import OneShotQAResponse from danswer.one_shot_answer.models import QueryRephrase from danswer.one_shot_answer.qa_utils import combine_message_thread -from danswer.search.enums import LLMEvaluationType -from danswer.search.models import RerankMetricsContainer -from danswer.search.models import RetrievalMetricsContainer -from danswer.search.utils import chunks_or_sections_to_search_docs -from danswer.search.utils import dedupe_documents +from danswer.one_shot_answer.qa_utils import slackify_message_thread from danswer.secondary_llm_flows.answer_validation import get_answer_validity from danswer.secondary_llm_flows.query_expansion import thread_based_query_rephrase from danswer.server.query_and_chat.models import ChatMessageDetail from danswer.server.utils import get_json_line from danswer.tools.force import ForceUseTool -from danswer.tools.search.search_tool import SEARCH_DOC_CONTENT_ID -from danswer.tools.search.search_tool import SEARCH_RESPONSE_SUMMARY_ID -from danswer.tools.search.search_tool import SearchResponseSummary -from danswer.tools.search.search_tool import SearchTool -from danswer.tools.search.search_tool import SECTION_RELEVANCE_LIST_ID -from danswer.tools.tool 
import ToolResponse +from danswer.tools.models import ToolResponse +from danswer.tools.tool_implementations.search.search_tool import SEARCH_DOC_CONTENT_ID +from danswer.tools.tool_implementations.search.search_tool import ( + SEARCH_RESPONSE_SUMMARY_ID, +) +from danswer.tools.tool_implementations.search.search_tool import SearchResponseSummary +from danswer.tools.tool_implementations.search.search_tool import SearchTool +from danswer.tools.tool_implementations.search.search_tool import ( + SECTION_RELEVANCE_LIST_ID, +) from danswer.tools.tool_runner import ToolCallKickoff from danswer.utils.logger import setup_logger +from danswer.utils.long_term_log import LongTermLogger from danswer.utils.timing import log_generator_function_time -from ee.danswer.server.query_and_chat.utils import create_temporary_persona +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop from sqlalchemy.orm import Session logger = setup_logger() @@ -121,16 +127,35 @@ def stream_answer_objects( danswerbot_flow=danswerbot_flow, ) + # permanent "log" store, used primarily for debugging + long_term_logger = LongTermLogger( + metadata={"user_id": str(user_id), "chat_session_id": str(chat_session.id)} + ) + temporary_persona: Persona | None = None + if query_req.persona_config is not None: - new_persona = create_temporary_persona( - db_session=db_session, persona_config=query_req.persona_config, user=user - ) - temporary_persona = new_persona + temporary_persona = fetch_ee_implementation_or_noop( + "danswer.server.query_and_chat.utils", "create_temporary_persona", None + )(db_session=db_session, persona_config=query_req.persona_config, user=user) persona = temporary_persona if temporary_persona else chat_session.persona - llm, fast_llm = get_llms_for_persona(persona=persona) + try: + llm, fast_llm = get_llms_for_persona( + persona=persona, long_term_logger=long_term_logger + ) + except ValueError as e: + logger.error( + f"Failed to initialize LLMs for persona '{persona.name}': {str(e)}" + ) + if "No LLM provider" in str(e): + raise ValueError( + "Please configure a Generative AI model to use this feature." + ) from e + raise ValueError( + "Failed to initialize the AI model. Please check your configuration and try again." 
+ ) from e llm_tokenizer = get_tokenizer( model_name=llm.config.model_name, @@ -171,13 +196,22 @@ def stream_answer_objects( ) prompt = persona.prompts[0] + user_message_str = query_msg.message + # For this endpoint, we only save one user message to the chat session + # However, for slackbot, we want to include the history of the entire thread + if danswerbot_flow: + # Right now, we only support bringing over citations and search docs + # from the last message in the thread, not the entire thread + # in the future, we may want to retrieve the entire thread + user_message_str = slackify_message_thread(query_req.messages) + # Create the first User query message new_user_message = create_new_chat_message( chat_session_id=chat_session.id, parent_message=root_message, prompt_id=query_req.prompt_id, - message=query_msg.message, - token_count=len(llm_tokenizer.encode(query_msg.message)), + message=user_message_str, + token_count=len(llm_tokenizer.encode(user_message_str)), message_type=MessageType.USER, db_session=db_session, commit=True, @@ -191,6 +225,12 @@ def stream_answer_objects( max_tokens=max_document_tokens, ) + answer_config = AnswerStyleConfig( + citation_config=CitationConfig() if use_citations else None, + quotes_config=QuotesConfig() if not use_citations else None, + document_pruning_config=document_pruning_config, + ) + search_tool = SearchTool( db_session=db_session, user=user, @@ -205,24 +245,21 @@ def stream_answer_objects( llm=llm, fast_llm=fast_llm, pruning_config=document_pruning_config, + answer_style_config=answer_config, bypass_acl=bypass_acl, chunks_above=query_req.chunks_above, chunks_below=query_req.chunks_below, full_doc=query_req.full_doc, ) - answer_config = AnswerStyleConfig( - citation_config=CitationConfig() if use_citations else None, - quotes_config=QuotesConfig() if not use_citations else None, - document_pruning_config=document_pruning_config, - ) - answer = Answer( question=query_msg.message, user_email=user_email, answer_style_config=answer_config, prompt_config=PromptConfig.from_model(prompt), - llm=get_main_llm_from_tuple(get_llms_for_persona(persona=persona)), + llm=get_main_llm_from_tuple( + get_llms_for_persona(persona=persona, long_term_logger=long_term_logger) + ), single_message_history=history_str, tools=[search_tool] if search_tool else [], force_use_tool=( @@ -238,7 +275,7 @@ def stream_answer_objects( return_contexts=query_req.return_contexts, skip_gen_ai_answer_generation=query_req.skip_gen_ai_answer_generation, ) - # won't be any ImageGenerationDisplay responses since that tool is never passed in + # won't be any FileChatDisplay responses since that tool is never passed in for packet in cast(AnswerObjectIterator, answer.processed_streamed_output): # for one-shot flow, don't currently do anything with these if isinstance(packet, ToolResponse): diff --git a/backend/danswer/one_shot_answer/models.py b/backend/danswer/one_shot_answer/models.py index 735fc12bbb9..630c7b5cab4 100644 --- a/backend/danswer/one_shot_answer/models.py +++ b/backend/danswer/one_shot_answer/models.py @@ -9,12 +9,12 @@ from danswer.chat.models import DanswerQuotes from danswer.chat.models import QADocsResponse from danswer.configs.constants import MessageType -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import RecencyBiasSetting -from danswer.search.enums import SearchType -from danswer.search.models import ChunkContext -from danswer.search.models import RerankingDetails -from danswer.search.models import RetrievalDetails +from 
danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import RecencyBiasSetting +from danswer.context.search.enums import SearchType +from danswer.context.search.models import ChunkContext +from danswer.context.search.models import RerankingDetails +from danswer.context.search.models import RetrievalDetails class QueryRephrase(BaseModel): @@ -36,10 +36,6 @@ class PromptConfig(BaseModel): datetime_aware: bool = True -class DocumentSetConfig(BaseModel): - id: int - - class ToolConfig(BaseModel): id: int diff --git a/backend/danswer/one_shot_answer/qa_utils.py b/backend/danswer/one_shot_answer/qa_utils.py index 6fbad99eff1..8770a3b1413 100644 --- a/backend/danswer/one_shot_answer/qa_utils.py +++ b/backend/danswer/one_shot_answer/qa_utils.py @@ -51,3 +51,31 @@ def combine_message_thread( total_token_count += message_token_count return "\n\n".join(message_strs) + + +def slackify_message(message: ThreadMessage) -> str: + if message.role != MessageType.USER: + return message.message + + return f"{message.sender or 'Unknown User'} said in Slack:\n{message.message}" + + +def slackify_message_thread(messages: list[ThreadMessage]) -> str: + if not messages: + return "" + + message_strs: list[str] = [] + for message in messages: + if message.role == MessageType.USER: + message_text = ( + f"{message.sender or 'Unknown User'} said in Slack:\n{message.message}" + ) + elif message.role == MessageType.ASSISTANT: + message_text = f"DanswerBot said in Slack:\n{message.message}" + else: + message_text = ( + f"{message.role.value.upper()} said in Slack:\n{message.message}" + ) + message_strs.append(message_text) + + return "\n\n".join(message_strs) diff --git a/backend/danswer/prompts/chat_prompts.py b/backend/danswer/prompts/chat_prompts.py index a5fa973f37c..a9653254f9a 100644 --- a/backend/danswer/prompts/chat_prompts.py +++ b/backend/danswer/prompts/chat_prompts.py @@ -110,8 +110,8 @@ and additional information or details would provide little or no value. - The query is some task that does not require additional information to handle. -{GENERAL_SEP_PAT} Conversation History: +{GENERAL_SEP_PAT} {{chat_history}} {GENERAL_SEP_PAT} @@ -135,8 +135,8 @@ Strip out any information that is not relevant for the retrieval task. If the follow up message is an error or code snippet, repeat the same input back EXACTLY. -{GENERAL_SEP_PAT} Chat History: +{GENERAL_SEP_PAT} {{chat_history}} {GENERAL_SEP_PAT} @@ -152,8 +152,8 @@ If there is a clear change in topic, ensure the query reflects the new topic accurately. Strip out any information that is not relevant for the internet search. -{GENERAL_SEP_PAT} Chat History: +{GENERAL_SEP_PAT} {{chat_history}} {GENERAL_SEP_PAT} @@ -210,6 +210,7 @@ Focus the name on the important keywords to convey the topic of the conversation. 
Chat History: +{GENERAL_SEP_PAT} {{chat_history}} {GENERAL_SEP_PAT} diff --git a/backend/danswer/prompts/direct_qa_prompts.py b/backend/danswer/prompts/direct_qa_prompts.py index 1b7448c081c..5f62cb203a9 100644 --- a/backend/danswer/prompts/direct_qa_prompts.py +++ b/backend/danswer/prompts/direct_qa_prompts.py @@ -2,13 +2,11 @@ # It is used also for the one shot direct QA flow import json -from danswer.prompts.constants import ( - DEFAULT_IGNORE_STATEMENT, - FINAL_QUERY_PAT, - GENERAL_SEP_PAT, - QUESTION_PAT, - THOUGHT_PAT, -) +from danswer.prompts.constants import DEFAULT_IGNORE_STATEMENT +from danswer.prompts.constants import FINAL_QUERY_PAT +from danswer.prompts.constants import GENERAL_SEP_PAT +from danswer.prompts.constants import QUESTION_PAT +from danswer.prompts.constants import THOUGHT_PAT ONE_SHOT_SYSTEM_PROMPT = """ You are a question answering system that is constantly learning and improving. @@ -73,7 +71,8 @@ JSON_PROMPT = f""" {{system_prompt}} {REQUIRE_JSON} -{{context_block}}{{history_block}}{{task_prompt}} +{{context_block}}{{history_block}} +{{task_prompt}} SAMPLE RESPONSE: ``` @@ -112,25 +111,13 @@ Make sure you take into account my employee information in the system message.{DEFAULT_IGNORE_STATEMENT} \ You should always get right to the point, and never use extraneous language. -{{task_prompt}} +{{history_block}}{{task_prompt}} {QUESTION_PAT.upper()} {{user_query}} """ -# For weak LLM which only takes one chunk and cannot output json -# Also not requiring quotes as it tends to not work -WEAK_LLM_PROMPT = f""" -{{system_prompt}} -{{context_block}} -{{task_prompt}} - -{QUESTION_PAT.upper()} -{{user_query}} -""".strip() - - # This is only for visualization for the users to specify their own prompts # The actual flow does not work like this PARAMATERIZED_PROMPT = f""" diff --git a/backend/danswer/prompts/prompt_utils.py b/backend/danswer/prompts/prompt_utils.py index 600ff07209b..ac22797bd6d 100644 --- a/backend/danswer/prompts/prompt_utils.py +++ b/backend/danswer/prompts/prompt_utils.py @@ -4,26 +4,29 @@ import redis from danswer.chat.models import LlmDoc -from danswer.configs.app_configs import AIRTABLE_API_TOKEN -from danswer.configs.app_configs import AIRTABLE_EMPLOYEE_BASE_ID -from danswer.configs.app_configs import AIRTABLE_EMPLOYEE_TABLE_NAME_OR_ID -from danswer.configs.app_configs import REDIS_DB_NUMBER -from danswer.configs.app_configs import REDIS_HOST -from danswer.configs.app_configs import REDIS_PORT +from danswer.configs.app_configs import ( + AIRTABLE_API_TOKEN, + AIRTABLE_EMPLOYEE_BASE_ID, + AIRTABLE_EMPLOYEE_TABLE_NAME_OR_ID, + REDIS_DB_NUMBER, + REDIS_HOST, + REDIS_PORT, +) from danswer.configs.chat_configs import LANGUAGE_HINT from danswer.configs.constants import DocumentSource +from danswer.context.search.models import InferenceChunk from danswer.db.models import Prompt from danswer.llm.answering.models import PromptConfig -from danswer.prompts.chat_prompts import ADDITIONAL_INFO -from danswer.prompts.chat_prompts import CITATION_REMINDER +from danswer.prompts.chat_prompts import ADDITIONAL_INFO, CITATION_REMINDER from danswer.prompts.constants import CODE_BLOCK_PAT -from danswer.search.models import InferenceChunk from danswer.utils.logger import setup_logger from langchain_core.messages import BaseMessage from pyairtable import Api as AirtableApi logger = setup_logger() +logger = setup_logger() + MOST_BASIC_PROMPT = "You are a helpful AI assistant." 
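Reviewer note: the find_last_index hunk later in this file's diff adds an empty-history guard and logs before raising when even the newest message exceeds the budget. A minimal standalone sketch of that backwards token-budget walk, using an assumed buffer value (the real constant lives in this module and may differ):

    # Illustrative sketch only; mirrors the find_last_index logic, not the project's code.
    _PER_MESSAGE_TOKEN_BUFFER = 10  # assumed value for this example

    def find_last_index_sketch(token_counts: list[int], max_prompt_tokens: int) -> int:
        """Walk from the newest message backwards and return the first index whose
        suffix of messages still fits within max_prompt_tokens."""
        if not token_counts:
            return 0

        running_sum = 0
        last_ind = 0
        for i in range(len(token_counts) - 1, -1, -1):
            running_sum += token_counts[i] + _PER_MESSAGE_TOKEN_BUFFER
            if running_sum > max_prompt_tokens:
                last_ind = i + 1
                break

        if last_ind >= len(token_counts):
            raise ValueError("Last message alone is too large!")
        return last_ind

    # With a 100-token budget only the two newest messages fit, so index 1 is returned.
    assert find_last_index_sketch([80, 30, 40], max_prompt_tokens=100) == 1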
DANSWER_DATETIME_REPLACEMENT = "DANSWER_DATETIME_REPLACEMENT" BASIC_TIME_STR = "The current date is {datetime_info}." @@ -71,7 +74,7 @@ def add_employee_context_to_prompt(prompt_str: str, user_email: str) -> str: # Check Redis for cached employee context cached_context = redis_client.get(user_email) if cached_context: - logger.info("Employee context retrieved from Redis.") + logger.info(f"Employee context retrieved from Redis for email: {user_email}") return prompt_str.replace( DANSWER_EMPLOYEE_REPLACEMENT, cached_context.decode("utf-8") ) @@ -186,14 +189,23 @@ def find_last_index(lst: list[int], max_prompt_tokens: int) -> int: before the list exceeds the maximum""" running_sum = 0 + if not lst: + logger.warning("Empty message history passed to find_last_index") + return 0 + last_ind = 0 for i in range(len(lst) - 1, -1, -1): running_sum += lst[i] + _PER_MESSAGE_TOKEN_BUFFER if running_sum > max_prompt_tokens: last_ind = i + 1 break + if last_ind >= len(lst): + logger.error( + f"Last message alone is too large! max_prompt_tokens: {max_prompt_tokens}, message_token_counts: {lst}" + ) raise ValueError("Last message alone is too large!") + return last_ind diff --git a/backend/danswer/redis/redis_connector.py b/backend/danswer/redis/redis_connector.py new file mode 100644 index 00000000000..8b52a2fd811 --- /dev/null +++ b/backend/danswer/redis/redis_connector.py @@ -0,0 +1,78 @@ +import redis + +from danswer.redis.redis_connector_delete import RedisConnectorDelete +from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync +from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync +from danswer.redis.redis_connector_index import RedisConnectorIndex +from danswer.redis.redis_connector_prune import RedisConnectorPrune +from danswer.redis.redis_connector_stop import RedisConnectorStop +from danswer.redis.redis_pool import get_redis_client + + +class RedisConnector: + """Composes several classes to simplify interacting with a connector and its + associated background tasks / associated redis interactions.""" + + def __init__(self, tenant_id: str | None, id: int) -> None: + self.tenant_id: str | None = tenant_id + self.id: int = id + self.redis: redis.Redis = get_redis_client(tenant_id=tenant_id) + + self.stop = RedisConnectorStop(tenant_id, id, self.redis) + self.prune = RedisConnectorPrune(tenant_id, id, self.redis) + self.delete = RedisConnectorDelete(tenant_id, id, self.redis) + self.permissions = RedisConnectorPermissionSync(tenant_id, id, self.redis) + self.external_group_sync = RedisConnectorExternalGroupSync( + tenant_id, id, self.redis + ) + + def new_index(self, search_settings_id: int) -> RedisConnectorIndex: + return RedisConnectorIndex( + self.tenant_id, self.id, search_settings_id, self.redis + ) + + @staticmethod + def get_id_from_fence_key(key: str) -> str | None: + """ + Extracts the object ID from a fence key in the format `PREFIX_fence_X`. + + Args: + key (str): The fence key string. + + Returns: + Optional[int]: The extracted ID if the key is in the correct format, otherwise None. + """ + parts = key.split("_") + if len(parts) != 3: + return None + + object_id = parts[2] + return object_id + + @staticmethod + def get_id_from_task_id(task_id: str) -> str | None: + """ + Extracts the object ID from a task ID string. 
+ + This method assumes the task ID is formatted as `prefix_objectid_suffix`, where: + - `prefix` is an arbitrary string (e.g., the name of the task or entity), + - `objectid` is the ID you want to extract, + - `suffix` is another arbitrary string (e.g., a UUID). + + Example: + If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`, + this method will return the string `"1"`. + + Args: + task_id (str): The task ID string from which to extract the object ID. + + Returns: + str | None: The extracted object ID if the task ID is in the correct format, otherwise None. + """ + # example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc + parts = task_id.split("_") + if len(parts) != 3: + return None + + object_id = parts[1] + return object_id diff --git a/backend/danswer/redis/redis_connector_credential_pair.py b/backend/danswer/redis/redis_connector_credential_pair.py new file mode 100644 index 00000000000..f624fa1542a --- /dev/null +++ b/backend/danswer/redis/redis_connector_credential_pair.py @@ -0,0 +1,118 @@ +import time +from typing import cast +from uuid import uuid4 + +from celery import Celery +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.document import ( + construct_document_select_for_connector_credential_pair_by_needs_sync, +) +from danswer.db.models import Document +from danswer.redis.redis_object_helper import RedisObjectHelper + + +class RedisConnectorCredentialPair(RedisObjectHelper): + """This class is used to scan documents by cc_pair in the db and collect them into + a unified set for syncing. + + It differs from the other redis helpers in that the taskset used spans + all connectors and is not per connector.""" + + PREFIX = "connectorsync" + FENCE_PREFIX = PREFIX + "_fence" + TASKSET_PREFIX = PREFIX + "_taskset" + + def __init__(self, tenant_id: str | None, id: int) -> None: + super().__init__(tenant_id, str(id)) + + # documents that should be skipped + self.skip_docs: set[str] = set() + + @classmethod + def get_fence_key(cls) -> str: + return RedisConnectorCredentialPair.FENCE_PREFIX + + @classmethod + def get_taskset_key(cls) -> str: + return RedisConnectorCredentialPair.TASKSET_PREFIX + + @property + def taskset_key(self) -> str: + """Notice that this is intentionally reusing the same taskset for all + connector syncs""" + # example: connector_taskset + return f"{self.TASKSET_PREFIX}" + + def set_skip_docs(self, skip_docs: set[str]) -> None: + # documents that should be skipped. 
Note that this class updates + # the list on the fly + self.skip_docs = skip_docs + + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + redis_client: Redis, + lock: RedisLock, + tenant_id: str | None, + ) -> tuple[int, int] | None: + last_lock_time = time.monotonic() + + async_results = [] + cc_pair = get_connector_credential_pair_from_id(int(self._id), db_session) + if not cc_pair: + return None + + stmt = construct_document_select_for_connector_credential_pair_by_needs_sync( + cc_pair.connector_id, cc_pair.credential_id + ) + + num_docs = 0 + + for doc in db_session.scalars(stmt).yield_per(1): + doc = cast(Document, doc) + current_time = time.monotonic() + if current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + + num_docs += 1 + + # check if we should skip the document (typically because it's already syncing) + if doc.id in self.skip_docs: + continue + + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + custom_task_id = f"{self.task_id_prefix}_{uuid4()}" + + # add to the tracking taskset in redis BEFORE creating the celery task. + # note that for the moment we are using a single taskset key, not differentiated by cc_pair id + redis_client.sadd( + RedisConnectorCredentialPair.get_taskset_key(), custom_task_id + ) + + # Priority on sync's triggered by new indexing should be medium + result = celery_app.send_task( + "vespa_metadata_sync_task", + kwargs=dict(document_id=doc.id, tenant_id=tenant_id), + queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, + task_id=custom_task_id, + priority=DanswerCeleryPriority.MEDIUM, + ) + + async_results.append(result) + self.skip_docs.add(doc.id) + + return len(async_results), num_docs diff --git a/backend/danswer/redis/redis_connector_delete.py b/backend/danswer/redis/redis_connector_delete.py new file mode 100644 index 00000000000..1b7a440b2e5 --- /dev/null +++ b/backend/danswer/redis/redis_connector_delete.py @@ -0,0 +1,150 @@ +import time +from datetime import datetime +from typing import cast +from uuid import uuid4 + +import redis +from celery import Celery +from pydantic import BaseModel +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.document import construct_document_select_for_connector_credential_pair +from danswer.db.models import Document as DbDocument + + +class RedisConnectorDeletePayload(BaseModel): + num_tasks: int | None + submitted: datetime + + +class RedisConnectorDelete: + """Manages interactions with redis for deletion tasks.
Should only be accessed + through RedisConnector.""" + + PREFIX = "connectordeletion" + FENCE_PREFIX = f"{PREFIX}_fence" # "connectordeletion_fence" + TASKSET_PREFIX = f"{PREFIX}_taskset" # "connectordeletion_taskset" + + def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: + self.tenant_id: str | None = tenant_id + self.id = id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}" + self.taskset_key = f"{self.TASKSET_PREFIX}_{id}" + + def taskset_clear(self) -> None: + self.redis.delete(self.taskset_key) + + def get_remaining(self) -> int: + # todo: move into fence + remaining = cast(int, self.redis.scard(self.taskset_key)) + return remaining + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + @property + def payload(self) -> RedisConnectorDeletePayload | None: + # read related data and evaluate/print task progress + fence_bytes = cast(bytes, self.redis.get(self.fence_key)) + if fence_bytes is None: + return None + + fence_str = fence_bytes.decode("utf-8") + payload = RedisConnectorDeletePayload.model_validate_json(cast(str, fence_str)) + + return payload + + def set_fence(self, payload: RedisConnectorDeletePayload | None) -> None: + if not payload: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, payload.model_dump_json()) + + def _generate_task_id(self) -> str: + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "connectordeletion_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + + return f"{self.PREFIX}_{self.id}_{uuid4()}" + + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + lock: RedisLock, + ) -> int | None: + """Returns None if the cc_pair doesn't exist. + Otherwise, returns an int with the number of generated tasks.""" + last_lock_time = time.monotonic() + + async_results = [] + cc_pair = get_connector_credential_pair_from_id(int(self.id), db_session) + if not cc_pair: + return None + + stmt = construct_document_select_for_connector_credential_pair( + cc_pair.connector_id, cc_pair.credential_id + ) + for doc_temp in db_session.scalars(stmt).yield_per(1): + doc: DbDocument = doc_temp + current_time = time.monotonic() + if current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + + custom_task_id = self._generate_task_id() + + # add to the tracking taskset in redis BEFORE creating the celery task. 
+ # note that for the moment we are using a single taskset key, not differentiated by cc_pair id + self.redis.sadd(self.taskset_key, custom_task_id) + + # Priority on sync's triggered by new indexing should be medium + result = celery_app.send_task( + "document_by_cc_pair_cleanup_task", + kwargs=dict( + document_id=doc.id, + connector_id=cc_pair.connector_id, + credential_id=cc_pair.credential_id, + tenant_id=self.tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_DELETION, + task_id=custom_task_id, + priority=DanswerCeleryPriority.MEDIUM, + ) + + async_results.append(result) + + return len(async_results) + + def reset(self) -> None: + self.redis.delete(self.taskset_key) + self.redis.delete(self.fence_key) + + @staticmethod + def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None: + taskset_key = f"{RedisConnectorDelete.TASKSET_PREFIX}_{id}" + r.srem(taskset_key, task_id) + return + + @staticmethod + def reset_all(r: redis.Redis) -> None: + """Deletes all redis values for all connectors""" + for key in r.scan_iter(RedisConnectorDelete.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorDelete.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_connector_doc_perm_sync.py b/backend/danswer/redis/redis_connector_doc_perm_sync.py new file mode 100644 index 00000000000..d9c3cd814ff --- /dev/null +++ b/backend/danswer/redis/redis_connector_doc_perm_sync.py @@ -0,0 +1,188 @@ +import time +from datetime import datetime +from typing import cast +from uuid import uuid4 + +import redis +from celery import Celery +from pydantic import BaseModel +from redis.lock import Lock as RedisLock + +from danswer.access.models import DocExternalAccess +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues + + +class RedisConnectorPermissionSyncData(BaseModel): + started: datetime | None + + +class RedisConnectorPermissionSync: + """Manages interactions with redis for doc permission sync tasks. 
Should only be accessed + through RedisConnector.""" + + PREFIX = "connectordocpermissionsync" + + FENCE_PREFIX = f"{PREFIX}_fence" + + # phase 1 - generator task and progress signals + GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorpermissions+generator + GENERATOR_PROGRESS_PREFIX = ( + PREFIX + "_generator_progress" + ) # connectorpermissions_generator_progress + GENERATOR_COMPLETE_PREFIX = ( + PREFIX + "_generator_complete" + ) # connectorpermissions_generator_complete + + TASKSET_PREFIX = f"{PREFIX}_taskset" # connectorpermissions_taskset + SUBTASK_PREFIX = f"{PREFIX}+sub" # connectorpermissions+sub + + def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: + self.tenant_id: str | None = tenant_id + self.id = id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}" + self.generator_task_key = f"{self.GENERATORTASK_PREFIX}_{id}" + self.generator_progress_key = f"{self.GENERATOR_PROGRESS_PREFIX}_{id}" + self.generator_complete_key = f"{self.GENERATOR_COMPLETE_PREFIX}_{id}" + + self.taskset_key = f"{self.TASKSET_PREFIX}_{id}" + + self.subtask_prefix: str = f"{self.SUBTASK_PREFIX}_{id}" + + def taskset_clear(self) -> None: + self.redis.delete(self.taskset_key) + + def generator_clear(self) -> None: + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + + def get_remaining(self) -> int: + remaining = cast(int, self.redis.scard(self.taskset_key)) + return remaining + + def get_active_task_count(self) -> int: + """Count of active permission sync tasks""" + count = 0 + for _ in self.redis.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"): + count += 1 + return count + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + @property + def payload(self) -> RedisConnectorPermissionSyncData | None: + # read related data and evaluate/print task progress + fence_bytes = cast(bytes, self.redis.get(self.fence_key)) + if fence_bytes is None: + return None + + fence_str = fence_bytes.decode("utf-8") + payload = RedisConnectorPermissionSyncData.model_validate_json( + cast(str, fence_str) + ) + + return payload + + def set_fence( + self, + payload: RedisConnectorPermissionSyncData | None, + ) -> None: + if not payload: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, payload.model_dump_json()) + + @property + def generator_complete(self) -> int | None: + """the fence payload is an int representing the starting number of + permission sync tasks to be processed ...
just after the generator completes.""" + fence_bytes = self.redis.get(self.generator_complete_key) + if fence_bytes is None: + return None + + if fence_bytes == b"None": + return None + + fence_int = int(cast(bytes, fence_bytes).decode()) + return fence_int + + @generator_complete.setter + def generator_complete(self, payload: int | None) -> None: + """Set the payload to an int to set the fence, otherwise if None it will + be deleted""" + if payload is None: + self.redis.delete(self.generator_complete_key) + return + + self.redis.set(self.generator_complete_key, payload) + + def generate_tasks( + self, + celery_app: Celery, + lock: RedisLock | None, + new_permissions: list[DocExternalAccess], + source_string: str, + ) -> int | None: + last_lock_time = time.monotonic() + async_results = [] + + # Create a task for each document permission sync + for doc_perm in new_permissions: + current_time = time.monotonic() + if lock and current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + # Add task for document permissions sync + custom_task_id = f"{self.subtask_prefix}_{uuid4()}" + self.redis.sadd(self.taskset_key, custom_task_id) + + result = celery_app.send_task( + "update_external_document_permissions_task", + kwargs=dict( + tenant_id=self.tenant_id, + serialized_doc_external_access=doc_perm.to_dict(), + source_string=source_string, + ), + queue=DanswerCeleryQueues.DOC_PERMISSIONS_UPSERT, + task_id=custom_task_id, + priority=DanswerCeleryPriority.MEDIUM, + ) + async_results.append(result) + + return len(async_results) + + @staticmethod + def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None: + taskset_key = f"{RedisConnectorPermissionSync.TASKSET_PREFIX}_{id}" + r.srem(taskset_key, task_id) + return + + @staticmethod + def reset_all(r: redis.Redis) -> None: + """Deletes all redis values for all connectors""" + for key in r.scan_iter(RedisConnectorPermissionSync.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter( + RedisConnectorPermissionSync.GENERATOR_COMPLETE_PREFIX + "*" + ): + r.delete(key) + + for key in r.scan_iter( + RedisConnectorPermissionSync.GENERATOR_PROGRESS_PREFIX + "*" + ): + r.delete(key) + + for key in r.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_connector_ext_group_sync.py b/backend/danswer/redis/redis_connector_ext_group_sync.py new file mode 100644 index 00000000000..631845648c3 --- /dev/null +++ b/backend/danswer/redis/redis_connector_ext_group_sync.py @@ -0,0 +1,134 @@ +from typing import cast + +import redis +from celery import Celery +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + + +class RedisConnectorExternalGroupSync: + """Manages interactions with redis for external group syncing tasks. 
Should only be accessed + through RedisConnector.""" + + PREFIX = "connectorexternalgroupsync" + + FENCE_PREFIX = f"{PREFIX}_fence" + + # phase 1 - generator task and progress signals + GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorexternalgroupsync+generator + GENERATOR_PROGRESS_PREFIX = ( + PREFIX + "_generator_progress" + ) # connectorexternalgroupsync_generator_progress + GENERATOR_COMPLETE_PREFIX = ( + PREFIX + "_generator_complete" + ) # connectorexternalgroupsync_generator_complete + + TASKSET_PREFIX = f"{PREFIX}_taskset" # connectorexternalgroupsync_taskset + SUBTASK_PREFIX = f"{PREFIX}+sub" # connectorexternalgroupsync+sub + + def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: + self.tenant_id: str | None = tenant_id + self.id = id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}" + self.generator_task_key = f"{self.GENERATORTASK_PREFIX}_{id}" + self.generator_progress_key = f"{self.GENERATOR_PROGRESS_PREFIX}_{id}" + self.generator_complete_key = f"{self.GENERATOR_COMPLETE_PREFIX}_{id}" + + self.taskset_key = f"{self.TASKSET_PREFIX}_{id}" + + self.subtask_prefix: str = f"{self.SUBTASK_PREFIX}_{id}" + + def taskset_clear(self) -> None: + self.redis.delete(self.taskset_key) + + def generator_clear(self) -> None: + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + + def get_remaining(self) -> int: + # todo: move into fence + remaining = cast(int, self.redis.scard(self.taskset_key)) + return remaining + + def get_active_task_count(self) -> int: + """Count of active external group syncing tasks""" + count = 0 + for _ in self.redis.scan_iter( + RedisConnectorExternalGroupSync.FENCE_PREFIX + "*" + ): + count += 1 + return count + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + def set_fence(self, value: bool) -> None: + if not value: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, 0) + + @property + def generator_complete(self) -> int | None: + """the fence payload is an int representing the starting number of + external group syncing tasks to be processed ... just after the generator completes.
+ """ + fence_bytes = self.redis.get(self.generator_complete_key) + if fence_bytes is None: + return None + + if fence_bytes == b"None": + return None + + fence_int = int(cast(bytes, fence_bytes).decode()) + return fence_int + + @generator_complete.setter + def generator_complete(self, payload: int | None) -> None: + """Set the payload to an int to set the fence, otherwise if None it will + be deleted""" + if payload is None: + self.redis.delete(self.generator_complete_key) + return + + self.redis.set(self.generator_complete_key, payload) + + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + lock: RedisLock | None, + ) -> int | None: + pass + + @staticmethod + def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None: + taskset_key = f"{RedisConnectorExternalGroupSync.TASKSET_PREFIX}_{id}" + r.srem(taskset_key, task_id) + return + + @staticmethod + def reset_all(r: redis.Redis) -> None: + """Deletes all redis values for all connectors""" + for key in r.scan_iter(RedisConnectorExternalGroupSync.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter( + RedisConnectorExternalGroupSync.GENERATOR_COMPLETE_PREFIX + "*" + ): + r.delete(key) + + for key in r.scan_iter( + RedisConnectorExternalGroupSync.GENERATOR_PROGRESS_PREFIX + "*" + ): + r.delete(key) + + for key in r.scan_iter(RedisConnectorExternalGroupSync.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_connector_index.py b/backend/danswer/redis/redis_connector_index.py new file mode 100644 index 00000000000..10fd3667fda --- /dev/null +++ b/backend/danswer/redis/redis_connector_index.py @@ -0,0 +1,144 @@ +from datetime import datetime +from typing import cast +from uuid import uuid4 + +import redis +from pydantic import BaseModel + + +class RedisConnectorIndexPayload(BaseModel): + index_attempt_id: int | None + started: datetime | None + submitted: datetime + celery_task_id: str | None + + +class RedisConnectorIndex: + """Manages interactions with redis for indexing tasks. 
Should only be accessed + through RedisConnector.""" + + PREFIX = "connectorindexing" + FENCE_PREFIX = f"{PREFIX}_fence" # "connectorindexing_fence" + GENERATOR_TASK_PREFIX = PREFIX + "+generator" # "connectorindexing+generator_fence" + GENERATOR_PROGRESS_PREFIX = ( + PREFIX + "_generator_progress" + ) # connectorindexing_generator_progress + GENERATOR_COMPLETE_PREFIX = ( + PREFIX + "_generator_complete" + ) # connectorindexing_generator_complete + + GENERATOR_LOCK_PREFIX = "da_lock:indexing" + + def __init__( + self, + tenant_id: str | None, + id: int, + search_settings_id: int, + redis: redis.Redis, + ) -> None: + self.tenant_id: str | None = tenant_id + self.id = id + self.search_settings_id = search_settings_id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}/{search_settings_id}" + self.generator_progress_key = ( + f"{self.GENERATOR_PROGRESS_PREFIX}_{id}/{search_settings_id}" + ) + self.generator_complete_key = ( + f"{self.GENERATOR_COMPLETE_PREFIX}_{id}/{search_settings_id}" + ) + self.generator_lock_key = ( + f"{self.GENERATOR_LOCK_PREFIX}_{id}/{search_settings_id}" + ) + + @classmethod + def fence_key_with_ids(cls, cc_pair_id: int, search_settings_id: int) -> str: + return f"{cls.FENCE_PREFIX}_{cc_pair_id}/{search_settings_id}" + + def generate_generator_task_id(self) -> str: + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "connectorindexing+generator_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + + return f"{self.GENERATOR_TASK_PREFIX}_{self.id}/{self.search_settings_id}_{uuid4()}" + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + @property + def payload(self) -> RedisConnectorIndexPayload | None: + # read related data and evaluate/print task progress + fence_bytes = cast(bytes, self.redis.get(self.fence_key)) + if fence_bytes is None: + return None + + fence_str = fence_bytes.decode("utf-8") + payload = RedisConnectorIndexPayload.model_validate_json(cast(str, fence_str)) + + return payload + + def set_fence( + self, + payload: RedisConnectorIndexPayload | None, + ) -> None: + if not payload: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, payload.model_dump_json()) + + def set_generator_complete(self, payload: int | None) -> None: + if not payload: + self.redis.delete(self.generator_complete_key) + return + + self.redis.set(self.generator_complete_key, payload) + + def generator_clear(self) -> None: + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + + def get_progress(self) -> int | None: + """Returns None if the key doesn't exist.""" + # TODO: move into fence? + bytes = self.redis.get(self.generator_progress_key) + if bytes is None: + return None + + progress = int(cast(int, bytes)) + return progress + + def get_completion(self) -> int | None: + # TODO: move into fence?
+ bytes = self.redis.get(self.generator_complete_key) + if bytes is None: + return None + + status = int(cast(int, bytes)) + return status + + def reset(self) -> None: + self.redis.delete(self.generator_lock_key) + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + self.redis.delete(self.fence_key) + + @staticmethod + def reset_all(r: redis.Redis) -> None: + """Deletes all redis values for all connectors""" + for key in r.scan_iter(RedisConnectorIndex.GENERATOR_LOCK_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorIndex.GENERATOR_COMPLETE_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorIndex.GENERATOR_PROGRESS_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorIndex.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_connector_prune.py b/backend/danswer/redis/redis_connector_prune.py new file mode 100644 index 00000000000..f8e6f372619 --- /dev/null +++ b/backend/danswer/redis/redis_connector_prune.py @@ -0,0 +1,178 @@ +import time +from typing import cast +from uuid import uuid4 + +import redis +from celery import Celery +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id + + +class RedisConnectorPrune: + """Manages interactions with redis for pruning tasks. Should only be accessed + through RedisConnector.""" + + PREFIX = "connectorpruning" + + FENCE_PREFIX = f"{PREFIX}_fence" + + # phase 1 - generator task and progress signals + GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorpruning+generator + GENERATOR_PROGRESS_PREFIX = ( + PREFIX + "_generator_progress" + ) # connectorpruning_generator_progress + GENERATOR_COMPLETE_PREFIX = ( + PREFIX + "_generator_complete" + ) # connectorpruning_generator_complete + + TASKSET_PREFIX = f"{PREFIX}_taskset" # connectorpruning_taskset + SUBTASK_PREFIX = f"{PREFIX}+sub" # connectorpruning+sub + + def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: + self.tenant_id: str | None = tenant_id + self.id = id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}" + self.generator_task_key = f"{self.GENERATORTASK_PREFIX}_{id}" + self.generator_progress_key = f"{self.GENERATOR_PROGRESS_PREFIX}_{id}" + self.generator_complete_key = f"{self.GENERATOR_COMPLETE_PREFIX}_{id}" + + self.taskset_key = f"{self.TASKSET_PREFIX}_{id}" + + self.subtask_prefix: str = f"{self.SUBTASK_PREFIX}_{id}" + + def taskset_clear(self) -> None: + self.redis.delete(self.taskset_key) + + def generator_clear(self) -> None: + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + + def get_remaining(self) -> int: + # todo: move into fence + remaining = cast(int, self.redis.scard(self.taskset_key)) + return remaining + + def get_active_task_count(self) -> int: + """Count of active pruning tasks""" + count = 0 + for key in self.redis.scan_iter(RedisConnectorPrune.FENCE_PREFIX + "*"): + count += 1 + return count + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + def set_fence(self, value: bool) -> None: + if not value: + self.redis.delete(self.fence_key) + return + +
self.redis.set(self.fence_key, 0) + + @property + def generator_complete(self) -> int | None: + """the fence payload is an int representing the starting number of + pruning tasks to be processed ... just after the generator completes.""" + fence_bytes = self.redis.get(self.generator_complete_key) + if fence_bytes is None: + return None + + fence_int = cast(int, fence_bytes) + return fence_int + + @generator_complete.setter + def generator_complete(self, payload: int | None) -> None: + """Set the payload to an int to set the fence, otherwise if None it will + be deleted""" + if payload is None: + self.redis.delete(self.generator_complete_key) + return + + self.redis.set(self.generator_complete_key, payload) + + def generate_tasks( + self, + documents_to_prune: set[str], + celery_app: Celery, + db_session: Session, + lock: RedisLock | None, + ) -> int | None: + last_lock_time = time.monotonic() + + async_results = [] + cc_pair = get_connector_credential_pair_from_id(int(self.id), db_session) + if not cc_pair: + return None + + for doc_id in documents_to_prune: + current_time = time.monotonic() + if lock and current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + custom_task_id = f"{self.subtask_prefix}_{uuid4()}" + + # add to the tracking taskset in redis BEFORE creating the celery task. + self.redis.sadd(self.taskset_key, custom_task_id) + + # Priority on sync's triggered by new indexing should be medium + result = celery_app.send_task( + "document_by_cc_pair_cleanup_task", + kwargs=dict( + document_id=doc_id, + connector_id=cc_pair.connector_id, + credential_id=cc_pair.credential_id, + tenant_id=self.tenant_id, + ), + queue=DanswerCeleryQueues.CONNECTOR_DELETION, + task_id=custom_task_id, + priority=DanswerCeleryPriority.MEDIUM, + ) + + async_results.append(result) + + return len(async_results) + + def reset(self) -> None: + self.redis.delete(self.generator_progress_key) + self.redis.delete(self.generator_complete_key) + self.redis.delete(self.taskset_key) + self.redis.delete(self.fence_key) + + @staticmethod + def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None: + taskset_key = f"{RedisConnectorPrune.TASKSET_PREFIX}_{id}" + r.srem(taskset_key, task_id) + return + + @staticmethod + def reset_all(r: redis.Redis) -> None: + """Deletes all redis values for all connectors""" + for key in r.scan_iter(RedisConnectorPrune.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorPrune.GENERATOR_COMPLETE_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorPrune.GENERATOR_PROGRESS_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisConnectorPrune.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_connector_stop.py b/backend/danswer/redis/redis_connector_stop.py new file mode 100644 index 00000000000..c65c57ff7f4 --- /dev/null +++ b/backend/danswer/redis/redis_connector_stop.py @@ -0,0 +1,34 @@ +import redis + + +class RedisConnectorStop: + """Manages interactions with redis for stop signaling. 
Should only be accessed + through RedisConnector.""" + + FENCE_PREFIX = "connectorstop_fence" + + def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None: + self.tenant_id: str | None = tenant_id + self.id: int = id + self.redis = redis + + self.fence_key: str = f"{self.FENCE_PREFIX}_{id}" + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + def set_fence(self, value: bool) -> None: + if not value: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, 0) + + @staticmethod + def reset_all(r: redis.Redis) -> None: + for key in r.scan_iter(RedisConnectorStop.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_document_set.py b/backend/danswer/redis/redis_document_set.py new file mode 100644 index 00000000000..879d955eb88 --- /dev/null +++ b/backend/danswer/redis/redis_document_set.py @@ -0,0 +1,100 @@ +import time +from typing import cast +from uuid import uuid4 + +import redis +from celery import Celery +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.db.document_set import construct_document_select_by_docset +from danswer.redis.redis_object_helper import RedisObjectHelper + + +class RedisDocumentSet(RedisObjectHelper): + PREFIX = "documentset" + FENCE_PREFIX = PREFIX + "_fence" + TASKSET_PREFIX = PREFIX + "_taskset" + + def __init__(self, tenant_id: str | None, id: int) -> None: + super().__init__(tenant_id, str(id)) + + @property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + def set_fence(self, payload: int | None) -> None: + if payload is None: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, payload) + + @property + def payload(self) -> int | None: + bytes = self.redis.get(self.fence_key) + if bytes is None: + return None + + progress = int(cast(int, bytes)) + return progress + + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + redis_client: Redis, + lock: RedisLock, + tenant_id: str | None, + ) -> tuple[int, int] | None: + last_lock_time = time.monotonic() + + async_results = [] + stmt = construct_document_select_by_docset(int(self._id), current_only=False) + for doc in db_session.scalars(stmt).yield_per(1): + current_time = time.monotonic() + if current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + custom_task_id = f"{self.task_id_prefix}_{uuid4()}" + + # add to the set BEFORE creating the task. 
+ redis_client.sadd(self.taskset_key, custom_task_id) + + result = celery_app.send_task( + "vespa_metadata_sync_task", + kwargs=dict(document_id=doc.id, tenant_id=tenant_id), + queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, + task_id=custom_task_id, + priority=DanswerCeleryPriority.LOW, + ) + + async_results.append(result) + + return len(async_results), len(async_results) + + def reset(self) -> None: + self.redis.delete(self.taskset_key) + self.redis.delete(self.fence_key) + + @staticmethod + def reset_all(r: redis.Redis) -> None: + for key in r.scan_iter(RedisDocumentSet.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/redis/redis_object_helper.py b/backend/danswer/redis/redis_object_helper.py new file mode 100644 index 00000000000..35366a36aab --- /dev/null +++ b/backend/danswer/redis/redis_object_helper.py @@ -0,0 +1,97 @@ +from abc import ABC +from abc import abstractmethod + +from celery import Celery +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.redis.redis_pool import get_redis_client + + +class RedisObjectHelper(ABC): + PREFIX = "base" + FENCE_PREFIX = PREFIX + "_fence" + TASKSET_PREFIX = PREFIX + "_taskset" + + def __init__(self, tenant_id: str | None, id: str): + self._tenant_id: str | None = tenant_id + self._id: str = id + self.redis = get_redis_client(tenant_id=tenant_id) + + @property + def task_id_prefix(self) -> str: + return f"{self.PREFIX}_{self._id}" + + @property + def fence_key(self) -> str: + # example: documentset_fence_1 + return f"{self.FENCE_PREFIX}_{self._id}" + + @property + def taskset_key(self) -> str: + # example: documentset_taskset_1 + return f"{self.TASKSET_PREFIX}_{self._id}" + + @staticmethod + def get_id_from_fence_key(key: str) -> str | None: + """ + Extracts the object ID from a fence key in the format `PREFIX_fence_X`. + + Args: + key (str): The fence key string. + + Returns: + Optional[int]: The extracted ID if the key is in the correct format, otherwise None. + """ + parts = key.split("_") + if len(parts) != 3: + return None + + object_id = parts[2] + return object_id + + @staticmethod + def get_id_from_task_id(task_id: str) -> str | None: + """ + Extracts the object ID from a task ID string. + + This method assumes the task ID is formatted as `prefix_objectid_suffix`, where: + - `prefix` is an arbitrary string (e.g., the name of the task or entity), + - `objectid` is the ID you want to extract, + - `suffix` is another arbitrary string (e.g., a UUID). + + Example: + If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`, + this method will return the string `"1"`. + + Args: + task_id (str): The task ID string from which to extract the object ID. + + Returns: + str | None: The extracted object ID if the task ID is in the correct format, otherwise None. + """ + # example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc + parts = task_id.split("_") + if len(parts) != 3: + return None + + object_id = parts[1] + return object_id + + @abstractmethod + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + redis_client: Redis, + lock: RedisLock, + tenant_id: str | None, + ) -> tuple[int, int] | None: + """First element should be the number of actual tasks generated, second should + be the number of docs that were candidates to be synced for the cc pair. 
+ + The need for this is when we are syncing stale docs referenced by multiple + connectors. In a single pass across multiple cc pairs, we only want a task + to be created for a particular document id the first time we see it. + The rest can be skipped.""" diff --git a/backend/danswer/redis/redis_pool.py b/backend/danswer/redis/redis_pool.py index 25b932dbcd3..0fe5c0ddd42 100644 --- a/backend/danswer/redis/redis_pool.py +++ b/backend/danswer/redis/redis_pool.py @@ -1,25 +1,123 @@ +import functools import threading +from collections.abc import Callable +from typing import Any from typing import Optional import redis from redis.client import Redis -from redis.connection import ConnectionPool from danswer.configs.app_configs import REDIS_DB_NUMBER +from danswer.configs.app_configs import REDIS_HEALTH_CHECK_INTERVAL from danswer.configs.app_configs import REDIS_HOST from danswer.configs.app_configs import REDIS_PASSWORD +from danswer.configs.app_configs import REDIS_POOL_MAX_CONNECTIONS from danswer.configs.app_configs import REDIS_PORT from danswer.configs.app_configs import REDIS_SSL from danswer.configs.app_configs import REDIS_SSL_CA_CERTS from danswer.configs.app_configs import REDIS_SSL_CERT_REQS +from danswer.configs.constants import REDIS_SOCKET_KEEPALIVE_OPTIONS +from danswer.utils.logger import setup_logger -REDIS_POOL_MAX_CONNECTIONS = 10 +logger = setup_logger() + + +class TenantRedis(redis.Redis): + def __init__(self, tenant_id: str, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + self.tenant_id: str = tenant_id + + def _prefixed(self, key: str | bytes | memoryview) -> str | bytes | memoryview: + prefix: str = f"{self.tenant_id}:" + if isinstance(key, str): + if key.startswith(prefix): + return key + else: + return prefix + key + elif isinstance(key, bytes): + prefix_bytes = prefix.encode() + if key.startswith(prefix_bytes): + return key + else: + return prefix_bytes + key + elif isinstance(key, memoryview): + key_bytes = key.tobytes() + prefix_bytes = prefix.encode() + if key_bytes.startswith(prefix_bytes): + return key + else: + return memoryview(prefix_bytes + key_bytes) + else: + raise TypeError(f"Unsupported key type: {type(key)}") + + def _prefix_method(self, method: Callable) -> Callable: + @functools.wraps(method) + def wrapper(*args: Any, **kwargs: Any) -> Any: + if "name" in kwargs: + kwargs["name"] = self._prefixed(kwargs["name"]) + elif len(args) > 0: + args = (self._prefixed(args[0]),) + args[1:] + return method(*args, **kwargs) + + return wrapper + + def _prefix_scan_iter(self, method: Callable) -> Callable: + @functools.wraps(method) + def wrapper(*args: Any, **kwargs: Any) -> Any: + # Prefix the match pattern if provided + if "match" in kwargs: + kwargs["match"] = self._prefixed(kwargs["match"]) + elif len(args) > 0: + args = (self._prefixed(args[0]),) + args[1:] + + # Get the iterator + iterator = method(*args, **kwargs) + + # Remove prefix from returned keys + prefix = f"{self.tenant_id}:".encode() + prefix_len = len(prefix) + + for key in iterator: + if isinstance(key, bytes) and key.startswith(prefix): + yield key[prefix_len:] + else: + yield key + + return wrapper + + def __getattribute__(self, item: str) -> Any: + original_attr = super().__getattribute__(item) + methods_to_wrap = [ + "lock", + "unlock", + "get", + "set", + "delete", + "exists", + "incrby", + "hset", + "hget", + "getset", + "owned", + "reacquire", + "create_lock", + "startswith", + "sadd", + "srem", + "scard", + ] # Regular methods that need simple prefixing + + if
item == "scan_iter": + return self._prefix_scan_iter(original_attr) + elif item in methods_to_wrap and callable(original_attr): + return self._prefix_method(original_attr) + return original_attr class RedisPool: _instance: Optional["RedisPool"] = None _lock: threading.Lock = threading.Lock() - _pool: ConnectionPool + _pool: redis.BlockingConnectionPool def __new__(cls) -> "RedisPool": if not cls._instance: @@ -32,8 +130,10 @@ def __new__(cls) -> "RedisPool": def _init_pool(self) -> None: self._pool = RedisPool.create_pool(ssl=REDIS_SSL) - def get_client(self) -> Redis: - return redis.Redis(connection_pool=self._pool) + def get_client(self, tenant_id: str | None) -> Redis: + if tenant_id is None: + tenant_id = "public" + return TenantRedis(tenant_id, connection_pool=self._pool) @staticmethod def create_pool( @@ -42,33 +142,52 @@ def create_pool( db: int = REDIS_DB_NUMBER, password: str = REDIS_PASSWORD, max_connections: int = REDIS_POOL_MAX_CONNECTIONS, - ssl_ca_certs: str = REDIS_SSL_CA_CERTS, + ssl_ca_certs: str | None = REDIS_SSL_CA_CERTS, ssl_cert_reqs: str = REDIS_SSL_CERT_REQS, ssl: bool = False, - ) -> redis.ConnectionPool: + ) -> redis.BlockingConnectionPool: + """We use BlockingConnectionPool because it will block and wait for a connection + rather than error if max_connections is reached. This is far more deterministic + behavior and aligned with how we want to use Redis.""" + # Using ConnectionPool is not well documented. # Useful examples: https://github.com/redis/redis-py/issues/780 if ssl: - return redis.ConnectionPool( + return redis.BlockingConnectionPool( host=host, port=port, db=db, password=password, max_connections=max_connections, + timeout=None, + health_check_interval=REDIS_HEALTH_CHECK_INTERVAL, + socket_keepalive=True, + socket_keepalive_options=REDIS_SOCKET_KEEPALIVE_OPTIONS, connection_class=redis.SSLConnection, ssl_ca_certs=ssl_ca_certs, ssl_cert_reqs=ssl_cert_reqs, ) - return redis.ConnectionPool( + return redis.BlockingConnectionPool( host=host, port=port, db=db, password=password, max_connections=max_connections, + timeout=None, + health_check_interval=REDIS_HEALTH_CHECK_INTERVAL, + socket_keepalive=True, + socket_keepalive_options=REDIS_SOCKET_KEEPALIVE_OPTIONS, ) +redis_pool = RedisPool() + + +def get_redis_client(*, tenant_id: str | None) -> Redis: + return redis_pool.get_client(tenant_id) + + # # Usage example # redis_pool = RedisPool() # redis_client = redis_pool.get_client() diff --git a/backend/danswer/redis/redis_usergroup.py b/backend/danswer/redis/redis_usergroup.py new file mode 100644 index 00000000000..7c49b9c7fb8 --- /dev/null +++ b/backend/danswer/redis/redis_usergroup.py @@ -0,0 +1,113 @@ +import time +from typing import cast +from uuid import uuid4 + +import redis +from celery import Celery +from redis import Redis +from redis.lock import Lock as RedisLock +from sqlalchemy.orm import Session + +from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT +from danswer.configs.constants import DanswerCeleryPriority +from danswer.configs.constants import DanswerCeleryQueues +from danswer.redis.redis_object_helper import RedisObjectHelper +from danswer.utils.variable_functionality import fetch_versioned_implementation +from danswer.utils.variable_functionality import global_version + + +class RedisUserGroup(RedisObjectHelper): + PREFIX = "usergroup" + FENCE_PREFIX = PREFIX + "_fence" + TASKSET_PREFIX = PREFIX + "_taskset" + + def __init__(self, tenant_id: str | None, id: int) -> None: + super().__init__(tenant_id, str(id)) + + 
@property + def fenced(self) -> bool: + if self.redis.exists(self.fence_key): + return True + + return False + + def set_fence(self, payload: int | None) -> None: + if payload is None: + self.redis.delete(self.fence_key) + return + + self.redis.set(self.fence_key, payload) + + @property + def payload(self) -> int | None: + bytes = self.redis.get(self.fence_key) + if bytes is None: + return None + + progress = int(cast(int, bytes)) + return progress + + def generate_tasks( + self, + celery_app: Celery, + db_session: Session, + redis_client: Redis, + lock: RedisLock, + tenant_id: str | None, + ) -> tuple[int, int] | None: + last_lock_time = time.monotonic() + + async_results = [] + + if not global_version.is_ee_version(): + return 0, 0 + + try: + construct_document_select_by_usergroup = fetch_versioned_implementation( + "danswer.db.user_group", + "construct_document_select_by_usergroup", + ) + except ModuleNotFoundError: + return 0, 0 + + stmt = construct_document_select_by_usergroup(int(self._id)) + for doc in db_session.scalars(stmt).yield_per(1): + current_time = time.monotonic() + if current_time - last_lock_time >= ( + CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4 + ): + lock.reacquire() + last_lock_time = current_time + + # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac" + # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac" + # we prefix the task id so it's easier to keep track of who created the task + # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" + custom_task_id = f"{self.task_id_prefix}_{uuid4()}" + + # add to the set BEFORE creating the task. + redis_client.sadd(self.taskset_key, custom_task_id) + + result = celery_app.send_task( + "vespa_metadata_sync_task", + kwargs=dict(document_id=doc.id, tenant_id=tenant_id), + queue=DanswerCeleryQueues.VESPA_METADATA_SYNC, + task_id=custom_task_id, + priority=DanswerCeleryPriority.LOW, + ) + + async_results.append(result) + + return len(async_results), len(async_results) + + def reset(self) -> None: + self.redis.delete(self.taskset_key) + self.redis.delete(self.fence_key) + + @staticmethod + def reset_all(r: redis.Redis) -> None: + for key in r.scan_iter(RedisUserGroup.TASKSET_PREFIX + "*"): + r.delete(key) + + for key in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"): + r.delete(key) diff --git a/backend/danswer/secondary_llm_flows/agentic_evaluation.py b/backend/danswer/secondary_llm_flows/agentic_evaluation.py index 03121e3cf1d..2da61f4cb51 100644 --- a/backend/danswer/secondary_llm_flows/agentic_evaluation.py +++ b/backend/danswer/secondary_llm_flows/agentic_evaluation.py @@ -1,12 +1,12 @@ import re from danswer.chat.models import SectionRelevancePiece +from danswer.context.search.models import InferenceSection from danswer.llm.interfaces import LLM from danswer.llm.utils import dict_based_prompt_to_langchain_prompt from danswer.llm.utils import message_to_string from danswer.prompts.agentic_evaluation import AGENTIC_SEARCH_SYSTEM_PROMPT from danswer.prompts.agentic_evaluation import AGENTIC_SEARCH_USER_PROMPT -from danswer.search.models import InferenceSection from danswer.utils.logger import setup_logger logger = setup_logger() diff --git a/backend/danswer/secondary_llm_flows/query_validation.py b/backend/danswer/secondary_llm_flows/query_validation.py index 2ee428f0090..d11e603715e 100644 --- a/backend/danswer/secondary_llm_flows/query_validation.py +++ b/backend/danswer/secondary_llm_flows/query_validation.py @@ -1,9 +1,9 @@ +# NOTE No longer used. 
This needs to be revisited later. import re from collections.abc import Iterator from danswer.chat.models import DanswerAnswerPiece from danswer.chat.models import StreamingError -from danswer.configs.chat_configs import DISABLE_LLM_QUERY_ANSWERABILITY from danswer.llm.exceptions import GenAIDisabledException from danswer.llm.factory import get_default_llms from danswer.llm.utils import dict_based_prompt_to_langchain_prompt @@ -46,7 +46,7 @@ def extract_answerability_bool(model_raw: str) -> bool: def get_query_answerability( - user_query: str, skip_check: bool = DISABLE_LLM_QUERY_ANSWERABILITY + user_query: str, skip_check: bool = False ) -> tuple[str, bool]: if skip_check: return "Query Answerability Evaluation feature is turned off", True @@ -67,7 +67,7 @@ def get_query_answerability( def stream_query_answerability( - user_query: str, skip_check: bool = DISABLE_LLM_QUERY_ANSWERABILITY + user_query: str, skip_check: bool = False ) -> Iterator[str]: if skip_check: yield get_json_line( diff --git a/backend/danswer/seeding/__init__.py b/backend/danswer/seeding/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/danswer/seeding/initial_docs.json b/backend/danswer/seeding/initial_docs.json new file mode 100644 index 00000000000..62c66d1bc83 --- /dev/null +++ b/backend/danswer/seeding/initial_docs.json @@ -0,0 +1,3656 @@ +[ + { + "url": "https://docs.danswer.dev/more/use_cases/overview", + "title": "Use Cases Overview", + "content": "How to leverage Danswer in your organization\n\nDanswer Overview\nDanswer is the AI Assistant connected to your organization's docs, apps, and people. Danswer makes Generative AI more versatile for work by enabling new types of questions like \"What is the most common feature request we've heard from customers this month\". Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\nDanswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\nCommon Getting Started Questions:\n\nWhy are these docs connected in my Danswer deployment?\nAnswer: This is just an example of how connectors work in Danswer. You can connect up your own team's knowledge and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date and in sync with your connected applications.\n\nIs my data being sent anywhere when I connect it up to Danswer?\nAnswer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know exactly what is going on with their data. By default all of the document processing happens within Danswer. The only time it is sent outward is for the GenAI call to generate answers.\n\nWhere is the feature for auto sync-ing document level access permissions from all connected sources?\nAnswer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. If you are on Danswer Cloud, you have access to them by default. 
If you're running it yourself, reach out to the Danswer team to receive access.", + "title_embedding": [ + 0.013585364446043968, 0.06531507521867752, -0.0001728831703076139, + -0.003940831869840622, 0.044078364968299866, -0.006206007208675146, + -0.008377128280699253, -0.0193742997944355, -0.018904175609350204, + 0.00868070125579834, -0.005770757794380188, 0.018564216792583466, + 0.030414527282118797, 0.0327068567276001, -0.0336286760866642, + -0.0517829954624176, 0.0029869426507502794, -3.836356700048782e-5, + -0.006240167189389467, 0.011168955825269222, -0.04732134938240051, + -0.05257624015212059, 0.017937077209353447, -0.029843101277947426, + 0.03417196497321129, 0.008637758903205395, -0.016427641734480858, + 0.017053034156560898, -0.02532368339598179, -0.016002299264073372, + 0.04696495085954666, 0.03518024459481239, 0.02884317748248577, + -0.06098122522234917, -0.024405447766184807, -0.07693222165107727, + 0.026796545833349228, -0.01345108263194561, 0.030546706169843674, + 0.00011662459291983396, 0.0362892709672451, 0.020864704623818398, + -0.0030571012757718563, -0.014566082507371902, 0.056138113141059875, + -0.01727251335978508, 0.05477291718125343, 0.019774138927459717, + 0.01646329089999199, -0.020768629387021065, -0.025477997958660126, + 0.010012250393629074, 0.0037975965533405542, -0.076152004301548, + 0.032632406800985336, -0.00799622479826212, 0.029365766793489456, + 0.02749769017100334, 0.030807621777057648, -0.031911835074424744, + 0.029850834980607033, 0.05788583680987358, -0.07022606581449509, + 0.057270754128694534, -0.012120618484914303, -0.0351836234331131, + 0.002640453167259693, 0.01869964227080345, 0.010610891506075859, + -0.06439802795648575, 0.06629050523042679, 0.00746738538146019, + 0.01690092496573925, -0.025001635774970055, -0.04047262296080589, + -0.04058482125401497, 0.01863245666027069, -0.021404679864645004, + -0.006766777019947767, 0.05071299150586128, 0.02962121181190014, + -0.06122489646077156, 0.019276190549135208, -0.03599821403622627, + 0.07253828644752502, -0.001938252942636609, -0.00785142183303833, + -0.015793368220329285, -0.06096868962049484, -0.022668933495879173, + 0.014094856567680836, 0.03107546642422676, 0.030937792733311653, + 0.04295564815402031, -0.06191089749336243, 0.010305442847311497, + 0.006790813058614731, -0.05027518793940544, 0.026334501802921295, + -0.03490821272134781, -0.03132909536361694, 0.00332127814181149, + 0.03326006978750229, 0.05009974539279938, 0.05102463811635971, + 0.080863356590271, 0.008220085874199867, 0.015490916557610035, + -0.029478086158633232, 0.004051747731864452, -0.05233829841017723, + 0.032338161021471024, 0.016430383548140526, 0.033909399062395096, + -0.0069341897033154964, -0.01092524453997612, 0.08201614767313004, + -0.061916135251522064, -0.0202189888805151, 0.06966497749090195, + 0.01732165738940239, 0.020277557894587517, 0.005021766293793917, + 0.03159264847636223, 0.027503052726387978, -0.03912171721458435, + -0.03356969729065895, 0.018767613917589188, 0.02705945260822773, + -0.06412986665964127, 0.01829575188457966, 0.030256258323788643, + 0.0074773263186216354, -0.059738945215940475, 0.042067185044288635, + 0.05707620456814766, -0.02195868454873562, -0.018797017633914948, + 0.07043126970529556, -0.007300470490008593, 0.04988619685173035, + -0.01761087030172348, 0.024358391761779785, 0.00520830973982811, + 0.007853846065700054, -0.040487151592969894, 0.013271456584334373, + 0.01356235146522522, -0.07054886221885681, 0.046908896416425705, + 0.0032781255431473255, 0.022826792672276497, 
0.016794828698039055, + -0.0015572791453450918, -0.03220144659280777, -0.05249498412013054, + -0.023642878979444504, -0.0013240000698715448, 0.020749850198626518, + -0.032788924872875214, 0.01523237768560648, 0.03563430905342102, + -0.011741658672690392, 0.08171892166137695, -0.04966546595096588, + -0.02209051325917244, 0.005958004854619503, 0.02892436273396015, + 0.03561494126915932, 0.02638504095375538, 0.041660238057374954, + -0.05757247656583786, 0.027456382289528847, -0.011119958013296127, + 0.05332427844405174, 0.03419065102934837, 0.09803897142410278, + -0.0104225380346179, 0.06645305454730988, 0.02061033807694912, + -0.0188175980001688, -0.03409148380160332, -0.010915222577750683, + 0.016926083713769913, -0.01010509766638279, -0.031197063624858856, + 0.064297154545784, -0.047280170023441315, -0.022006161510944366, + 0.04089798405766487, 0.0013760487781837583, 0.0012917317217215896, + -0.010142299346625805, -0.05629380792379379, -0.058489665389060974, + -0.06434599310159683, 0.04390622675418854, 0.03466123342514038, + -0.002495409222319722, -0.01867988333106041, -0.012142776511609554, + 0.025940915569663048, -0.05517507344484329, 0.026919366791844368, + -0.05310383439064026, 0.0020175466779619455, 0.0407392643392086, + -0.0055900681763887405, 0.028038354590535164, 0.10158932954072952, + 0.056325044482946396, 0.016724230721592903, 0.005659179296344519, + 0.04764577001333237, -0.03514963388442993, 0.03311126306653023, + -0.05855907127261162, -0.007677929475903511, -0.0368916280567646, + 0.02390834502875805, 0.021506410092115402, -0.022855432704091072, + 0.02669590339064598, 0.03190927952528, 0.026299884542822838, + 0.04545223340392113, -0.04817903786897659, 0.08401483297348022, + -0.0017600113060325384, -0.0026402128860354424, -0.06519021838903427, + -0.08366627246141434, 0.025473223999142647, -0.03265143185853958, + -0.026078224182128906, -0.014162144623696804, -0.024846363812685013, + 0.042588867247104645, -0.00620845053344965, 0.0341552197933197, + -0.005032224114984274, 0.039284951984882355, 0.02678983099758625, + -0.02592509239912033, -0.0334317646920681, -0.017748532816767693, + -0.03175748512148857, -0.03699929639697075, -0.0009614137816242874, + 0.029971860349178314, 0.03400350362062454, 0.03034038282930851, + 0.005932188127189875, 0.05225752666592598, -0.032566532492637634, + -0.04808121547102928, -0.023166747763752937, 0.02398361638188362, + -0.03062198869884014, -0.046609822660684586, 0.019089654088020325, + 0.0017758660251274705, 0.015751969069242477, -0.029143214225769043, + 0.0024112602695822716, -0.02520643174648285, 0.023578567430377007, + -0.023457514122128487, 0.013982303440570831, -0.016386305913329124, + -0.002706830855458975, -0.016093457117676735, -0.037887830287218094, + 0.06287679821252823, 0.00989477802067995, -0.026199528947472572, + 0.0037161086220294237, -0.027242952957749367, -0.03319230675697327, + -0.026061702519655228, 0.015717124566435814, -0.055130068212747574, + -0.02499731443822384, -0.014253406785428524, 0.046770382672548294, + 0.008143531158566475, 0.005510109476745129, -0.02712511457502842, + -0.03787349909543991, 0.013756319880485535, 0.00579818757250905, + 0.008403831161558628, 0.029755474999547005, -0.0032762265764176846, + 0.0044027529656887054, 0.03601987659931183, 0.0909135565161705, + -0.007387327961623669, -0.005328672006726265, -0.03983118385076523, + -0.045495130121707916, 0.022088482975959778, -0.04040846228599548, + -0.0028130451682955027, 0.03781481459736824, 0.03704448416829109, + 0.03319826349616051, 0.0018484846223145723, 
0.0547247938811779, + 0.019755663350224495, -0.07568438351154327, 0.05122718587517738, + -0.02555399388074875, 0.06782808154821396, -0.0756291076540947, + 0.05646894872188568, 0.06156547740101814, -0.0010290262289345264, + 0.02769431658089161, 0.003549074986949563, 0.02812255173921585, + -0.016937678679823875, 0.018674779683351517, -0.037636883556842804, + -0.002664038445800543, 0.023414231836795807, 0.040655869990587234, + 0.027929119765758514, 0.03510269895195961, -0.012380925938487053, + 0.024845607578754425, 0.027425218373537064, -0.05437726899981499, + 0.015804018825292587, 0.05077793449163437, -0.0003959169262088835, + 0.016312288120388985, -0.007089096121490002, -0.018367605283856392, + 0.02974492497742176, 0.08662278950214386, -0.021586472168564796, + -0.01729869470000267, -0.04846135899424553, -0.03031736984848976, + 0.002749247709289193, 0.02350122295320034, -0.0211945790797472, + 0.03907554969191551, -0.023193899542093277, -0.017260679975152016, + -0.03159818798303604, -0.03952740877866745, 0.010126064531505108, + -0.04888703301548958, 0.06297406554222107, 0.03254289552569389, + 0.004272142890840769, -0.03231256827712059, -0.04512784630060196, + 0.0043722353875637054, -0.02994321472942829, 0.05848870426416397, + 0.003534652292728424, 0.007630845997482538, 0.017482444643974304, + 0.04071490466594696, 0.008809284307062626, -0.03566472604870796, + -0.029327288269996643, -0.017241651192307472, -0.012668757699429989, + 0.05879855155944824, 0.05893324315547943, 0.09900359064340591, + 0.028096094727516174, -0.036374326795339584, 0.06244330108165741, + -0.03114512376487255, -0.028666621074080467, 0.06343588978052139, + 0.025132114067673683, -0.01625697687268257, 0.019650116562843323, + -0.049646493047475815, -0.03520796075463295, 0.03757908195257187, + 0.002519423607736826, 0.03556838259100914, -0.017592694610357285, + 0.0010467531392350793, -0.06738362461328506, -0.025265797972679138, + 0.008135112002491951, -0.01762012392282486, -0.024728305637836456, + -0.03567385673522949, 0.018016908317804337, 0.06866948306560516, + 0.03130311518907547, -0.0297296904027462, -0.006176969967782497, + 0.04329727590084076, 0.044129278510808945, -0.020673662424087524, + 0.06023940071463585, -0.004932863637804985, -0.050380971282720566, + -0.034760732203722, 0.00199303706176579, 0.05686243996024132, + -0.0148441381752491, -0.012425840832293034, -0.011634211987257004, + 0.022722775116562843, -0.008717222139239311, 0.020749682560563087, + -0.0277851615101099, 0.0007777228020131588, 0.013342801481485367, + 0.03622204810380936, -0.023042850196361542, -0.026700101792812347, + -0.034892451018095016, -0.028433026745915413, 0.06670085340738297, + 0.013445812277495861, 0.03833755850791931, 0.01010140310972929, + -0.03759188950061798, -0.05855119228363037, 0.00781426765024662, + -0.04906706139445305, 0.03342912718653679, -0.05243462324142456, + 0.040698058903217316, 0.06868159025907516, 0.022752607241272926, + -0.005430352408438921, -0.006812892388552427, -0.04902511462569237, + -0.006551826372742653, -0.03979682922363281, -0.013956423848867416, + -0.06136368587613106, 0.0740000531077385, 0.05603933334350586, + 0.02190348319709301, -0.043786339461803436, -0.0392116904258728, + -0.01866808719933033, 0.01707339473068714, -0.026303859427571297, + -0.01817542500793934, 0.03552285581827164, 0.0276781152933836, + 0.05265122279524803, -0.03358357027173042, -0.020007848739624023, + 0.04865119233727455, 0.02959197200834751, -0.0032693049870431423, + 0.02495887503027916, 0.03446371853351593, -0.011217310093343258, + 
-0.09030335396528244, 0.014422472566366196, -0.008989378809928894, + -0.011282369494438171, 0.049398381263017654, -0.01687331311404705, + 0.025424139574170113, 0.024985041469335556, -0.009084195829927921, + 0.004050575662404299, 0.0007717382395640016, -0.03172731399536133, + -0.017505444586277008, -0.014687484130263329, 0.03803866356611252, + 0.016156280413269997, -0.010017951019108295, -0.026353944092988968, + 0.019050614908337593, -0.03580506518483162, 0.02924525737762451, + 0.02443450316786766, -0.01770329475402832, 0.04581848904490471, + -0.01908605918288231, 0.012714254669845104, 0.08363562822341919, + 0.037286512553691864, -0.003420531051233411, -0.06909038126468658, + -0.0591881163418293, -0.007654525339603424, 0.053144630044698715, + 0.03045589104294777, -0.04600578546524048, 0.026682959869503975, + -0.0019753179512917995, -0.017073772847652435, -0.012071357108652592, + 0.028171954676508904, 0.00024773634504526854, -0.03256797045469284, + -0.09742321819067001, 0.040483273565769196, -0.025031624361872673, + 0.03650636970996857, 0.0011886897264048457, 0.016929153352975845, + 0.054483890533447266, 0.03752107173204422, 0.019491281360387802, + 0.006253591738641262, 0.02451430633664131, -0.05976274237036705, + 0.060739971697330475, -0.04400366172194481, 0.028709039092063904, + -0.02141660451889038, 0.08152823895215988, -0.00450171809643507, + -0.03484562411904335, -0.00046958858729340136, -0.017397938296198845, + 0.07823023945093155, -0.011110293678939342, 0.004524719901382923, + 0.03619854897260666, -0.02478216402232647, -0.011563056148588657, + -0.012815544381737709, -0.03503820300102234, -0.04771020635962486, + -0.030619489029049873, 0.0669066309928894, 0.030025487765669823, + -0.011697783134877682, -0.006708705797791481, -0.0061534675769507885, + 0.0365905724465847, -0.006860053166747093, -0.07040797173976898, + -0.057646144181489944, 0.04284966364502907, -0.01533683855086565, + -0.06859996914863586, 0.009425769560039043, -9.838528785621747e-5, + 0.010796179063618183, -0.06541100144386292, 0.01059884112328291, + -0.028843343257904053, 0.029019653797149658, -0.005446962546557188, + -0.0120149040594697, -0.0471968911588192, 0.008648250252008438, + 0.021961573511362076, -0.010606558993458748, 0.0008718566386960447, + -0.014988702721893787, -0.11522816866636276, -0.023671753704547882, + -0.004968483000993729, 0.0307041984051466, -0.0020613274537026882, + -0.03271760419011116, -0.04547363147139549, -0.00812614057213068, + -0.013890305534005165, 0.048099175095558167, -0.015408700332045555, + 0.06658884882926941, 0.012633614242076874, -0.05337975174188614, + 0.0033035692758858204, 0.03610198199748993, -0.0405871607363224, + 0.008806376717984676, -0.017653945833444595, -0.05865860357880592, + 0.03825455904006958, -0.00478429114446044, -0.04127506911754608, + 0.01231306791305542, 0.0008735111332498491, 0.02923770435154438, + 0.005922738928347826, -0.01829770766198635, -0.00685579888522625, + -0.03903493285179138, 0.009158597327768803, -0.03491708263754845, + 0.04114120453596115, -0.0014327293029055, -0.019274454563856125, + 0.02704671025276184, 0.01738886535167694, -0.02327372133731842, + -0.03135831654071808, -0.01305293757468462, 0.04163745418190956, + 0.01710107922554016, 0.06454417109489441, 0.020267069339752197, + -0.08408207446336746, -0.010505065321922302, -0.0073319086804986, + 0.004039655905216932, -0.01633611135184765, -0.02889716438949108, + -0.0806351900100708, -0.023603465408086777, -0.06304290890693665, + 0.007231221999973059, -0.038828227669000626, 0.014790577813982964, + 
-0.03915632143616676, 0.05616161227226257, 0.00311578088440001, + -0.02434428222477436, 0.006431886460632086, -0.06326194852590561, + -0.0166602972894907, 0.03630464896559715, 0.01622965931892395, + 0.026233987882733345, 0.06605540215969086, -0.05635184794664383, + -0.08930846303701401, -0.05207853391766548, 0.027004040777683258, + -0.031913693994283676, -0.009139630943536758, -0.028410566970705986, + 0.06700566411018372, 0.0423152893781662, -0.010422738268971443, + -0.04085265100002289, 0.029382970184087753, 0.052883222699165344, + 0.02239867113530636, -0.0012815282680094242, 0.014223611913621426, + 0.02597920596599579, -0.015063034370541573, 0.0828455239534378, + 0.03366050869226456, -0.022025907412171364, -0.0019613192416727543, + -0.02539178729057312, 0.0399317741394043, -0.006493750028312206, + -0.0013236093800514936, -0.02036309242248535, -0.0065197269432246685, + -0.030695058405399323, -0.03585388883948326, -0.045742426067590714, + 0.015121972188353539, 0.08081705123186111, 0.007906812243163586, + 0.059827424585819244, -0.04740464314818382, -0.0743480697274208, + -0.025416050106287003, -0.05693193897604942, -0.001481675892136991, + -0.017177585512399673, -0.03311903402209282, 0.022755322977900505, + -0.003895542584359646, 0.02692737802863121, -0.0032731543760746717, + -0.0031116430182009935, -0.030539495870471, -0.006427450571209192, + -0.0015021534636616707, 0.0017666849307715893, -0.03059082292020321, + 0.0005832729511894286, -0.05637278035283089, 0.04087543487548828, + 0.00220437441021204, 0.0021564762573689222, -0.0314127579331398, + -0.025518659502267838, -0.07314060628414154, -0.014426291920244694, + -0.08714891970157623, -0.02331671305000782, 0.013582085259258747, + -0.0025384915061295033, -0.01540769450366497, -0.0110056446865201, + 0.04654880613088608, 0.010653696954250336, 0.0018328200094401836, + 0.007387213874608278, 0.07984212785959244, -0.02893732860684395, + -0.04140201583504677, -0.07618758082389832, -0.00793982483446598, + -0.0377434641122818, 0.032935332506895065, -0.013259266503155231, + 0.02015708014369011, 0.09388656169176102, 0.0017843206878751516, + 0.034253206104040146, -0.017240997403860092, 0.009084933437407017, + -0.048595622181892395, -0.03737767040729523, -0.04036621376872063, + -0.009442481212317944, 0.01705838553607464, -0.03709835931658745, + 0.004579882137477398, -0.02558705396950245, -0.010287507437169552, + -0.00969093106687069, 0.012930587865412235, -0.0026530276518315077, + -0.055302973836660385, -0.0007084248936735094, 0.0027114865370094776, + -0.022337302565574646, 0.049817051738500595, -0.015339787118136883, + -0.01575980708003044, -0.0211472287774086, -0.041779227554798126, + -0.00043109135003760457, -0.002638365374878049, 0.0003785403096117079, + 0.04666115716099739, -0.031109463423490524, 0.03951709344983101, + 0.007409846410155296, 0.032062821090221405, -0.019406728446483612, + -0.03020879067480564, 0.06802312284708023, 0.018488138914108276, + -0.053909264504909515, -0.007893281057476997, 0.02533031813800335, + 0.03132852911949158, -0.053483180701732635, 0.025661734864115715, + 0.002397680189460516, 0.04062856733798981, 0.02830035611987114, + -0.00479720626026392, -0.008729430846869946, -0.0034766148310154676, + 0.03173350542783737, 0.0071125393733382225, -0.03040342777967453, + -0.006032709032297134, -0.06926627457141876, -0.0381772480905056 + ], + "content_embedding": [ + -0.02845170348882675, 0.020628532394766808, 0.003312832210212946, + -0.029765071347355843, 0.016959644854068756, -0.004110109526664019, + 0.054954126477241516, 
-0.03696386516094208, -0.06003747880458832, + -0.016737867146730423, -0.04143841937184334, 0.010524315759539604, + 0.01846286654472351, 0.012900668196380138, -0.01821877434849739, + -0.022333195433020592, 0.016231827437877655, -0.00692401546984911, + -0.009705417789518833, 0.0043431734666228294, -0.01035444438457489, + -0.03510449081659317, -0.01101984828710556, -0.029713021591305733, + 0.08172306418418884, -0.008759400807321072, -0.040999725461006165, + 0.04106973484158516, -0.05768377706408501, -0.008512589149177074, + 0.05944962799549103, -0.012553821317851543, 0.013645646162331104, + -0.02660560794174671, -0.057905036956071854, -0.054687026888132095, + 0.003909541759639978, -0.04956740885972977, -0.042125821113586426, + 0.06187684088945389, 0.06303229182958603, -0.012631679885089397, + -0.004673871211707592, -0.02207319252192974, 0.052802763879299164, + 0.014762785285711288, 0.04115021601319313, -0.006632254458963871, + 0.03773806244134903, -0.03468457981944084, -0.014101233333349228, + 0.013350501656532288, -0.024982236325740814, -0.009867328219115734, + -0.007960042916238308, 0.005127797368913889, 0.002303300891071558, + -0.004433336202055216, 0.03658096119761467, -0.04504770040512085, + 0.027889715507626534, 0.05441499873995781, -0.04908447712659836, + 0.041611816734075546, -0.00782090611755848, -0.05460766702890396, + 0.0005653056432493031, 0.0009949197992682457, 0.013009139336645603, + 0.004702548962086439, -0.0066951001062989235, -0.009612455032765865, + 0.027976926416158676, 0.013144126161932945, -0.009398404508829117, + -0.009249510243535042, 0.02228953316807747, -0.05003415420651436, + -0.03484565392136574, 0.039622433483600616, 0.03127755597233772, + -0.07711455225944519, 0.026068583130836487, -0.03025561012327671, + 0.03434577211737633, 0.02756066806614399, -0.016127552837133408, + 0.0031622813548892736, -0.011191527359187603, 0.10279087722301483, + 0.07508235424757004, 0.014161021448671818, 0.04303860291838646, + 0.02421264722943306, -0.060081642121076584, 0.08023173362016678, + -0.016117870807647705, -0.040795255452394485, -0.006737033370882273, + -0.02539793588221073, -0.005812298972159624, 0.027351481840014458, + 0.02652551420032978, 0.034308984875679016, 0.07952407002449036, + 0.012120738625526428, -0.002102786907926202, 0.02581837773323059, + 0.0036945617757737637, 0.03335866704583168, -0.05533025786280632, + -0.029806576669216156, 0.014511525630950928, 0.028494026511907578, + -0.028353745117783546, -0.0015628034016117454, 0.0542825423181057, + -0.06842301040887833, 0.013071774505078793, 0.035904042422771454, + -0.060427047312259674, -0.010712354443967342, -0.010741145350039005, + 0.00589279318228364, 0.03916572034358978, 0.0011838098289445043, + -0.04358551278710365, -0.02426866628229618, -0.02629699930548668, + -0.016508616507053375, 0.0038987575098872185, 0.00010461249621585011, + -0.06473322957754135, 0.027538873255252838, 0.03787471354007721, + 0.024383891373872757, -0.04171127453446388, -0.03238093852996826, + 0.007360804360359907, -0.014501902274787426, 0.014242740347981453, + -0.0012311796890571713, -0.013716178946197033, -0.009915472939610481, + 0.026615049690008163, -0.07398053258657455, 0.0030485496390610933, + 0.025813661515712738, -0.022065768018364906, 0.0349227599799633, + 0.0045135351829230785, -0.053763143718242645, -0.013968654908239841, + 0.016600387170910835, 0.029198968783020973, -0.03825172409415245, + -0.03900526836514473, 0.02822844497859478, 0.052716661244630814, + -0.00427692336961627, 0.029389938339591026, 0.01127107534557581, + 
-0.02288925088942051, 0.06506737321615219, -0.011876849457621574, + -0.009232635609805584, 0.059180255979299545, 0.060491811484098434, + 0.04768436402082443, 0.04782063513994217, -0.007591789122670889, + -0.012142209336161613, -0.00854392908513546, -0.03645598515868187, + 0.02366817742586136, 0.028424806892871857, 0.03254731744527817, + -0.0650848001241684, 0.05803924798965454, -0.006124107167124748, + 0.007514724973589182, -0.06995245814323425, 0.03610721975564957, + -0.025534681975841522, -0.047099191695451736, 0.0024543125182390213, + 0.013705895282328129, -0.08660408854484558, 0.013458521105349064, + -0.05938595533370972, 0.025314588099718094, -0.06279927492141724, + -0.008528811857104301, -0.04051665961742401, -0.02572588622570038, + -0.05028638243675232, 0.029650729149580002, 0.03656933456659317, + 0.027842504903674126, -0.017784448340535164, -0.06566111743450165, + -0.016097936779260635, -0.07754653692245483, 0.02611452341079712, + -0.012319186702370644, 0.03830364719033241, 0.05927351489663124, + -0.0005797847989015281, 0.05858585610985756, 0.013468705117702484, + 0.08553440123796463, 0.010187739506363869, -0.023877883329987526, + 0.027608737349510193, -0.04135579988360405, -0.004526825156062841, + 0.01695535145699978, -0.043227668851614, -0.03456792235374451, + 0.06477289646863937, 0.031624119728803635, -0.04087601602077484, + 0.0010430653346702456, 0.017958510667085648, 0.009248117916285992, + 0.010219916701316833, -0.05485055223107338, -0.01347501389682293, + -0.015884561464190483, -0.008806952275335789, -0.04478437826037407, + -0.09141774475574493, 0.07184188067913055, 0.02080371417105198, + 0.03414024040102959, 0.02681431546807289, -0.02171824313700199, + 0.023230157792568207, 0.0034705817233771086, 0.023832201957702637, + 0.04260754957795143, -0.023710861802101135, 0.017519451677799225, + -0.023114347830414772, -0.07241662591695786, 0.043135177344083786, + -0.03519831597805023, 0.01728164590895176, -0.007306656800210476, + 0.029966725036501884, 0.005133960861712694, 0.010730396956205368, + 0.014178331010043621, 0.02290872484445572, 0.04147600382566452, + -0.0711970180273056, 0.011968120001256466, 0.012014097534120083, + -0.00941413827240467, -0.048221614211797714, 0.02721494808793068, + -0.036967791616916656, 0.03305060788989067, -0.023104682564735413, + -0.0021078407298773527, 0.01056760549545288, 0.003825176041573286, + -0.02744617499411106, -0.011484067887067795, -0.019249368458986282, + 0.012087506242096424, 0.016815317794680595, 0.008888418786227703, + 0.09483875334262848, 0.005403030198067427, -0.006538084242492914, + -0.00812787376344204, 0.010251098312437534, 0.025141935795545578, + -0.016502706333994865, -0.07583127170801163, -0.059361476451158524, + -0.03975491225719452, 0.005571363028138876, 0.025980276986956596, + 0.06575164198875427, -0.022391004487872124, -0.0014668750809505582, + 0.0309857539832592, -0.01333113107830286, 0.024967554956674576, + 0.008801382035017014, 0.004801678936928511, 0.011097696609795094, + -0.02484068274497986, 0.07475821673870087, 0.06004296988248825, + 0.008063849061727524, -0.007297527976334095, -0.0225421991199255, + -0.020057078450918198, 0.04824424162507057, 0.019114485010504723, + -0.024972988292574883, -0.013590608723461628, -0.026848217472434044, + 0.013158710673451424, 0.03205424174666405, 0.06152794137597084, + 0.06059333309531212, -0.03528637811541557, 0.03574252501130104, + 0.011881774291396141, -0.014821416698396206, -0.03766554221510887, + 0.02960871160030365, 0.043620847165584564, -0.0008511713240295649, + 
-0.012452763505280018, -0.008131926879286766, 0.02682739496231079, + -0.027776895090937614, -0.017724614590406418, -0.074460469186306, + 0.007388352416455746, 0.052085623145103455, 0.005944994743913412, + 0.05980602651834488, 0.004958854056894779, 0.004315464291721582, + 0.009470906108617783, 0.08363069593906403, -0.06266297399997711, + -0.02252691425383091, -0.0047216094098985195, 0.01363289449363947, + 0.00803599413484335, 0.017915869131684303, 0.01683986745774746, + 0.0054694474674761295, 0.00690553430467844, -0.033289551734924316, + -0.041059620678424835, -0.05957230553030968, -0.07026804983615875, + 0.0026692922692745924, 0.01784949004650116, -0.0003522790502756834, + 0.044131647795438766, 0.05823194235563278, -0.01914701797068119, + 0.012501074001193047, -0.0076722633093595505, -0.040374498814344406, + 0.010002685710787773, -0.006841403432190418, 0.024179449304938316, + 0.01219668984413147, -0.005650076549500227, 0.010313056409358978, + 0.0951242670416832, -0.012245064601302147, 0.02261069230735302, + -0.02354615181684494, 0.04120791703462601, 0.03377315774559975, + 0.03468254953622818, 0.033613745123147964, 0.03305840864777565, + -0.033862534910440445, 0.034367989748716354, -0.022536078467965126, + 0.04874858632683754, 0.0415329709649086, 0.06666682660579681, + 0.0036932802759110928, -0.09809356927871704, -0.0017088145250454545, + 0.0024299651850014925, -0.018637090921401978, 0.06801039725542068, + 0.02409985661506653, 0.009735392406582832, -0.02749275043606758, + 0.030437270179390907, -0.00898370798677206, -0.020128484815359116, + -0.009687880054116249, 0.01668565906584263, -0.04497738555073738, + -0.019772959873080254, -0.022632960230112076, -0.02253716252744198, + 0.010271976701915264, -0.00696501974016428, 0.030849630013108253, + -0.04239751771092415, 0.03944450616836548, -0.013430316932499409, + 0.032022625207901, -0.05952562019228935, -0.043423160910606384, + -0.0024594010319560766, -0.0009159342152997851, -0.01097820233553648, + 0.02963317185640335, -0.02188674546778202, -0.048581305891275406, + 0.03341398760676384, -0.011065436527132988, 0.047042280435562134, + 0.04042183235287666, -0.02600206807255745, -0.05695529654622078, + 0.05499875172972679, -0.03984459489583969, 0.0649806335568428, + 0.02108422853052616, -0.07841797918081284, -0.02946053445339203, + -0.01747039519250393, -0.013214156962931156, 0.008581981062889099, + -0.0022455912549048662, -0.022153383120894432, 0.011744811199605465, + 0.017487658187747, 0.008825760334730148, 0.025580866262316704, + 0.0082536730915308, 0.03269948437809944, 0.012900054454803467, + 0.04077104851603508, 0.0378829762339592, -0.06819288432598114, + 0.02784581482410431, 0.06434649229049683, 0.03351795673370361, + 0.011211752891540527, 0.01894824393093586, 0.004370532464236021, + -0.014345336705446243, 0.006097803357988596, -0.08543102443218231, + -0.02757532149553299, 0.06072119623422623, 0.10378460586071014, + 0.009428516030311584, -0.04370144382119179, -0.01193047035485506, + 0.04444314166903496, 0.011696353554725647, -0.030421355739235878, + -0.014331959187984467, 0.04900198057293892, 0.027842359617352486, + -0.009193948470056057, -0.06911113113164902, -0.011863719671964645, + 0.035546496510505676, 0.022603457793593407, 0.017083479091525078, + 0.015593679621815681, 0.021234845742583275, 0.004202473908662796, + -0.0537634901702404, 0.04333661124110222, -0.025838447734713554, + -0.024251429364085197, 0.03204401955008507, -0.002707387087866664, + 0.03231189027428627, -0.0030726506374776363, 0.024067431688308716, + -0.03183548524975777, 
0.007890576496720314, 0.054388418793678284, + -0.017090266570448875, 0.01572081632912159, 0.04539618268609047, + 0.01960766315460205, 0.009962057694792747, -0.06782109290361404, + 0.03449762612581253, -0.004095849581062794, 0.011451792903244495, + 0.01291556004434824, 0.009039049968123436, -0.023024702444672585, + -0.019358525052666664, 0.004383507184684277, 0.01303566712886095, + 0.008749599568545818, -0.0019463954959064722, -0.05059671401977539, + -0.03929319232702255, -0.038611579686403275, -0.0004830050456803292, + 0.03260226547718048, -0.0484665147960186, 0.04194365814328194, + 0.028087828308343887, 0.0015577803133055568, -0.0020338960457593203, + 0.00992380827665329, 0.029156159609556198, 0.013400931842625141, + -0.08322624862194061, 0.03542347997426987, 0.029148025438189507, + 0.03978026285767555, 0.012040375731885433, -0.015034311451017857, + 0.016610056161880493, 0.030412640422582626, -0.051336318254470825, + 0.057814277708530426, -0.009117085486650467, -0.055816203355789185, + 0.05705805867910385, -0.013596060685813427, 0.07485361397266388, + 0.04783453419804573, 0.03237048164010048, -0.1252431571483612, + -0.024511752650141716, 0.009948192164301872, -0.001071324571967125, + 0.03724309056997299, 0.0011626302730292082, -0.024294020608067513, + 0.09550263732671738, 0.001125041046179831, -0.032950934022665024, + 0.03814041614532471, -0.015817970037460327, -0.042719099670648575, + 0.02066672407090664, 0.03320891037583351, 0.04236403852701187, + 0.026531731709837914, -0.026540761813521385, 0.018515214323997498, + -0.0034993020817637444, -0.019887158647179604, -0.03968263417482376, + -0.048776015639305115, 0.058169975876808167, -0.033802736550569534, + -0.06506575644016266, 0.06649087369441986, -0.0008670967072248459, + 0.03118874505162239, -0.04512554034590721, 0.017889900133013725, + -0.0688585713505745, -0.005668118130415678, -0.04627913981676102, + -0.005911638960242271, -0.004173378925770521, 0.030776705592870712, + -0.003502912586554885, 0.06485340744256973, -0.002432354027405381, + 0.016381509602069855, -0.11775369197130203, -0.03817284479737282, + -0.0002618363650981337, 0.020635321736335754, -0.00496688112616539, + -0.025879040360450745, -0.03508525341749191, 0.03343107923865318, + 0.0065256268717348576, 0.03733794391155243, -0.05374712869524956, + 0.06329557299613953, -0.012232488952577114, -0.07295915484428406, + -0.03410065174102783, -0.036963820457458496, -0.040072306990623474, + 0.08864720910787582, -0.025265397503972054, 0.006373850163072348, + 0.03589979186654091, -0.04953531548380852, -0.014813058078289032, + 0.03135038912296295, -0.045539740473032, -0.013331865891814232, + -0.0077253468334674835, 0.02402244322001934, -0.01973932422697544, + -0.07107116281986237, 0.029835300520062447, 0.027613440528512, + 0.027596186846494675, 0.0323333777487278, -0.059915486723184586, + -0.03399650380015373, -0.011191067285835743, 0.04654889553785324, + 0.02089870348572731, -0.02999742142856121, 0.03440370410680771, + 0.007507022004574537, 0.040571920573711395, -0.03278252109885216, + -0.06050700321793556, 0.021276405081152916, -0.016155162826180458, + 0.00010897620086325333, -0.0010203487472608685, -0.03746471554040909, + -0.025609171018004417, 0.005009123589843512, -0.08643782883882523, + 0.031217029318213463, -0.0032753287814557552, -0.025097588077187538, + -0.03903978690505028, 0.04100614786148071, -0.015351627953350544, + -0.027274904772639275, 0.03702486678957939, -0.08083852380514145, + 0.005576241761445999, -0.006957546342164278, 0.09723483771085739, + 0.018242431804537773, 
0.05415903404355049, -0.06101224571466446, + -0.025462020188570023, -0.05338318645954132, -0.014500913210213184, + 0.017682785168290138, -0.017082780599594116, 0.028426188975572586, + 0.039222750812768936, 0.02545572631061077, -0.03942421078681946, + -0.06022893264889717, 0.04819706454873085, 0.06749513745307922, + 0.01035595778375864, 0.011470649391412735, -0.0024080125149339437, + -0.033233992755413055, 0.028164777904748917, 0.0553852841258049, + 0.06263226270675659, -0.016413886100053787, 0.021145500242710114, + 0.01179521344602108, 0.033495236188173294, -0.013982322998344898, + 0.015615278854966164, -0.04715987294912338, 0.029921408742666245, + 0.00394752062857151, -0.028457310050725937, 0.018736237660050392, + -0.02897241711616516, -0.006964333355426788, 0.001459103194065392, + 0.020680002868175507, -0.045486945658922195, -0.0186879001557827, + 0.006018372252583504, -0.005646763369441032, 0.031949128955602646, + 0.015718143433332443, -0.0680442824959755, -0.040316201746463776, + 0.027103520929813385, 0.007122257724404335, -0.02554631046950817, + -0.015084164217114449, -0.05808757618069649, -0.01925673894584179, + -0.030870718881487846, 0.04750153049826622, -0.05464727431535721, + -0.03634507954120636, -0.022956492379307747, -0.001869303290732205, + 0.009947973303496838, -0.0452248640358448, -0.05064425989985466, + 0.0033088859636336565, 0.032645225524902344, 0.011128626763820648, + -0.07038814574480057, -0.00398398470133543, -0.029936205595731735, + -0.037302739918231964, -0.026515178382396698, -0.005808456335216761, + 0.011362303048372269, 0.013931548222899437, 0.053531426936388016, + -0.017702942714095116, 0.04049023985862732, 0.0490780733525753, + -0.0034894803538918495, 0.0072046807035803795, 0.05128946155309677, + -0.02248883992433548, -0.016904350370168686, -0.007111264392733574, + 0.013148962520062923, 0.04168686643242836, -0.011360356584191322, + -0.01462612859904766, 0.036358367651700974, -0.000562329194508493, + -0.037175074219703674, 0.002623690525069833, -0.04284300655126572, + 0.045784976333379745, 0.017441436648368835, -0.09851012378931046, + 0.06095545366406441, -0.0374034121632576, -0.02720695361495018, + -0.007703973911702633, 0.03689894452691078, 0.008105777204036713, + 0.019800135865807533, -0.02071499079465866, -0.022336198017001152, + -0.009109229780733585, 0.02298821695148945, -0.04302436485886574, + -0.023912018164992332, 0.007846477441489697, -0.04115860536694527, + -0.052512455731630325, -0.0262643713504076, 0.00893806479871273, + -0.032936349511146545, -0.015261095948517323, 0.005558508913964033, + -0.008528356440365314, -0.023201653733849525, -0.056550297886133194, + 0.025247847661376, 0.04831540212035179, -0.019267458468675613, + -0.03474835306406021, -0.001712511875666678, -0.04063638299703598, + -0.01707102544605732, -0.01384702231734991, -0.0023981977719813585, + 0.03153757378458977, 0.030312344431877136, 0.040820326656103134, + -0.013783353380858898, 0.012157448567450047, -0.015558870509266853, + -0.056085314601659775, 0.03875841945409775, 0.021351536735892296, + -0.021598603576421738, -0.01058109663426876, 0.001237297197803855 + ], + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/enterprise_search", + "title": "Enterprise Search", + "content": "Value of Enterprise Search with Danswer\n\nWhat is Enterprise Search and why is it Important?\nAn Enterprise Search system gives team members a single place to access all of the disparate knowledge of an organization. 
Critical information is saved across a host of channels like call transcripts with prospects, engineering design docs, IT runbooks, customer support email exchanges, project management tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\nSince it quickly becomes infeasible to check across every source, decisions get made on incomplete information, employee satisfaction decreases, and the most valuable members of your team are tied up with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this problem by letting anyone on the team access all of the knowledge across your organization in a permissioned and secure way. Users can ask questions in natural language and get back answers and documents across all of the connected sources instantly.\n\nWhat's the real cost?\nA typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of incomplete or incorrect information can be extremely high. Customer support/success that isn't able to find the reference to similar cases could cause hours or even days of delay leading to lower customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar feature had previously been built could result in weeks of wasted development time and tech debt with duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark - inefficient and mistake prone.", + "title_embedding": [ + -0.011060578748583794, 0.05994883179664612, 0.008845113217830658, + 0.011364905163645744, 0.03147757425904274, 0.04506697878241539, + -0.025942707434296608, -0.011002028360962868, -0.03507396578788757, + -0.01727098599076271, -0.016820134595036507, 0.04671240597963333, + 0.023456331342458725, -0.005752791650593281, -0.011421029455959797, + -0.04169125109910965, 0.0652366429567337, -0.011136278510093689, + -0.013501451350748539, -0.006273901090025902, 0.0012236927868798375, + -0.030771249905228615, 0.010098040103912354, -0.02360220067203045, + 0.006734110414981842, 0.001379420980811119, -0.0047225081361830235, + -0.013901330530643463, 0.014645840041339397, -0.02156134508550167, + -0.026707857847213745, -0.00676271365955472, 0.056067030876874924, + -0.0455806590616703, -0.02762053906917572, -0.0965149849653244, + 0.04567359760403633, 0.053895801305770874, 0.029093541204929352, + 0.0199823547154665, 0.047089505940675735, 0.026028119027614594, + -0.0034626282285898924, -0.021002190187573433, 0.04217635095119476, + -0.015198000706732273, 0.047393105924129486, 0.008588545024394989, + 0.07058116048574448, -0.09135723114013672, -0.009591161273419857, + 0.014971816912293434, -8.222273208957631e-7, -0.05534408614039421, + -0.029965048655867577, -0.028381407260894775, 0.025547217577695847, + 0.038583844900131226, 0.01984122209250927, -0.02435469813644886, + 0.04955849424004555, 0.0128632802516222, -0.022573867812752724, + 0.025284791365265846, 0.04496009647846222, 0.0004200333496555686, + -0.0015001222491264343, 0.02609623409807682, 0.023677939549088478, + -0.05961468443274498, 0.06799754500389099, -0.014409428462386131, + -0.010374268516898155, 0.019634529948234558, -0.03720257058739662, + -0.04654879495501518, 0.0056068566627800465, -0.021244503557682037, + -0.03198164328932762, 0.05707596614956856, 0.043927326798439026, + -0.033356692641973495, 0.015723733231425285, -0.027493512257933617, + 
0.04525380581617355, 0.025808652862906456, -0.007083983160555363, + -0.011038954369723797, -0.004390218295156956, -0.006583297159522772, + 0.003319315379485488, 0.061810243874788284, 0.05887124314904213, + 0.07722297310829163, -0.06601747870445251, 0.036486171185970306, + 0.05119618400931358, 0.00945530366152525, -0.03005358576774597, + -0.047870855778455734, -0.0359003059566021, -0.005699747242033482, + 0.053807053714990616, -0.001554036163724959, 0.060548700392246246, + 0.05476829782128334, 0.00426551653072238, -0.005215689539909363, + -0.0352163203060627, -0.03284529969096184, -0.03373449295759201, + 0.021254340186715126, -0.010481598787009716, 0.02651101164519787, + -0.00481840968132019, 0.045160870999097824, 0.09704204648733139, + -0.03473421558737755, 0.015584945678710938, 0.06579536944627762, + 0.0651017278432846, 0.007380738854408264, 0.00624364148825407, + 0.07893780618906021, -0.019876087084412575, 0.006619091611355543, + -0.030776498839259148, -0.016426965594291687, 0.014603338204324245, + -0.03326896205544472, 0.003784433240070939, 0.025205042213201523, + 0.03047170303761959, -0.03364298865199089, 0.005974944215267897, + 0.015994269400835037, -0.10205432027578354, -0.026497431099414825, + 0.07166463136672974, -0.007370935752987862, 0.034912627190351486, + -0.004656887147575617, 0.03219066560268402, -0.02239271067082882, + 0.012679396197199821, -0.07867992669343948, -0.026638884097337723, + -0.008346030488610268, -0.027243634685873985, 0.012043806724250317, + 0.024078860878944397, 0.0006219774950295687, 0.0032065671402961016, + -0.008564895950257778, -0.03275461867451668, -0.05791788548231125, + -0.044815272092819214, 0.006680188700556755, 0.04072298854589462, + 0.016144724562764168, -0.0008583687013015151, 0.03699830546975136, + -0.04675738513469696, 0.06570404022932053, -0.011776894330978394, + -0.011386243626475334, -0.0003363603027537465, 0.039485324174165726, + 0.014156803488731384, 0.018139634281396866, -0.014710970222949982, + -0.052651647478342056, 0.02912742830812931, 0.024101730436086655, + 0.0413704477250576, 0.03631320223212242, 0.046750932931900024, + -0.017062805593013763, 0.017990263178944588, -0.03954370319843292, + -0.006972718983888626, -0.03784368559718132, 0.02746269851922989, + 0.04107878357172012, -0.005694024730473757, -0.03896583244204521, + 0.026265999302268028, -0.035318680107593536, -0.018394622951745987, + 0.013594037853181362, 0.0381510891020298, -0.010223937220871449, + 0.032417282462120056, 0.01610656827688217, -0.013205642811954021, + -0.03757423907518387, 0.03799910470843315, -0.039449408650398254, + -0.011290505528450012, -0.016824476420879364, 0.007128347177058458, + 0.030213234946131706, -0.09385695308446884, 0.014417118392884731, + -0.021249795332551003, -0.021371016278862953, 0.031582340598106384, + -0.015021033585071564, 0.03207740932703018, 0.04465494304895401, + 0.051139406859874725, -0.004539252258837223, -0.004026447422802448, + 0.036198731511831284, 0.002513982355594635, -0.022555746138095856, + -0.023142442107200623, 0.026506206020712852, -0.0208470169454813, + 0.01958189532160759, 0.02606782503426075, -0.050900157541036606, + 0.001175468903966248, 0.0026185859460383654, 0.01644700951874256, + 0.047048378735780716, -0.006155692040920258, 0.013264120556414127, + -0.004277337808161974, 0.022337032482028008, -0.030710609629750252, + -0.06784506887197495, 0.010662010870873928, -0.020733945071697235, + -0.01206474844366312, -0.0005046974983997643, -0.004159707576036453, + 0.028128545731306076, -0.011551725678145885, 0.057953692972660065, + 
0.028500419110059738, 0.02070418931543827, 0.029373178258538246, + -0.053878165781497955, -0.03885475918650627, -0.011427262797951698, + -0.040592946112155914, 0.019192807376384735, -0.013966009952127934, + 0.002324307570233941, 0.027266129851341248, 0.02721570059657097, + -0.013851913623511791, 0.06292124837636948, -0.019983768463134766, + -0.06498263776302338, -0.014787066727876663, 0.07545251399278641, + 0.009921795688569546, -0.02266773208975792, -0.0174646507948637, + -0.0037801002617925406, 0.037214070558547974, -0.033669255673885345, + -0.0033054312225431204, -0.004362864885479212, -0.010861773043870926, + -0.041649043560028076, 0.02711806818842888, -0.001099557732231915, + 0.0007163260015659034, 0.01317980233579874, -0.011158796027302742, + 0.03966476768255234, 0.023275790736079216, -0.011645027436316013, + 0.0030634249560534954, -0.01243121363222599, 0.01271719578653574, + 0.003938829991966486, -0.00769989937543869, -0.039121564477682114, + 0.0005735178128816187, 0.02157283015549183, 0.005828005261719227, + 0.03934130072593689, 0.015216797590255737, 0.017237801104784012, + -0.037648268043994904, -0.007132838945835829, -0.018956882879137993, + -0.0597093440592289, 0.058341480791568756, -0.0008284997311420739, + 0.02095271646976471, 0.043099164962768555, 0.09887702018022537, + -0.01221393421292305, -0.02239784225821495, 0.016775032505393028, + 0.013331425376236439, -0.004451168701052666, -0.02870352193713188, + -0.020854242146015167, 0.05349724739789963, 0.03315908834338188, + 0.018541062250733376, -0.03136591613292694, 0.03549784794449806, + -0.0076525891199707985, -0.06454484909772873, 0.049847088754177094, + 0.012184737250208855, 0.03575005754828453, -0.050804175436496735, + 0.09406977146863937, 0.05103312432765961, -0.0036910600028932095, + 0.10705005377531052, 0.011394658125936985, -0.014218435622751713, + -0.042272791266441345, 0.018426422029733658, -0.08213183283805847, + -0.010240674018859863, 0.051353540271520615, 0.016103247180581093, + 0.04293083772063255, -0.00462630670517683, 0.001971749123185873, + -0.05101824551820755, -0.017815101891756058, -0.0788436159491539, + -0.019784294068813324, 0.006863154470920563, 0.04096531495451927, + 0.016416994854807854, 0.018884461373090744, -0.03645262494683266, + -0.02363709919154644, 0.08447448164224625, 0.027652855962514877, + -0.005039512179791927, -0.05533800646662712, 0.006148343440145254, + -0.03248206898570061, -0.015117023140192032, -0.056908346712589264, + 0.057090409100055695, 0.02987913228571415, -0.0642392635345459, + -0.01212853193283081, -0.04195745661854744, -0.008033841848373413, + -0.05249612778425217, 0.05965931713581085, 0.08591161668300629, + -0.012983623892068863, 0.002055486897006631, -0.002928174799308181, + -0.023014886304736137, -0.05307631567120552, 0.0325687900185585, + -0.008586175739765167, -0.005393583793193102, 0.009566529653966427, + 0.06500132381916046, -0.02100509963929653, -0.018470296636223793, + 0.001247459789738059, 0.007388024125248194, 0.012469757348299026, + 0.08475572615861893, 0.06918514519929886, 0.054265547543764114, + 0.03292711451649666, -0.08437038213014603, 0.07744771242141724, + -0.0004291488730814308, -0.020394261926412582, 0.039096955209970474, + 0.015851527452468872, -0.009922537952661514, 0.02087295800447464, + -0.019477976486086845, -0.06510577350854874, 0.008559669367969036, + 0.015032066963613033, -0.022979427129030228, -0.017166415229439735, + -0.014456263743340969, -0.034205030649900436, -0.04903494939208031, + 0.073653943836689, -0.041798241436481476, 0.0035302129108458757, 
+ 0.031043095514178276, 0.038764648139476776, 0.03582717105746269, + -0.003121789079159498, 0.03909862041473389, -0.03283870965242386, + 0.06343409419059753, 0.085169717669487, 0.0037416887935250998, + 0.043896209448575974, -0.02215113304555416, -0.04062772914767265, + -0.029482074081897736, 0.0013964198296889663, 0.04621904715895653, + 0.030072476714849472, -0.023583346977829933, -0.016047311946749687, + -0.04016166180372238, -0.026690224185585976, 0.034725841134786606, + -0.08011004328727722, -0.023635270074009895, -0.01675681211054325, + 0.02217511460185051, -0.018720457330346107, 0.0413116030395031, + -0.0045730252750217915, -0.08402986079454422, 0.03641941770911217, + 0.028000695630908012, 0.042173273861408234, 0.024761751294136047, + -0.051845893263816833, -0.07877497375011444, -0.020710380747914314, + -0.035789184272289276, 0.04824375733733177, -0.04493764415383339, + -0.0014088008319959044, 0.09272980690002441, -0.030772028490900993, + 0.027623610571026802, -0.0008853759500198066, -0.015347420237958431, + -0.0006863650633022189, 0.02924676053225994, 0.03864092007279396, + -0.043402496725320816, 0.11410719156265259, 0.01606914773583412, + 0.03158045932650566, -0.049648500978946686, -0.026801105588674545, + 0.013934214599430561, -0.04582132399082184, -0.02133217453956604, + 0.013296819292008877, 0.030687933787703514, 0.0014671665849164128, + 0.005454834550619125, -0.024595070630311966, 0.036868833005428314, + -0.003586424048990011, -0.007300499361008406, 0.00619609747081995, + 0.004614396020770073, 0.06406176835298538, 0.010256785899400711, + -0.050202082842588425, -0.013155301101505756, -0.04005127400159836, + -0.027943719178438187, 0.05738724395632744, -0.002920332597568631, + -0.00731270294636488, 0.04419538751244545, 0.024069754406809807, + 0.012176074087619781, 0.004615467507392168, -0.04112132638692856, + -0.04844773933291435, -0.012684458866715431, 0.0071298484690487385, + -0.010914848186075687, -0.03592529892921448, -0.05016973987221718, + -0.011797907762229443, -0.043843258172273636, -0.03715396672487259, + 0.016528192907571793, 0.024301515892148018, 0.01335576456040144, + 0.021006477996706963, -0.021391959860920906, 0.05299517139792442, + 0.0070807491429150105, -0.08096124231815338, -0.07334060221910477, + -0.034530941396951675, -0.04421507194638252, 0.010524646379053593, + 0.009575314819812775, -0.031711090356111526, 0.023479584604501724, + -0.04212309420108795, 0.016264619305729866, 0.03907531499862671, + -0.0011187525233253837, -0.03998023644089699, -0.027464834973216057, + -0.07113838940858841, -0.028915319591760635, -0.01282753050327301, + -0.0033073138911277056, 0.026715606451034546, -0.002769897459074855, + 0.020033732056617737, 0.014502385631203651, -0.017903830856084824, + 0.06932531297206879, 0.0432068407535553, 0.01685408502817154, + 0.04834728315472603, -0.009553197771310806, 0.019799189642071724, + 0.01173039898276329, 0.04158413037657738, -0.018829666078090668, + -0.008410722948610783, 0.008009687066078186, 0.034592460840940475, + 0.07790639251470566, -0.022050900384783745, 0.04081638529896736, + 0.046872470527887344, 0.0010260086273774505, -0.05322079360485077, + 0.009096509777009487, -0.06831686198711395, -0.01390997413545847, + -0.020475609228014946, 0.017393099144101143, -0.007532020565122366, + -0.06435851007699966, -0.014785194769501686, 0.02654031664133072, + 0.004753720946609974, 0.026440177112817764, -0.028890414163470268, + -0.011440729722380638, 0.003554105758666992, -0.0022926912643015385, + -0.02393224649131298, 0.03711748123168945, 
-0.06023703143000603, + -0.008778683841228485, -0.05984162166714668, -0.024247022345662117, + -0.036919932812452316, 0.05249374359846115, 0.03022468276321888, + -0.011348876170814037, 0.0008303995127789676, 0.001597013440914452, + -0.015491127036511898, -0.035073015838861465, -0.024477796629071236, + -0.030328145250678062, -0.09301470220088959, -0.046440113335847855, + 0.036719027906656265, -0.021899227052927017, 0.04666316881775856, + -0.07481305301189423, -0.04928148165345192, -0.01480096485465765, + 0.0014140848070383072, 0.016779841855168343, -0.04318199306726456, + 0.011910341680049896, -0.04019855335354805, -0.027363713830709457, + 0.006433602888137102, 0.023732252418994904, -0.013081788085401058, + 0.02489032782614231, 0.005415213759988546, -0.058724161237478256, + 0.032487478107213974, -0.014332194812595844, -0.020952431485056877, + 0.055405858904123306, -0.02239573374390602, 0.016315918415784836, + 0.04710645601153374, 0.006866136100143194, -0.019589263945817947, + -0.046199049800634384, 0.04977096989750862, -0.03211359679698944, + 0.06759121268987656, -0.007805021945387125, 0.009877636097371578, + -0.003194598713889718, -0.0014034705236554146, 0.024012917652726173, + 0.0007609894964843988, 0.04028927534818649, 0.047299597412347794, + 0.04644732549786568, 0.06253348290920258, -0.03101237863302231, + -0.04797065258026123, -0.02459110878407955, -0.06663094460964203, + -0.012946722097694874, -0.046321313828229904, -0.03617801144719124, + -0.06608668714761734, 0.01371682621538639, -0.040183935314416885, + 0.027353622019290924, -0.013125114142894745, 0.020482128486037254, + -0.10186963528394699, 0.03741387277841568, -0.048566944897174835, + 0.0017904090927913785, 0.0444694422185421, -0.02355058304965496, + -0.04245513305068016, 0.01599632203578949, -0.00974870752543211, + -0.02246273122727871, 0.011107604950666428, -0.006354854442179203, + -0.08260829746723175, -0.054969724267721176, -0.038703542202711105, + -0.02590899169445038, -0.012424441985785961, 0.033952418714761734, + 0.032632969319820404, 0.03585505858063698, -0.027734532952308655, + -0.05185376852750778, 0.005663866177201271, 0.01415393128991127, + 0.007472912315279245, -0.0325092077255249, -0.0008526426972821355, + 0.05909401550889015, -0.006496420595794916, 0.06674317270517349, + 0.06033811718225479, -0.04705937206745148, 0.01221691444516182, + -0.005195186473429203, 0.017006050795316696, 0.015768419951200485, + -0.02346021682024002, -0.04318040981888771, -0.00965888798236847, + -0.012831253930926323, -0.023086808621883392, -0.043478451669216156, + 0.02215973101556301, 0.01018955372273922, -0.0029477940406650305, + -0.026364397257566452, -0.04219489544630051, -0.0690244510769844, + 0.0017003740649670362, -0.03498053178191185, -0.01891854591667652, + -0.020380523055791855, -0.07183944433927536, 0.01474913302809, + 0.012818068265914917, 0.02298390306532383, 0.006645163521170616, + -0.014497633092105389, -0.05751577392220497, -0.01127719134092331, + 0.014469895511865616, 0.039319343864917755, -0.002891098614782095, + 0.0038161359261721373, -0.0176107045263052, -0.02695712260901928, + 0.023520348593592644, 0.053624920547008514, -0.0472102165222168, + -0.021724319085478783, -0.04204733297228813, 0.004941252060234547, + -0.07744265347719193, -0.028974706307053566, -6.1493665270973e-5, + -0.020630594342947006, -0.014794640243053436, -0.045572925359010696, + 0.03233763575553894, 0.00969443004578352, 0.03665856271982193, + 0.027483846992254257, 0.074271060526371, -0.07454165071249008, + -0.034101732075214386, -0.07216823101043701, 
-0.001424514572136104, + -0.0025912360288202763, -0.002444307319819927, -0.012540637515485287, + 0.009027975611388683, 0.06855443120002747, -0.0013480151537805796, + 0.027303414419293404, -0.019723499193787575, 0.033644214272499084, + -0.04313155263662338, -0.016152892261743546, -0.020085612311959267, + 0.029526935890316963, 0.0004591972683556378, -0.013712934218347073, + 0.015895912423729897, -0.046559300273656845, -0.00015638815239071846, + 0.0015497541753575206, -0.0015048328787088394, 0.06692329794168472, + 0.0013934546150267124, 0.008921030908823013, -0.010347972624003887, + -0.039798807352781296, 0.06892028450965881, 0.021145053207874298, + 0.007431029342114925, -0.05281573906540871, 0.015844792127609253, + 0.014578046277165413, -0.0020482230465859175, 0.03509555384516716, + -0.021227506920695305, -0.03619229048490524, 0.004116897005587816, + 0.02835669554769993, -0.0028248224407434464, 0.00836214143782854, + -0.004688165616244078, 0.04566347226500511, -0.0352579727768898, + -0.007859165780246258, -0.003958444111049175, 0.023938892409205437, + 0.04262895882129669, -0.02836589328944683, 0.0456448458135128, + -0.062015753239393234, 0.03518408164381981, 0.06333593279123306, + -0.0155468275770545, 0.013991734012961388, 0.02207978442311287, + 0.0032898876816034317, 0.05948015674948692, 0.010670959949493408, + -0.00624996330589056, -0.04401599243283272, -0.0022705462761223316 + ], + "content_embedding": [ + -0.02403288148343563, 0.08599621057510376, -0.003520619124174118, + -0.002186001278460026, -0.002845448674634099, 0.005484029185026884, + 0.016410797834396362, -0.02119613252580166, -0.04811510443687439, + -0.013274849392473698, -0.043441254645586014, 0.009376521222293377, + 0.019551504403352737, 0.0036566888447850943, 0.004842979833483696, + -0.006052782759070396, 0.025645455345511436, -0.012315846979618073, + -0.027703408151865005, 0.02152254432439804, 0.011010917834937572, + -0.012258552014827728, -0.01729186624288559, -0.02833859808743, + 0.04027653858065605, -0.009032614529132843, -0.017036406323313713, + 0.003077560570091009, -0.04315951466560364, 0.029963837936520576, + 0.01716647669672966, -0.02696092799305916, -0.006828296463936567, + -0.026644738391041756, -0.0889354720711708, -0.05207673832774162, + 0.05015599727630615, -0.019386274740099907, -0.03684607893228531, + 0.046758152544498444, 0.05076799914240837, 0.007221075240522623, + 0.016260067000985146, -0.03850802034139633, 0.054756514728069305, + -0.012776038609445095, 0.036689598113298416, -0.02616005390882492, + 0.033269986510276794, -0.05991198495030403, -0.00043452056706883013, + -0.004230191465467215, -0.008319171145558357, 0.0068639665842056274, + -0.016133679077029228, 0.005670355167239904, -0.005839305464178324, + 0.027315128594636917, 0.04275438189506531, -0.05024448782205582, + 0.05199997127056122, 0.05441230162978172, -0.049353599548339844, + -0.008365850895643234, 0.0066192797385156155, -0.055218830704689026, + 0.01654958724975586, 0.013456150889396667, -0.01148252934217453, + -0.014086110517382622, 0.024422504007816315, -0.001102397684007883, + -0.022180721163749695, 0.022023402154445648, -0.03140854462981224, + 0.0038229606579989195, 0.01081792451441288, -0.006220541428774595, + -0.02462770976126194, 0.051595576107501984, 0.04433179274201393, + -0.05173564329743385, 0.00422016903758049, 0.01004322525113821, + 0.01985878124833107, 0.028202056884765625, -0.0016253730282187462, + 0.01617475040256977, 0.010118816047906876, 0.0603351853787899, + 0.0140571603551507, 0.0029350141994655132, 0.04240429773926735, + 
0.06991686671972275, -0.10119865089654922, 0.08219177275896072, + 0.02891121245920658, 0.00130809610709548, -0.016763439401984215, + -0.00509023480117321, -0.0035567383747547865, 0.02000737562775612, + -0.002168829319998622, 0.01889166608452797, 0.04547121003270149, + 0.04019659012556076, 0.024593960493803024, 0.017190879210829735, + 0.007603269536048174, 0.007314966060221195, -0.06791973859071732, + -0.036731328815221786, 0.007499238010495901, 0.02694091759622097, + -0.02129160799086094, 0.021507520228624344, 0.09310256689786911, + -0.03434507176280022, 0.006634920369833708, 0.03453971818089485, + 0.0155464056879282, 0.003056582296267152, -0.004192651715129614, + 0.03274714946746826, 0.04909229651093483, 0.025430802255868912, + -0.01268640998750925, -0.047261349856853485, 0.0018452038057148457, + -0.0002589405339676887, -0.07050265371799469, 0.004126655403524637, + -0.07842830568552017, 0.00013916153693571687, 0.047662656754255295, + 0.01362426858395338, -0.07088430225849152, -0.026547620072960854, + 0.010091855190694332, -0.015962867066264153, 0.03528159111738205, + 0.011798265390098095, 0.020107097923755646, -0.013524978421628475, + 0.016901858150959015, -0.08753035962581635, -0.062227677553892136, + -0.028078285977244377, -0.03297634422779083, 0.008013743907213211, + 0.018041228875517845, -0.022115394473075867, 0.00595641927793622, + 0.019160043448209763, 0.008510938845574856, -0.0474565215408802, + -0.038813307881355286, -0.016643475741147995, 0.06800767034292221, + 0.041471801698207855, 0.03696686029434204, 0.03421548008918762, + -0.03440884128212929, 0.06769654899835587, -0.01683412306010723, + 0.028133966028690338, 0.018801912665367126, 0.015075244940817356, + 0.012032945640385151, 0.03569433093070984, -0.021484674885869026, + -0.01213730126619339, 0.023888660594820976, -0.03447817265987396, + 0.03329891338944435, -0.007350335828959942, 0.06528840214014053, + -0.03317185863852501, 0.024836916476488113, -0.061747901141643524, + -0.01068184245377779, -0.021780723705887794, 0.06678029894828796, + 0.03825325518846512, -0.02603997103869915, 0.0200904980301857, + 0.025599440559744835, -0.05657019838690758, 0.028341008350253105, + -0.0439138226211071, 0.05886855348944664, -0.049358345568180084, + 0.014541592448949814, 0.005707047879695892, 0.008378228172659874, + -0.030232897028326988, 0.06261618435382843, -0.013355602510273457, + -0.036993358284235, -0.028917213901877403, -0.0680958554148674, + -0.027451951056718826, -0.07567653805017471, 0.014718701131641865, + -0.009075576439499855, 0.013478180393576622, 0.03208685666322708, + 0.031021032482385635, 0.016195151954889297, 0.008199494332075119, + 0.08357387781143188, -0.01200099941343069, 0.022620532661676407, + 0.05445336923003197, -0.018056273460388184, -0.04719870164990425, + 0.04062207415699959, 0.0009855915559455752, -0.0462096631526947, + 0.05879806727170944, 0.03913828358054161, -0.05726383253931999, + 0.02152623049914837, 0.002137464936822653, 0.01444965973496437, + -0.019534891471266747, -0.0375588983297348, -0.03905639797449112, + -0.027474306523799896, -0.001400938956066966, -0.033295221626758575, + -0.06691068410873413, 0.0620984211564064, 0.020130982622504234, + 0.027853885665535927, 0.028353361412882805, 0.007000260055065155, + -0.015244328416883945, 0.028457706794142723, 0.05079026147723198, + 0.0265045203268528, -0.008008715696632862, 0.011166643351316452, + -0.02545643411576748, -0.09122578054666519, -0.000896137673407793, + 0.0055070980452001095, 0.023860882967710495, -0.056958671659231186, + 0.002000730484724045, 
0.000531299039721489, 0.01964678056538105, + 0.02459172159433365, 0.010496687144041061, 0.032775767147541046, + -0.040455516427755356, -0.01898832432925701, 0.048115238547325134, + 0.008294769562780857, -0.02248159423470497, -0.0020450311712920666, + -0.02413240633904934, 0.0423247255384922, -0.02917350083589554, + -0.0197658222168684, 0.009233975782990456, -0.02438087947666645, + -0.057745061814785004, 0.020396480336785316, -0.028454614803195, + -0.007276479620486498, -0.0060751899145543575, 0.016126802191138268, + 0.07733260095119476, 0.0055052717216312885, -0.0241200253367424, + -0.009856182150542736, -0.01288821641355753, 0.021394196897745132, + -0.0027394252829253674, -0.057746946811676025, -0.055244673043489456, + -0.03518827632069588, 0.020108383148908615, -0.037429675459861755, + 0.06402620673179626, 0.014570947736501694, 0.0011715830769389868, + 0.04670550301671028, -0.03730842098593712, -0.002726265462115407, + -0.03393309563398361, 0.03357642516493797, 0.006151925306767225, + 0.027046309784054756, 0.06079886853694916, 0.08915705978870392, + -0.040912795811891556, -0.009531376883387566, -0.008656186051666737, + -0.010746185667812824, 0.011325616389513016, 0.00910742674022913, + -0.00870103295892477, -0.02257593534886837, -0.008474824018776417, + -0.01126043125987053, -0.006183316465467215, 0.03318650647997856, + -0.005288233514875174, -0.031032271683216095, 0.02630523219704628, + 0.02767125330865383, -0.01024201512336731, -0.02395681105554104, + 0.07798302173614502, 0.06453987956047058, -0.005852920934557915, + 0.08618523925542831, -0.009387078694999218, 0.007869970984756947, + -0.03182069584727287, -0.022106602787971497, -0.0868132933974266, + 0.028115050867199898, 0.07332660257816315, -0.0037628302816301584, + 0.03760993853211403, -0.027132470160722733, 0.030093027278780937, + -0.037918947637081146, 0.039932165294885635, -0.07345228642225266, + -0.046965666115283966, -0.0013359235599637032, 0.00791996717453003, + 0.03006441704928875, 0.04222951829433441, -0.0141807422041893, + -0.021912341937422752, -0.0065930006094276905, -0.0038735137786716223, + -0.038659993559122086, -0.057126715779304504, 0.006521300878375769, + -0.030727874487638474, -0.022539950907230377, -0.06316803395748138, + 0.06865260004997253, 0.031939368695020676, -0.055947039276361465, + 0.0066061182878911495, -0.014607742428779602, -0.02204318344593048, + -0.05172397196292877, 0.02495967596769333, 0.07759078592061996, + 0.0027070387732237577, 0.008220532909035683, 0.02342107705771923, + 0.03180982172489166, -0.03099866956472397, 0.03512701019644737, + -0.03168865293264389, 0.012847676873207092, 0.06514899432659149, + 0.08987598121166229, 0.0024377063382416964, 0.02394464798271656, + -0.041963983327150345, -0.004438851028680801, 0.015682004392147064, + 0.0410960391163826, 0.05460710451006889, 0.057952240109443665, + 0.020986247807741165, -0.08822161704301834, 0.01074486318975687, + 0.014192330650985241, -0.025726256892085075, 0.0719577744603157, + 0.0021957557182759047, 0.022048326209187508, -0.04020603001117706, + 0.0014428661670535803, -0.0357256680727005, -0.030243121087551117, + -0.0376482829451561, -0.020463477820158005, -0.022432789206504822, + -0.03096373938024044, -0.01816924288868904, -0.05358648672699928, + 0.07382772862911224, -0.014173741452395916, 0.0201816875487566, + 0.006632740143686533, 0.0025384302716702223, 0.04055432602763176, + 0.0069578299298882484, -0.019879184663295746, -0.059168167412281036, + 0.028969064354896545, 0.05784929171204567, -0.002147398190572858, + 0.043272342532873154, 
-0.004542165901511908, -0.0482858382165432, + -0.004174860659986734, 0.020621255040168762, 0.04293094202876091, + 0.0292718093842268, -0.00980047881603241, -0.021710650995373726, + -0.02639775723218918, -0.019148416817188263, 0.09012293070554733, + -0.045379895716905594, -0.023026002570986748, -0.030370570719242096, + 0.008383749052882195, 0.014925302006304264, -0.0011008139699697495, + 0.006763918325304985, -0.025421440601348877, -0.004525069613009691, + 0.03806034475564957, 0.005547006148844957, 0.05000557750463486, + -0.005488873925060034, 0.021936720237135887, 0.020678924396634102, + -0.004737663082778454, 0.040749210864305496, -0.0533074289560318, + 0.025417080149054527, 0.08257681876420975, -0.005508026573807001, + -0.009805315174162388, 0.07595512270927429, -0.0018210643902420998, + -0.029579052701592445, 0.009883117862045765, -0.015399829484522343, + -0.017134232446551323, 0.03538937494158745, 0.0827752947807312, + 0.012051745317876339, -0.07159247249364853, -0.02079680748283863, + 0.03473742678761482, 0.018268825486302376, 0.023407628759741783, + -0.036390434950590134, 0.07932467013597488, 0.004754354711622, + -0.012676632963120937, -0.06851805001497269, 0.02255256660282612, + 0.03780437260866165, 0.04691546410322189, 0.018480120226740837, + 0.0005508657777681947, 0.05573705583810806, -0.009221675805747509, + -0.06587770581245422, 0.015470701269805431, -0.012271493673324585, + -0.025784730911254883, 0.022757982835173607, -0.01213389914482832, + 0.017422374337911606, 0.012241406366229057, 0.04379018396139145, + 0.01124424859881401, 0.002584748901426792, 0.02793707512319088, + -0.04307323694229126, 0.03207562863826752, 0.05286982282996178, + 0.01086041983217001, 0.009665313176810741, -0.054988693445920944, + 0.01324005052447319, -0.04261464625597, -0.02707112766802311, + -0.002658748533576727, 0.03499991446733475, -0.005491453222930431, + 0.006562606431543827, 0.018722862005233765, 0.07151596993207932, + -0.003824777202680707, -0.04148973524570465, -0.06528852880001068, + -0.018773429095745087, -0.023220594972372055, 0.021337825804948807, + 0.003552130190655589, -0.07254927605390549, 0.030997687950730324, + 0.009675328619778156, -0.007739027962088585, -0.001004970632493496, + -0.0009698161156848073, -0.03183043375611305, -0.003764253342524171, + -0.06521959602832794, 0.0077109793201088905, 0.008421109057962894, + 0.02024395577609539, 0.06566902250051498, 0.011374534107744694, + 0.040655992925167084, 0.0274888314306736, -0.0748000368475914, + 0.06930309534072876, 0.014980202540755272, -0.03328235074877739, + 0.07670122385025024, -0.013236696831882, 0.09516690671443939, + 0.0004450292617548257, 0.01539886835962534, -0.11376772075891495, + -0.0004633136559277773, -0.023844275623559952, 0.023186970502138138, + 0.0542912632226944, 0.006978484336286783, 0.03704620897769928, + 0.0761408805847168, 0.0018389171455055475, -0.02292831614613533, + 0.035566531121730804, -0.06125196814537048, -0.01740599237382412, + -0.03189321979880333, -0.023606419563293457, 0.0002929234178736806, + -0.032161861658096313, -0.02417462132871151, 0.007371667306870222, + 0.01384897343814373, 0.0011207011993974447, -0.054523780941963196, + -0.03664090484380722, 0.012376014143228531, 0.005946264136582613, + -0.05214802548289299, 0.06363234668970108, -0.01850913278758526, + 0.03264418616890907, -0.08298838883638382, 0.028580913320183754, + -0.06874261051416397, 0.04560680687427521, -0.01221420057117939, + -0.015291322953999043, 0.011163976043462753, -0.01707146316766739, + -0.021233493462204933, 0.0009499920415692031, 
-0.011884773150086403, + 0.031535957008600235, -0.07693900167942047, -0.030928723514080048, + 0.02938068099319935, 0.013103127479553223, 0.009228850714862347, + -0.04399878904223442, -0.038614701479673386, 0.021263988688588142, + 0.0270336102694273, -0.0022124540992081165, -0.032499391585588455, + 0.029354240745306015, -0.028516946360468864, -0.03277367725968361, + -0.04755333065986633, -0.03938357159495354, -0.029368583112955093, + 0.06943269073963165, 0.017946777865290642, -0.01990826241672039, + 0.014896579086780548, -0.06675421446561813, -0.04962918534874916, + 0.10290152579545975, -0.05442032590508461, 0.0268304031342268, + 0.01750801131129265, 0.0006768505554646254, -0.007724875118583441, + -0.05064627528190613, 0.03560181334614754, 0.005476392339915037, + 0.008490868844091892, -0.005553610157221556, -0.04698188602924347, + -0.025146158412098885, 0.0026807712856680155, 0.0254969522356987, + 0.005350390914827585, 0.004036207217723131, 0.02843003161251545, + 0.008211316540837288, 0.03748054802417755, -0.05300099402666092, + -0.012365839444100857, -0.0130928261205554, -0.03939966484904289, + -0.026050617918372154, -0.04415596276521683, -0.03128521516919136, + -0.0388399139046669, 0.05186399444937706, -0.049164481461048126, + 0.043122462928295135, -0.0315178819000721, 0.012280933558940887, + -0.0792573019862175, 0.05075725167989731, -0.04304235801100731, + 0.018651138991117477, 0.03076835162937641, -0.060538437217473984, + -0.023055853322148323, 0.01177286822348833, 0.058492839336395264, + 0.025716299191117287, 0.009599392302334309, -0.012054546736180782, + -0.027742642909288406, -0.05367058888077736, -0.026801493018865585, + -0.014112668111920357, -0.006383270025253296, 1.2056754712830298e-5, + 0.039540693163871765, 0.02213987335562706, -0.08540242910385132, + -0.04058465361595154, 0.008699232712388039, 0.031218260526657104, + 0.0021884969901293516, 0.011582552455365658, 0.025049764662981033, + 0.04276714473962784, 0.009781924076378345, 0.05123818293213844, + 0.07441077381372452, -0.029336893931031227, 0.02714505046606064, + 0.041163086891174316, -0.006217346992343664, 0.025060802698135376, + 0.023126818239688873, -0.07503696531057358, -0.0020585027523338795, + 0.005981603171676397, -0.027166299521923065, -0.020568007603287697, + 0.005853605456650257, -0.006091856863349676, -0.033637579530477524, + -0.039759427309036255, -0.06260950118303299, -0.024897020310163498, + 0.02462431788444519, 0.01859314925968647, 0.010398009791970253, + -0.00020126033632550389, -0.06035298481583595, -0.019108809530735016, + 0.042335279285907745, -0.03559218347072601, -0.02529655024409294, + -0.02809930220246315, -0.05607590824365616, -0.026691321283578873, + -0.026792382821440697, 0.04120280221104622, -0.015540994703769684, + -0.005803580395877361, 0.020203134045004845, -0.05952906608581543, + -0.004206392448395491, -0.011308858171105385, -0.037488050758838654, + 0.007830106653273106, -0.009608179330825806, 0.00015318443183787167, + -0.0684049054980278, 0.0018899703864008188, -0.023107590153813362, + -0.015158215537667274, -0.030714333057403564, 0.025599345564842224, + 0.018543586134910583, -0.0075812251307070255, 0.04323196783661842, + -0.005424505099654198, 0.06189188361167908, -0.01650432124733925, + 0.0035911088343709707, 0.01841658726334572, 0.012203766033053398, + -0.015994010493159294, -0.0018007376929745078, 0.011197488754987717, + -0.01184547133743763, 0.06119342893362045, -0.04449119791388512, + -0.010956074111163616, 0.015267443843185902, 0.03397256135940552, + -0.022375188767910004, 
-0.010562969371676445, -0.030176721513271332, + 0.0649082288146019, -0.026252834126353264, -0.043264783918857574, + 0.020383840426802635, -0.014332938008010387, -0.019906938076019287, + 0.002487052930518985, 0.016441592946648598, 0.05937374755740166, + 0.029459767043590546, -0.03393784165382385, -0.016614725813269615, + -0.03633803129196167, 0.04786395654082298, -0.014543719589710236, + 0.0030611655674874783, -0.03296193480491638, 0.024570109322667122, + -0.08628548681735992, -0.008082202635705471, 0.05895440652966499, + -0.05567137897014618, -0.05882163718342781, -0.005672273691743612, + -0.022155780345201492, -0.03165644034743309, -0.04472680389881134, + 0.025338545441627502, 0.053867027163505554, -0.020717058330774307, + -0.019026240333914757, -0.03037080727517605, -0.005734192673116922, + -0.014109884388744831, -0.005240253172814846, 0.056925658136606216, + 0.006881027482450008, 0.006321505177766085, 0.025533199310302734, + 0.0066923401318490505, 0.014867548830807209, 0.01877731829881668, + -0.03573253005743027, 0.05504361167550087, 0.044875118881464005, + 0.008996511809527874, -0.020861415192484856, 0.0196152962744236 + ], + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/enterprise_search", + "title": "Enterprise Search", + "content": "More than Search\nWhen analyzing the entire corpus of knowledge within your company is as easy as asking a question in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial to identify where knowledge is well documented and where it is lacking. Team members who are centers of knowledge can begin to effectively document their expertise since it is no longer being thrown into a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\nWith Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even the most junior members can understand it. This in turn lets them give the most holistic and technically accurate response possible to your customers. 
On the other end, even the super stars of your sales team will not be able to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it in mere seconds and give crucial context to help your team close.", + "title_embedding": [ + -0.011060578748583794, 0.05994883179664612, 0.008845113217830658, + 0.011364905163645744, 0.03147757425904274, 0.04506697878241539, + -0.025942707434296608, -0.011002028360962868, -0.03507396578788757, + -0.01727098599076271, -0.016820134595036507, 0.04671240597963333, + 0.023456331342458725, -0.005752791650593281, -0.011421029455959797, + -0.04169125109910965, 0.0652366429567337, -0.011136278510093689, + -0.013501451350748539, -0.006273901090025902, 0.0012236927868798375, + -0.030771249905228615, 0.010098040103912354, -0.02360220067203045, + 0.006734110414981842, 0.001379420980811119, -0.0047225081361830235, + -0.013901330530643463, 0.014645840041339397, -0.02156134508550167, + -0.026707857847213745, -0.00676271365955472, 0.056067030876874924, + -0.0455806590616703, -0.02762053906917572, -0.0965149849653244, + 0.04567359760403633, 0.053895801305770874, 0.029093541204929352, + 0.0199823547154665, 0.047089505940675735, 0.026028119027614594, + -0.0034626282285898924, -0.021002190187573433, 0.04217635095119476, + -0.015198000706732273, 0.047393105924129486, 0.008588545024394989, + 0.07058116048574448, -0.09135723114013672, -0.009591161273419857, + 0.014971816912293434, -8.222273208957631e-7, -0.05534408614039421, + -0.029965048655867577, -0.028381407260894775, 0.025547217577695847, + 0.038583844900131226, 0.01984122209250927, -0.02435469813644886, + 0.04955849424004555, 0.0128632802516222, -0.022573867812752724, + 0.025284791365265846, 0.04496009647846222, 0.0004200333496555686, + -0.0015001222491264343, 0.02609623409807682, 0.023677939549088478, + -0.05961468443274498, 0.06799754500389099, -0.014409428462386131, + -0.010374268516898155, 0.019634529948234558, -0.03720257058739662, + -0.04654879495501518, 0.0056068566627800465, -0.021244503557682037, + -0.03198164328932762, 0.05707596614956856, 0.043927326798439026, + -0.033356692641973495, 0.015723733231425285, -0.027493512257933617, + 0.04525380581617355, 0.025808652862906456, -0.007083983160555363, + -0.011038954369723797, -0.004390218295156956, -0.006583297159522772, + 0.003319315379485488, 0.061810243874788284, 0.05887124314904213, + 0.07722297310829163, -0.06601747870445251, 0.036486171185970306, + 0.05119618400931358, 0.00945530366152525, -0.03005358576774597, + -0.047870855778455734, -0.0359003059566021, -0.005699747242033482, + 0.053807053714990616, -0.001554036163724959, 0.060548700392246246, + 0.05476829782128334, 0.00426551653072238, -0.005215689539909363, + -0.0352163203060627, -0.03284529969096184, -0.03373449295759201, + 0.021254340186715126, -0.010481598787009716, 0.02651101164519787, + -0.00481840968132019, 0.045160870999097824, 0.09704204648733139, + -0.03473421558737755, 0.015584945678710938, 0.06579536944627762, + 0.0651017278432846, 0.007380738854408264, 0.00624364148825407, + 0.07893780618906021, -0.019876087084412575, 0.006619091611355543, + -0.030776498839259148, -0.016426965594291687, 0.014603338204324245, + -0.03326896205544472, 0.003784433240070939, 0.025205042213201523, + 0.03047170303761959, -0.03364298865199089, 0.005974944215267897, + 0.015994269400835037, -0.10205432027578354, -0.026497431099414825, + 0.07166463136672974, -0.007370935752987862, 0.034912627190351486, + -0.004656887147575617, 0.03219066560268402, -0.02239271067082882, + 
0.012679396197199821, -0.07867992669343948, -0.026638884097337723, + -0.008346030488610268, -0.027243634685873985, 0.012043806724250317, + 0.024078860878944397, 0.0006219774950295687, 0.0032065671402961016, + -0.008564895950257778, -0.03275461867451668, -0.05791788548231125, + -0.044815272092819214, 0.006680188700556755, 0.04072298854589462, + 0.016144724562764168, -0.0008583687013015151, 0.03699830546975136, + -0.04675738513469696, 0.06570404022932053, -0.011776894330978394, + -0.011386243626475334, -0.0003363603027537465, 0.039485324174165726, + 0.014156803488731384, 0.018139634281396866, -0.014710970222949982, + -0.052651647478342056, 0.02912742830812931, 0.024101730436086655, + 0.0413704477250576, 0.03631320223212242, 0.046750932931900024, + -0.017062805593013763, 0.017990263178944588, -0.03954370319843292, + -0.006972718983888626, -0.03784368559718132, 0.02746269851922989, + 0.04107878357172012, -0.005694024730473757, -0.03896583244204521, + 0.026265999302268028, -0.035318680107593536, -0.018394622951745987, + 0.013594037853181362, 0.0381510891020298, -0.010223937220871449, + 0.032417282462120056, 0.01610656827688217, -0.013205642811954021, + -0.03757423907518387, 0.03799910470843315, -0.039449408650398254, + -0.011290505528450012, -0.016824476420879364, 0.007128347177058458, + 0.030213234946131706, -0.09385695308446884, 0.014417118392884731, + -0.021249795332551003, -0.021371016278862953, 0.031582340598106384, + -0.015021033585071564, 0.03207740932703018, 0.04465494304895401, + 0.051139406859874725, -0.004539252258837223, -0.004026447422802448, + 0.036198731511831284, 0.002513982355594635, -0.022555746138095856, + -0.023142442107200623, 0.026506206020712852, -0.0208470169454813, + 0.01958189532160759, 0.02606782503426075, -0.050900157541036606, + 0.001175468903966248, 0.0026185859460383654, 0.01644700951874256, + 0.047048378735780716, -0.006155692040920258, 0.013264120556414127, + -0.004277337808161974, 0.022337032482028008, -0.030710609629750252, + -0.06784506887197495, 0.010662010870873928, -0.020733945071697235, + -0.01206474844366312, -0.0005046974983997643, -0.004159707576036453, + 0.028128545731306076, -0.011551725678145885, 0.057953692972660065, + 0.028500419110059738, 0.02070418931543827, 0.029373178258538246, + -0.053878165781497955, -0.03885475918650627, -0.011427262797951698, + -0.040592946112155914, 0.019192807376384735, -0.013966009952127934, + 0.002324307570233941, 0.027266129851341248, 0.02721570059657097, + -0.013851913623511791, 0.06292124837636948, -0.019983768463134766, + -0.06498263776302338, -0.014787066727876663, 0.07545251399278641, + 0.009921795688569546, -0.02266773208975792, -0.0174646507948637, + -0.0037801002617925406, 0.037214070558547974, -0.033669255673885345, + -0.0033054312225431204, -0.004362864885479212, -0.010861773043870926, + -0.041649043560028076, 0.02711806818842888, -0.001099557732231915, + 0.0007163260015659034, 0.01317980233579874, -0.011158796027302742, + 0.03966476768255234, 0.023275790736079216, -0.011645027436316013, + 0.0030634249560534954, -0.01243121363222599, 0.01271719578653574, + 0.003938829991966486, -0.00769989937543869, -0.039121564477682114, + 0.0005735178128816187, 0.02157283015549183, 0.005828005261719227, + 0.03934130072593689, 0.015216797590255737, 0.017237801104784012, + -0.037648268043994904, -0.007132838945835829, -0.018956882879137993, + -0.0597093440592289, 0.058341480791568756, -0.0008284997311420739, + 0.02095271646976471, 0.043099164962768555, 0.09887702018022537, + -0.01221393421292305, -0.02239784225821495, 
0.016775032505393028, + 0.013331425376236439, -0.004451168701052666, -0.02870352193713188, + -0.020854242146015167, 0.05349724739789963, 0.03315908834338188, + 0.018541062250733376, -0.03136591613292694, 0.03549784794449806, + -0.0076525891199707985, -0.06454484909772873, 0.049847088754177094, + 0.012184737250208855, 0.03575005754828453, -0.050804175436496735, + 0.09406977146863937, 0.05103312432765961, -0.0036910600028932095, + 0.10705005377531052, 0.011394658125936985, -0.014218435622751713, + -0.042272791266441345, 0.018426422029733658, -0.08213183283805847, + -0.010240674018859863, 0.051353540271520615, 0.016103247180581093, + 0.04293083772063255, -0.00462630670517683, 0.001971749123185873, + -0.05101824551820755, -0.017815101891756058, -0.0788436159491539, + -0.019784294068813324, 0.006863154470920563, 0.04096531495451927, + 0.016416994854807854, 0.018884461373090744, -0.03645262494683266, + -0.02363709919154644, 0.08447448164224625, 0.027652855962514877, + -0.005039512179791927, -0.05533800646662712, 0.006148343440145254, + -0.03248206898570061, -0.015117023140192032, -0.056908346712589264, + 0.057090409100055695, 0.02987913228571415, -0.0642392635345459, + -0.01212853193283081, -0.04195745661854744, -0.008033841848373413, + -0.05249612778425217, 0.05965931713581085, 0.08591161668300629, + -0.012983623892068863, 0.002055486897006631, -0.002928174799308181, + -0.023014886304736137, -0.05307631567120552, 0.0325687900185585, + -0.008586175739765167, -0.005393583793193102, 0.009566529653966427, + 0.06500132381916046, -0.02100509963929653, -0.018470296636223793, + 0.001247459789738059, 0.007388024125248194, 0.012469757348299026, + 0.08475572615861893, 0.06918514519929886, 0.054265547543764114, + 0.03292711451649666, -0.08437038213014603, 0.07744771242141724, + -0.0004291488730814308, -0.020394261926412582, 0.039096955209970474, + 0.015851527452468872, -0.009922537952661514, 0.02087295800447464, + -0.019477976486086845, -0.06510577350854874, 0.008559669367969036, + 0.015032066963613033, -0.022979427129030228, -0.017166415229439735, + -0.014456263743340969, -0.034205030649900436, -0.04903494939208031, + 0.073653943836689, -0.041798241436481476, 0.0035302129108458757, + 0.031043095514178276, 0.038764648139476776, 0.03582717105746269, + -0.003121789079159498, 0.03909862041473389, -0.03283870965242386, + 0.06343409419059753, 0.085169717669487, 0.0037416887935250998, + 0.043896209448575974, -0.02215113304555416, -0.04062772914767265, + -0.029482074081897736, 0.0013964198296889663, 0.04621904715895653, + 0.030072476714849472, -0.023583346977829933, -0.016047311946749687, + -0.04016166180372238, -0.026690224185585976, 0.034725841134786606, + -0.08011004328727722, -0.023635270074009895, -0.01675681211054325, + 0.02217511460185051, -0.018720457330346107, 0.0413116030395031, + -0.0045730252750217915, -0.08402986079454422, 0.03641941770911217, + 0.028000695630908012, 0.042173273861408234, 0.024761751294136047, + -0.051845893263816833, -0.07877497375011444, -0.020710380747914314, + -0.035789184272289276, 0.04824375733733177, -0.04493764415383339, + -0.0014088008319959044, 0.09272980690002441, -0.030772028490900993, + 0.027623610571026802, -0.0008853759500198066, -0.015347420237958431, + -0.0006863650633022189, 0.02924676053225994, 0.03864092007279396, + -0.043402496725320816, 0.11410719156265259, 0.01606914773583412, + 0.03158045932650566, -0.049648500978946686, -0.026801105588674545, + 0.013934214599430561, -0.04582132399082184, -0.02133217453956604, + 0.013296819292008877, 0.030687933787703514, 
0.0014671665849164128, + 0.005454834550619125, -0.024595070630311966, 0.036868833005428314, + -0.003586424048990011, -0.007300499361008406, 0.00619609747081995, + 0.004614396020770073, 0.06406176835298538, 0.010256785899400711, + -0.050202082842588425, -0.013155301101505756, -0.04005127400159836, + -0.027943719178438187, 0.05738724395632744, -0.002920332597568631, + -0.00731270294636488, 0.04419538751244545, 0.024069754406809807, + 0.012176074087619781, 0.004615467507392168, -0.04112132638692856, + -0.04844773933291435, -0.012684458866715431, 0.0071298484690487385, + -0.010914848186075687, -0.03592529892921448, -0.05016973987221718, + -0.011797907762229443, -0.043843258172273636, -0.03715396672487259, + 0.016528192907571793, 0.024301515892148018, 0.01335576456040144, + 0.021006477996706963, -0.021391959860920906, 0.05299517139792442, + 0.0070807491429150105, -0.08096124231815338, -0.07334060221910477, + -0.034530941396951675, -0.04421507194638252, 0.010524646379053593, + 0.009575314819812775, -0.031711090356111526, 0.023479584604501724, + -0.04212309420108795, 0.016264619305729866, 0.03907531499862671, + -0.0011187525233253837, -0.03998023644089699, -0.027464834973216057, + -0.07113838940858841, -0.028915319591760635, -0.01282753050327301, + -0.0033073138911277056, 0.026715606451034546, -0.002769897459074855, + 0.020033732056617737, 0.014502385631203651, -0.017903830856084824, + 0.06932531297206879, 0.0432068407535553, 0.01685408502817154, + 0.04834728315472603, -0.009553197771310806, 0.019799189642071724, + 0.01173039898276329, 0.04158413037657738, -0.018829666078090668, + -0.008410722948610783, 0.008009687066078186, 0.034592460840940475, + 0.07790639251470566, -0.022050900384783745, 0.04081638529896736, + 0.046872470527887344, 0.0010260086273774505, -0.05322079360485077, + 0.009096509777009487, -0.06831686198711395, -0.01390997413545847, + -0.020475609228014946, 0.017393099144101143, -0.007532020565122366, + -0.06435851007699966, -0.014785194769501686, 0.02654031664133072, + 0.004753720946609974, 0.026440177112817764, -0.028890414163470268, + -0.011440729722380638, 0.003554105758666992, -0.0022926912643015385, + -0.02393224649131298, 0.03711748123168945, -0.06023703143000603, + -0.008778683841228485, -0.05984162166714668, -0.024247022345662117, + -0.036919932812452316, 0.05249374359846115, 0.03022468276321888, + -0.011348876170814037, 0.0008303995127789676, 0.001597013440914452, + -0.015491127036511898, -0.035073015838861465, -0.024477796629071236, + -0.030328145250678062, -0.09301470220088959, -0.046440113335847855, + 0.036719027906656265, -0.021899227052927017, 0.04666316881775856, + -0.07481305301189423, -0.04928148165345192, -0.01480096485465765, + 0.0014140848070383072, 0.016779841855168343, -0.04318199306726456, + 0.011910341680049896, -0.04019855335354805, -0.027363713830709457, + 0.006433602888137102, 0.023732252418994904, -0.013081788085401058, + 0.02489032782614231, 0.005415213759988546, -0.058724161237478256, + 0.032487478107213974, -0.014332194812595844, -0.020952431485056877, + 0.055405858904123306, -0.02239573374390602, 0.016315918415784836, + 0.04710645601153374, 0.006866136100143194, -0.019589263945817947, + -0.046199049800634384, 0.04977096989750862, -0.03211359679698944, + 0.06759121268987656, -0.007805021945387125, 0.009877636097371578, + -0.003194598713889718, -0.0014034705236554146, 0.024012917652726173, + 0.0007609894964843988, 0.04028927534818649, 0.047299597412347794, + 0.04644732549786568, 0.06253348290920258, -0.03101237863302231, + -0.04797065258026123, 
-0.02459110878407955, -0.06663094460964203, + -0.012946722097694874, -0.046321313828229904, -0.03617801144719124, + -0.06608668714761734, 0.01371682621538639, -0.040183935314416885, + 0.027353622019290924, -0.013125114142894745, 0.020482128486037254, + -0.10186963528394699, 0.03741387277841568, -0.048566944897174835, + 0.0017904090927913785, 0.0444694422185421, -0.02355058304965496, + -0.04245513305068016, 0.01599632203578949, -0.00974870752543211, + -0.02246273122727871, 0.011107604950666428, -0.006354854442179203, + -0.08260829746723175, -0.054969724267721176, -0.038703542202711105, + -0.02590899169445038, -0.012424441985785961, 0.033952418714761734, + 0.032632969319820404, 0.03585505858063698, -0.027734532952308655, + -0.05185376852750778, 0.005663866177201271, 0.01415393128991127, + 0.007472912315279245, -0.0325092077255249, -0.0008526426972821355, + 0.05909401550889015, -0.006496420595794916, 0.06674317270517349, + 0.06033811718225479, -0.04705937206745148, 0.01221691444516182, + -0.005195186473429203, 0.017006050795316696, 0.015768419951200485, + -0.02346021682024002, -0.04318040981888771, -0.00965888798236847, + -0.012831253930926323, -0.023086808621883392, -0.043478451669216156, + 0.02215973101556301, 0.01018955372273922, -0.0029477940406650305, + -0.026364397257566452, -0.04219489544630051, -0.0690244510769844, + 0.0017003740649670362, -0.03498053178191185, -0.01891854591667652, + -0.020380523055791855, -0.07183944433927536, 0.01474913302809, + 0.012818068265914917, 0.02298390306532383, 0.006645163521170616, + -0.014497633092105389, -0.05751577392220497, -0.01127719134092331, + 0.014469895511865616, 0.039319343864917755, -0.002891098614782095, + 0.0038161359261721373, -0.0176107045263052, -0.02695712260901928, + 0.023520348593592644, 0.053624920547008514, -0.0472102165222168, + -0.021724319085478783, -0.04204733297228813, 0.004941252060234547, + -0.07744265347719193, -0.028974706307053566, -6.1493665270973e-5, + -0.020630594342947006, -0.014794640243053436, -0.045572925359010696, + 0.03233763575553894, 0.00969443004578352, 0.03665856271982193, + 0.027483846992254257, 0.074271060526371, -0.07454165071249008, + -0.034101732075214386, -0.07216823101043701, -0.001424514572136104, + -0.0025912360288202763, -0.002444307319819927, -0.012540637515485287, + 0.009027975611388683, 0.06855443120002747, -0.0013480151537805796, + 0.027303414419293404, -0.019723499193787575, 0.033644214272499084, + -0.04313155263662338, -0.016152892261743546, -0.020085612311959267, + 0.029526935890316963, 0.0004591972683556378, -0.013712934218347073, + 0.015895912423729897, -0.046559300273656845, -0.00015638815239071846, + 0.0015497541753575206, -0.0015048328787088394, 0.06692329794168472, + 0.0013934546150267124, 0.008921030908823013, -0.010347972624003887, + -0.039798807352781296, 0.06892028450965881, 0.021145053207874298, + 0.007431029342114925, -0.05281573906540871, 0.015844792127609253, + 0.014578046277165413, -0.0020482230465859175, 0.03509555384516716, + -0.021227506920695305, -0.03619229048490524, 0.004116897005587816, + 0.02835669554769993, -0.0028248224407434464, 0.00836214143782854, + -0.004688165616244078, 0.04566347226500511, -0.0352579727768898, + -0.007859165780246258, -0.003958444111049175, 0.023938892409205437, + 0.04262895882129669, -0.02836589328944683, 0.0456448458135128, + -0.062015753239393234, 0.03518408164381981, 0.06333593279123306, + -0.0155468275770545, 0.013991734012961388, 0.02207978442311287, + 0.0032898876816034317, 0.05948015674948692, 0.010670959949493408, + -0.00624996330589056, 
-0.04401599243283272, -0.0022705462761223316 + ], + "content_embedding": [ + -0.01892169564962387, 0.0662541389465332, 0.008976679295301437, + -0.03809165209531784, 0.02344459854066372, 0.012984057888388634, + 0.016158411279320717, 0.0040777078829705715, -0.0321662537753582, + -0.0026544055435806513, -0.03179372847080231, 0.019741656258702278, + 0.049423426389694214, 0.019327590242028236, 0.01367267221212387, + -0.042058881372213364, 0.023155249655246735, -0.015003002248704433, + 0.01056167297065258, 0.0032619787380099297, -0.014582481235265732, + -0.01262009609490633, -0.009695992805063725, -0.025683948770165443, + 0.010330218821763992, -0.043577518314123154, -0.03799012303352356, + 0.03159527853131294, -0.046592168509960175, 0.03461733087897301, + 0.029929379001259804, -0.02696100063621998, 0.01958872564136982, + -0.04882275313138962, -0.04835181683301926, -0.07444816827774048, + 0.0615590400993824, -0.018079139292240143, -0.02907492406666279, + 0.03256160393357277, 0.052772294729948044, 0.0014335751766338944, + 0.02048010565340519, -0.01859121397137642, 0.0436980240046978, + -0.028847631067037582, 0.06271578371524811, -0.04908007010817528, + 0.04021253436803818, -0.07390867173671722, 0.0029745057690888643, + -0.01733274944126606, -0.005066753830760717, -0.006927797570824623, + -0.01495048776268959, 0.020951012149453163, -0.02161789871752262, + 0.004997345618903637, 0.02517000213265419, -0.03955457732081413, + 0.038905348628759384, 0.008108963258564472, -0.04058837890625, + 0.03415047749876976, -0.004129728768020868, -0.07600218057632446, + -0.008998502045869827, 0.012445643544197083, -0.005613638553768396, + -0.015701062977313995, 0.010493642650544643, -0.01511659286916256, + -0.007434363476932049, 0.04920893907546997, -0.044436678290367126, + -0.015229232609272003, -0.009392009116709232, -0.004889432340860367, + -0.03250344097614288, 0.05671893432736397, 0.03468514233827591, + -0.04985000938177109, 0.021073583513498306, 0.005558345932513475, + 0.04397028684616089, -0.011105467565357685, -0.0010204907739534974, + -0.0013343892060220242, 0.010888955555856228, 0.11187340319156647, + 0.05144372954964638, 0.014714346267282963, 0.03652629256248474, + 0.08354610204696655, -0.050587598234415054, 0.07670528441667557, + 0.022823045030236244, -0.010303523391485214, -0.00016479991609230638, + -0.015029380097985268, -0.010333288460969925, 0.03660477325320244, + 0.013327172957360744, -0.008142965845763683, 0.04656663164496422, + 0.043171610683202744, 0.027440473437309265, 0.011585040017962456, + -0.008035292848944664, -0.0008554590749554336, -0.04715310037136078, + -0.013419345021247864, -0.034535810351371765, 0.028465399518609047, + -0.030552269890904427, 0.02954002656042576, 0.11263657361268997, + -0.060091886669397354, -0.004718341864645481, 0.02276463434100151, + -0.029855655506253242, 1.136395258072298e-5, 0.01254600752145052, + 0.030318304896354675, 0.04609473794698715, 0.04090471565723419, + -0.015202691778540611, -0.025406358763575554, 0.01403091847896576, + -0.01206378173083067, -0.034794360399246216, 0.021181223914027214, + -0.041345320641994476, 0.026389217004179955, 0.04634319990873337, + 0.05973498523235321, -0.0791369080543518, -0.018549518659710884, + 0.009269041940569878, 0.005099988542497158, 0.016017470508813858, + 0.016928445547819138, 0.004272987134754658, -0.03169683367013931, + 0.008137955330312252, -0.07982300966978073, -0.037415798753499985, + -0.0016467635286971927, -0.016258487477898598, 0.01855027861893177, + 0.012749083340168, -0.015595809556543827, 0.009437683038413525, 
+ 0.005881224758923054, 0.009153603576123714, -0.035431332886219025, + -0.03822671249508858, -0.007053021341562271, 0.07195861637592316, + 0.03834277018904686, 0.025282155722379684, 0.03235918655991554, + -0.040675584226846695, 0.06914123892784119, -0.014681060798466206, + 0.04182145744562149, 0.016547678038477898, 0.0302575696259737, + 0.027968881651759148, 0.028392894193530083, -0.03601876646280289, + 0.011166741140186787, 0.013932433910667896, -0.024813517928123474, + 0.04876561462879181, 0.03280804678797722, 0.020525190979242325, + -0.04888831451535225, 0.05333299934864044, -0.01227282639592886, + 0.009397462010383606, -0.062118303030729294, 0.020511150360107422, + 0.03606007248163223, -0.011546325869858265, 0.02632950246334076, + 0.03558770939707756, -0.04729287326335907, -0.00040853474638424814, + -0.05594595894217491, 0.03343893215060234, -0.03624171018600464, + -0.01565496437251568, 0.03419746086001396, -0.014939344488084316, + -0.0346553735435009, 0.02617849037051201, -0.018064821138978004, + 0.00044916238402947783, -0.029752276837825775, -0.06982599943876266, + -0.01529014203697443, -0.10238753259181976, 0.056908924132585526, + -0.018579944968223572, 0.032441046088933945, 0.02623467892408371, + 0.0005816647899337113, 0.024393916130065918, -0.0010619793320074677, + 0.09054756909608841, 0.012866330333054066, 0.0110749127343297, + 0.060603830963373184, -0.04485912621021271, -0.035673510283231735, + 0.00880404282361269, -0.0236192774027586, -0.04651271551847458, + 0.04936773329973221, 0.016861658543348312, -0.026910705491900444, + 0.02507326751947403, 0.0018011556239798665, -0.01599423959851265, + 0.007061067037284374, -0.028597962111234665, -0.005096979904919863, + -0.003091734368354082, -0.008610324002802372, -0.03941959887742996, + -0.07249880582094193, 0.07896454632282257, -0.01282701175659895, + 0.03806105628609657, -0.01628866419196129, -0.00032510326127521694, + 0.007600210607051849, 0.012463843449950218, 0.07028777152299881, + 0.024854836985468864, 0.00597741175442934, 0.012146051973104477, + -0.04252159595489502, -0.08857864141464233, 0.005069843493402004, + -0.002303875982761383, 0.007218160200864077, -0.054320499300956726, + 0.01721455715596676, -0.012323171831667423, 0.029316846281290054, + 0.010660098865628242, 0.01619168184697628, 0.024796800687909126, + -0.06043343245983124, -0.009076021611690521, 0.05426326394081116, + 0.024232488125562668, -0.025832876563072205, 0.024366402998566628, + -0.04501958563923836, 0.04263340309262276, -0.01757700741291046, + 0.0240378025919199, 0.007873878814280033, -0.019204245880246162, + -0.04099274054169655, -0.0028695412911474705, -0.02336733788251877, + -0.009908018633723259, 0.01244357880204916, 0.014616346918046474, + 0.07263968884944916, -0.006017595529556274, 0.006593986880034208, + -0.017023928463459015, -0.0008568437770009041, 0.0393415242433548, + -0.03193742036819458, -0.07265064865350723, -0.056716252118349075, + -0.06321432441473007, 0.0014871162129566073, 0.015271728858351707, + 0.06799189001321793, 0.002235528314486146, 0.015148743987083435, + 0.029075419530272484, -0.036075517535209656, 0.03699851781129837, + 0.002699150936678052, 0.029273545369505882, 0.024833064526319504, + 0.02166113816201687, 0.07822758704423904, 0.0907154381275177, + -0.015422212891280651, -0.004725399427115917, -0.013691544532775879, + 0.00014949020987842232, 0.003309824038296938, 0.019388742744922638, + -0.01792132295668125, -0.005919941700994968, -0.009184692986309528, + -0.00453580915927887, -0.017324700951576233, 0.020368218421936035, + 
0.007512629963457584, -0.05764073505997658, 0.01584697514772415, + -0.016094518825411797, -0.0366678424179554, -0.02194156125187874, + 0.053442906588315964, 0.04864593967795372, -0.009642759338021278, + 0.06584249436855316, 0.017993653193116188, 0.02838297188282013, + -0.02758033573627472, -0.018208689987659454, -0.08217029273509979, + 0.001340706367045641, 0.07344162464141846, -0.0014725526561960578, + 0.027256185188889503, -0.03795681148767471, 0.03496084362268448, + -0.009351355955004692, 0.03554052114486694, -0.0647641122341156, + -0.018092816695570946, -0.0003290708118584007, -0.008958869613707066, + -0.0006743986159563065, 0.02749652974307537, -0.005728874355554581, + -0.00014254855341278017, 0.02650611102581024, -0.007747439201921225, + -0.036285076290369034, -0.04723037779331207, -0.01256555411964655, + -0.015652446076273918, -0.0033896011300385, -0.027379868552088737, + 0.06606956571340561, 0.001414530212059617, -0.03816799819469452, + 0.005582350306212902, -0.0037654521875083447, -0.03315531834959984, + -0.03833584487438202, 0.005306297447532415, 0.06055983901023865, + 0.017386972904205322, 0.017846958711743355, 0.002940434729680419, + 0.06065093353390694, -0.033751003444194794, 0.02014659158885479, + -0.026745468378067017, 0.02349875122308731, 0.06887564063072205, + 0.08784544467926025, 0.0348343662917614, 0.017027992755174637, + 0.007463646121323109, 0.010731169953942299, -0.015452216379344463, + 0.0697169378399849, 0.06115807220339775, 0.05587253347039223, + 0.0035254100803285837, -0.06922555714845657, -0.00895272009074688, + 0.04390031844377518, 0.003160918829962611, 0.0734192356467247, + -0.012384983710944653, 0.00778034096583724, -0.06225632503628731, + 0.01105977687984705, -0.019027134403586388, -0.01744268462061882, + -0.03861316666007042, -0.026121554896235466, -0.03796643018722534, + -0.02607419341802597, -0.00727757578715682, -0.04364367574453354, + 0.027054548263549805, 0.001148495590314269, -0.0051346817053854465, + -0.014047800563275814, 0.033344950526952744, 0.016461240127682686, + 0.033907197415828705, -0.052207209169864655, -0.058969806879758835, + 0.019914019852876663, 0.04874560981988907, 0.0043409536592662334, + 0.014156220480799675, -0.025425465777516365, -0.03806624561548233, + 0.027224158868193626, -8.918229286791757e-5, 0.04550011456012726, + 0.02069287933409214, -0.006964664440602064, -0.05213857442140579, + 0.03515300899744034, -0.02322443015873432, 0.07085354626178741, + 0.010733392089605331, -0.04821530729532242, -0.024944474920630455, + 0.01349271647632122, 0.0064827692694962025, 0.021682681515812874, + 0.03466835618019104, -0.023484358564019203, -0.004177657887339592, + 0.019195759668946266, 0.021642865613102913, 0.03591984510421753, + -6.837025284767151e-5, 0.003064215648919344, 0.0067205713130533695, + 0.024574855342507362, 0.03467808663845062, -0.07038415223360062, + 0.020557953044772148, 0.05572228878736496, 0.024007081985473633, + 0.008300675079226494, 0.05382058024406433, -0.008657778613269329, + -0.04247821494936943, -0.02082398161292076, -0.030047548934817314, + -0.0042150202207267284, 0.0643019825220108, 0.08603832125663757, + -0.0032497297506779432, -0.05890907347202301, -0.017683515325188637, + -0.0017970707267522812, 0.030202442780137062, -0.004163889214396477, + -0.005693267099559307, 0.07439851015806198, -0.007623215671628714, + -0.014011486433446407, -0.06531509011983871, -0.012002935633063316, + 0.05098460614681244, 0.018368106335401535, 0.044709816575050354, + -0.034841395914554596, 0.04669453203678131, -0.006633058190345764, + 
-0.06744810938835144, 0.00022071562125347555, -0.02252846583724022, + -0.008146141655743122, 0.04570293799042702, -0.017073389142751694, + 0.033481452614068985, 0.02024919167160988, -0.00039372473838739097, + -0.015125994570553303, 0.0035840750206261873, 0.03293292224407196, + -0.023488696664571762, 0.02769201435148716, 0.03366998955607414, + 0.013383373618125916, -0.0062416414730250835, -0.05436183512210846, + -0.007013875991106033, -0.0343070924282074, 0.008950931020081043, + -0.0007773659308440983, 0.01631912775337696, -0.01733097992837429, + 0.007631183601915836, 0.022811884060502052, 0.05997275933623314, + -0.025991076603531837, -0.06607384979724884, -0.0873650386929512, + -0.05788758397102356, -0.020700229331851006, 0.00862400233745575, + 0.008653292432427406, -0.05257308855652809, -0.01877412386238575, + 0.001132996054366231, 0.007562611252069473, 0.007040517870336771, + -0.03939346596598625, -0.0012852386571466923, 0.03364014998078346, + -0.08792895078659058, 0.0003337061498314142, 0.04566165804862976, + 0.022397097200155258, 0.07704627513885498, 0.011688907630741596, + 0.06875491887331009, 0.031596384942531586, -0.07542278617620468, + 0.06929827481508255, 0.03525209799408913, -0.05507253482937813, + 0.06310203671455383, 0.009202172048389912, 0.08802317827939987, + 0.015267971903085709, 0.01631786674261093, -0.08159693330526352, + 0.011958948336541653, -0.022956276312470436, -0.0045707738026976585, + 0.06590449810028076, -0.025062261149287224, 0.05683448538184166, + 0.08174461871385574, 0.018841996788978577, -0.02901572361588478, + 0.04103256016969681, -0.06138996779918671, -0.02983909286558628, + -0.03850552439689636, -0.018056459724903107, 0.00292590050958097, + -0.0737059935927391, 0.00898703932762146, -0.012909052893519402, + -0.00488039618358016, 0.019017860293388367, -0.037835441529750824, + -0.05031483247876167, 0.025473300367593765, -0.009489303454756737, + -0.08405261486768723, 0.06039801985025406, -0.028819533064961433, + 0.01564796455204487, -0.07851359248161316, 0.00776974530890584, + -0.0627446398139, 0.043354298919439316, -0.0447402149438858, + 0.008833021856844425, -0.0005271312547847629, -0.03740326315164566, + -0.033597033470869064, 0.02730080671608448, -0.030516251921653748, + 0.03767557814717293, -0.10619816929101944, -0.038678478449583054, + 0.02232091873884201, 0.03868230804800987, 0.018831931054592133, + -0.05178656801581383, -0.05465080961585045, 0.03249572589993477, + 0.009297838434576988, -0.003563723061233759, -0.04144677892327309, + 0.0509132519364357, -0.02094709314405918, -0.022470436990261078, + -0.04437573254108429, -0.03695523366332054, -0.075083889067173, + 0.07801777124404907, 0.007801617495715618, -0.005376672837883234, + 0.020604871213436127, -0.06675189733505249, 0.0027014226652681828, + 0.08348087221384048, -0.031110215932130814, -0.02220381610095501, + 0.021845143288373947, 0.03032352775335312, -0.0012008004123345017, + -0.03200481832027435, 0.049666762351989746, 0.005313111934810877, + -0.020655009895563126, 0.007201225031167269, -0.05322100222110748, + -0.03385355696082115, 0.010354285128414631, 0.04187091067433357, + 0.006058192811906338, 0.005469379480928183, 0.041591376066207886, + -0.023555509746074677, 0.043303441256284714, -0.04954344779253006, + -0.033017441630363464, -0.01149839162826538, -0.012791389599442482, + 0.001670036930590868, -0.012347050942480564, 0.0004881276981905103, + -0.031120697036385536, 0.022906621918082237, -0.050669725984334946, + 0.04269399866461754, -0.011447146534919739, -0.017906805500388145, + 
-0.06953153014183044, 0.04467186704277992, -0.04761233553290367, + -0.013187393546104431, 0.05690088868141174, -0.042590390890836716, + -0.01746809110045433, 0.020567748695611954, 0.05125907063484192, + 0.020307395607233047, 0.03492629528045654, -0.04882863909006119, + -0.03183748573064804, -0.06539574265480042, -0.01744089275598526, + -0.02758834883570671, 0.0050849285908043385, -0.00035606700112111866, + 0.023614611476659775, 0.01930573768913746, -0.05899752303957939, + -0.04627015441656113, 0.0068423328921198845, 0.03920449689030647, + -0.007687605917453766, 0.016464397311210632, 0.04807426780462265, + 0.0023120716214179993, 0.01973593607544899, 0.07794646173715591, + 0.07625434547662735, -0.03674965724349022, -0.0012999586760997772, + 0.009016714058816433, 0.03811555355787277, 0.017517905682325363, + 0.004926901776343584, -0.07054422050714493, -0.01442575640976429, + 0.01330371480435133, -0.008963101543486118, -0.009463613852858543, + 0.0017095000948756933, -0.016330784186720848, -0.017924489453434944, + -0.042089130729436874, -0.06883884966373444, -0.042998943477869034, + 0.014172191731631756, -0.0023317155428230762, -0.027441971004009247, + 0.004573931451886892, -0.07700463384389877, -0.013737251050770283, + 0.025464439764618874, -0.02619084157049656, -0.008323452435433865, + -0.03393486887216568, -0.04159104451537132, -0.004442669451236725, + -0.008337379433214664, 0.05703001841902733, -0.05177110433578491, + 0.002210760721936822, 0.005930258426815271, -0.0369490347802639, + -0.013454861007630825, -0.004840550944209099, -0.04600533843040466, + -0.010599354282021523, -0.008193885907530785, -0.029226260259747505, + -0.06824758648872375, 0.002242376795038581, -0.00545460032299161, + -0.016073163598775864, -0.02212926186621189, 0.014335459098219872, + 0.02033282071352005, -0.01998221129179001, 0.06560437381267548, + -0.007302496116608381, 0.037101101130247116, -0.015349503606557846, + -0.0149971479550004, -0.003208655398339033, 0.01065454725176096, + -0.010318529792129993, 0.005211932118982077, -0.007634020410478115, + 0.007333737798035145, 0.04658440127968788, -0.017371229827404022, + -0.02044561877846718, 0.021157968789339066, -0.005675977561622858, + -0.016465574502944946, 0.001816042698919773, -0.022665906697511673, + 0.04769016057252884, -0.02464037574827671, -0.05675514042377472, + 0.05963050201535225, -0.01688731089234352, -0.05340677872300148, + 0.010052076540887356, 0.02069842256605625, 0.028715714812278748, + 0.009125969372689724, -0.02970687672495842, -0.010313224978744984, + -0.03552298620343208, 0.04363728687167168, -0.03991911932826042, + -0.004784241318702698, -0.044753339141607285, 0.01931679993867874, + -0.056493211537599564, -0.006617037579417229, 0.035743631422519684, + -0.053424812853336334, -0.02699253521859646, -0.007951406762003899, + -0.009088664315640926, -0.018690962344408035, -0.04115553945302963, + 0.02701025828719139, 0.0571308434009552, -0.029878465458750725, + -0.03173048421740532, -0.01149672456085682, -0.0105333486571908, + -0.005241425707936287, -0.02809373289346695, 0.05968040972948074, + 0.0010212024208158255, 0.042596235871315, 0.04825957119464874, + -0.003983878996223211, 0.016225650906562805, 0.015263753943145275, + -0.023301145061850548, 0.041719190776348114, 0.028326746076345444, + 0.026445787400007248, -0.022935770452022552, 0.03078318201005459 + ], + "chunk_ind": 1 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/ai_platform", + "title": "AI Platform", + "content": "Build AI Agents powered by the knowledge and workflows specific to 
your organization.\n\nBeyond Answers\nAgents enabled by generative AI and reasoning capable models are helping teams to automate their work. Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\nDanswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\nFlexibility and Extensibility\nDanswer is open source and completely whitebox. This not only gives transparency to what happens within the system but also means that your team can directly modify the source code to suit your unique needs.", + "title_embedding": [ + 0.032763753086328506, 0.049961112439632416, 0.00777681777253747, + -0.009621717967092991, 0.03860695660114288, 0.035656899213790894, + -0.029095029458403587, -0.030549267306923866, -0.028131460770964622, + -0.023247526958584785, -0.030750803649425507, 0.04233109578490257, + 0.044790223240852356, 0.020764602348208427, -0.011113415472209454, + -0.052699606865644455, 0.05441703647375107, -0.027375519275665283, + 0.03858301043510437, 0.0015289749717339873, -0.0007870558765716851, + -0.013234086334705353, -0.008892231620848179, -0.0269540473818779, + 0.032256800681352615, 0.028824904933571815, 0.021423548460006714, + 0.0196831077337265, 0.004699843470007181, 0.01062865275889635, + -0.03573931008577347, -0.01450167316943407, 0.06177164614200592, + -0.004766061902046204, -0.011502844281494617, -0.059983331710100174, + 0.03794373199343681, 0.003160010790452361, 0.05785837396979332, + -0.016349520534276962, 0.048589278012514114, 0.03928593918681145, + -0.027400294318795204, -0.007712628226727247, 0.044047582894563675, + 0.03514353185892105, 0.050972215831279755, -0.027322333306074142, + 0.08146621286869049, -0.041862014681100845, 0.034794293344020844, + 0.0064093489199876785, -0.05552367866039276, -0.06472223997116089, + -0.0006224742392078042, 0.010324635542929173, -0.00513586075976491, + 0.006625971291214228, -0.03121061436831951, -0.02010185271501541, + 0.024356791749596596, 0.04554779455065727, -0.04365985095500946, + 0.038004688918590546, 0.026826566085219383, -0.0002007065195357427, + 0.0025419823359698057, 0.022517988458275795, 0.004520556423813105, + -0.04712541028857231, 0.042386990040540695, 0.0317973829805851, + 0.022796982899308205, 0.03537650406360626, -0.024706847965717316, + -0.05100490525364876, 0.013296891935169697, -0.027389265596866608, + -0.022103115916252136, 0.07237043976783752, 0.022473221644759178, + -0.08428098261356354, 0.0284805316478014, 0.014994120225310326, + 0.0647200271487236, -0.0013714460656046867, -0.02798375114798546, + 0.004889763426035643, -0.02891303412616253, 0.06638259440660477, + -0.015550877898931503, 0.01490933820605278, 0.03998437523841858, + 0.031558796763420105, -0.09123710542917252, 0.03090553544461727, + -0.027405250817537308, 0.0028605929110199213, -0.01660272665321827, + -0.024673976004123688, -0.03330164775252342, 0.019906772300601006, + 0.020785389468073845, -0.02234416827559471, 0.0711885541677475, + 0.010001438669860363, -0.007417359855026007, -0.03474368155002594, + 0.0117587149143219, -0.030912458896636963, -0.04288865998387337, + 0.004992801230400801, -0.011203224770724773, 0.026435980573296547, + -0.019328005611896515, -0.01772245578467846, 0.05772961303591728, + -0.018587617203593254, 0.03977040946483612, 
0.0511898435652256, + 0.02799198590219021, -0.021339384838938713, 0.016965094953775406, + 0.08415205776691437, 0.010289170779287815, -0.02373247779905796, + -0.06358940154314041, 0.03165338188409805, -0.013218379579484463, + -0.041016921401023865, 0.052579861134290695, 0.016211217269301414, + 0.012958381325006485, -0.029191715642809868, -0.0013247805181890726, + 0.05056416615843773, -0.05472686141729355, -0.05397220700979233, + 0.07864602655172348, 0.044400643557310104, 0.011529057286679745, + -0.0056294528767466545, 0.0019877285230904818, -0.01892041228711605, + 0.031235355883836746, -0.06018691137433052, 0.015224655158817768, + 0.0035034629981964827, -0.04407024383544922, 0.03802705183625221, + 0.016176624223589897, 0.05680167302489281, -0.017375409603118896, + -0.01676156371831894, -0.017084985971450806, -0.042274024337530136, + -0.07406415045261383, 0.020823167636990547, 0.04484682157635689, + -0.023108867928385735, 0.02925572544336319, 0.06840821355581284, + -0.027610015124082565, 0.04234248027205467, -0.02915036305785179, + -0.004962626378983259, -0.0017270881216973066, 0.023044373840093613, + 0.037656962871551514, 0.04789644852280617, -0.0027900487184524536, + 0.0004090967122465372, 0.014888445846736431, 0.009237252175807953, + 0.036635007709264755, 0.015078885480761528, 0.046658437699079514, + -0.025920215994119644, 0.014571646228432655, -0.053589239716529846, + 0.024663543328642845, -0.0388394258916378, 0.0037244234699755907, + 0.007817366160452366, -0.03352022543549538, -0.0609428733587265, + 0.04179045185446739, -0.05036167427897453, -0.04099080711603165, + 0.02920934371650219, -0.037300609052181244, 0.010041946545243263, + 0.025091813877224922, -0.032656773924827576, -0.05137333646416664, + -0.038329657167196274, 0.03855415806174278, 0.006781625561416149, + 0.02984003536403179, -0.06467068940401077, 0.02395613305270672, + 0.018539344891905785, -0.0718475878238678, 0.031203489750623703, + -0.057184506207704544, 0.02436862140893936, 0.02837834134697914, + -0.010054084472358227, -0.02551312930881977, 0.021066943183541298, + 0.06444599479436874, 0.01263453345745802, -0.018358737230300903, + 0.010503370314836502, -0.023012487217783928, 0.009831788949668407, + 0.0049070375971496105, -0.022574082016944885, -0.0049112942069768906, + -0.01014224998652935, 0.055648382753133774, -0.016490083187818527, + -0.012448773719370365, -0.014511270448565483, 0.027931246906518936, + 0.024195006117224693, -0.005839435383677483, 0.029669128358364105, + -0.007521398831158876, 0.03150096535682678, -0.01941276341676712, + -0.06204359978437424, 0.01095200888812542, 0.0023097621742635965, + 0.008341503329575062, 0.0100992601364851, -0.039239075034856796, + 0.04388657584786415, 0.015824418514966965, 0.06830465793609619, + 0.009663422591984272, -0.00038048860733397305, 0.035620324313640594, + -0.011668454855680466, -0.06677736341953278, 0.008154943585395813, + -0.03417421504855156, -0.022497251629829407, 0.01800542138516903, + 0.0010614683851599693, 0.00842749048024416, 0.020196812227368355, + -0.005975049454718828, 0.024395788088440895, -0.01633184403181076, + -0.004018640611320734, -0.0018627216340973973, 0.058719366788864136, + -0.024047864601016045, -0.0032275430858135223, 0.07045131176710129, + -0.03221508115530014, 0.0352499820291996, 0.02055438607931137, + 0.02973576821386814, 0.0017980994889512658, 0.05022549629211426, + 0.03819788247346878, -0.005316003691405058, 0.011116476729512215, + 0.019071733579039574, 0.03500362113118172, -0.03451540693640709, + 0.09197302162647247, 0.008307289332151413, 
0.015847783535718918, + -0.003909585066139698, -0.04707544669508934, 0.01712993159890175, + -0.026143768802285194, -0.007809836883097887, -0.02002348005771637, + -0.03528841957449913, -0.012745876796543598, 0.016280299052596092, + 0.005661313887685537, 0.022872695699334145, 0.016736241057515144, + -0.048460669815540314, 0.012391538359224796, -0.04375111311674118, + -0.06501554697751999, -0.0159616582095623, 0.009163076989352703, + -0.008098017424345016, 0.03997795283794403, 0.09088447690010071, + -0.025736957788467407, -0.01334838755428791, 0.015781259164214134, + 0.010901914909482002, 0.021588636562228203, 0.011131210252642632, + -0.034338608384132385, 0.053609222173690796, 0.018425501883029938, + 0.03827910125255585, -0.003314226632937789, 0.010824226774275303, + 0.020308859646320343, -0.11467628926038742, 0.04042372852563858, + 0.01810252107679844, 0.03511713072657585, -0.0987866222858429, + 0.016760295256972313, 0.007829226553440094, -0.011888569220900536, + 0.034833233803510666, -0.009197549894452095, 0.005588896572589874, + -0.07932842522859573, -0.02078017219901085, -0.03448954597115517, + 0.0152775589376688, 0.08626428246498108, 0.03126169368624687, + 0.04502886161208153, -0.026686420664191246, -0.028234312310814857, + 0.0049273171462118626, 0.023110508918762207, -0.08400018513202667, + 0.017200743779540062, 0.02693784609436989, 0.0036261421628296375, + -0.018591655418276787, 0.005189367104321718, 0.0002512435312382877, + -0.01673535816371441, 0.06507309526205063, 0.02960938587784767, + 0.0194547139108181, -0.045088544487953186, -0.01410599984228611, + -0.001771911047399044, 0.042333200573921204, -0.015243434347212315, + 0.027360277250409126, -0.02644488774240017, -0.059026844799518585, + 0.0013204477727413177, -0.005272903945297003, -0.03697441890835762, + -0.03736754506826401, 0.06495915353298187, 0.004548369906842709, + 0.004532824270427227, -0.005509661976248026, 0.013331729918718338, + 0.005671144928783178, -0.043852198868989944, 0.06886028498411179, + -0.0020801422651857138, 0.014272121712565422, -0.02358032390475273, + 0.010091368108987808, -0.013035510666668415, -0.009768063202500343, + -0.024086249992251396, 0.04728090390563011, -0.024031780660152435, + 0.032426923513412476, 0.06455196440219879, 0.08759471774101257, + 0.009270765818655491, -0.0936349406838417, -0.012462696991860867, + -0.019188350066542625, -0.06805568188428879, 0.01794586144387722, + -0.0007348881918005645, 0.0024105070624500513, -0.016566181555390358, + 0.012622764334082603, -0.03900640457868576, 0.010342570021748543, + 0.011543489061295986, 0.01152091845870018, -0.05232607573270798, + 0.004903953988105059, -0.05708310753107071, -0.04076048359274864, + 0.016818160191178322, -0.020741824060678482, 0.01609313301742077, + -0.022479360923171043, 0.03654901310801506, 0.022170664742588997, + 0.01575297676026821, -0.011484816670417786, -0.025103436782956123, + 0.05906060338020325, 0.02779274433851242, 0.028078753501176834, + 0.04629473015666008, -0.005719225853681564, -0.06190178170800209, + 0.006866101641207933, -0.002305209171026945, 0.03215618431568146, + 0.007546067703515291, -0.02738751657307148, -0.04539818689227104, + 0.04683874174952507, -0.0208493173122406, 0.03900844231247902, + -0.027456291019916534, -0.028509290888905525, 0.013289637863636017, + 0.0017003221437335014, -0.0198791716247797, 0.014913729391992092, + 0.005401032045483589, -0.04071260988712311, 0.02060793526470661, + -0.003016189206391573, 0.03800947219133377, -0.019319988787174225, + -0.024961907416582108, -0.02498740889132023, 
-0.04191872850060463, + -0.042030803859233856, 0.013421737588942051, -0.045663248747587204, + 0.024831216782331467, 0.06314653903245926, 0.013705547899007797, + 0.025637097656726837, -0.006122317630797625, 0.0041285231709480286, + 0.050409767776727676, 0.007197089493274689, -0.01965370774269104, + -0.04048306494951248, 0.11998444050550461, 0.029942067340016365, + 0.02599455416202545, -0.057833291590213776, 0.0033883019350469112, + 0.00468824477866292, -0.01925582066178322, -0.01766190119087696, + 0.011122050695121288, 0.04823627695441246, 0.018773270770907402, + -0.020368080586194992, -0.009206349961459637, 0.031074542552232742, + 0.02497885189950466, 0.0031681342516094446, 0.015077338553965092, + -0.022211533039808273, 0.058754149824380875, -0.016073331236839294, + -0.014968045987188816, -0.0051240865141153336, -0.06383436918258667, + -0.0280417762696743, 0.013401271775364876, -0.006949563976377249, + -0.009025825187563896, 0.03748825564980507, 0.04152849316596985, + -0.03703063353896141, 0.0006073106196708977, -0.019878407940268517, + -0.059219732880592346, 0.03231174871325493, -0.012458872981369495, + -0.0006862205918878317, -0.029703414067626, -0.011737367138266563, + -0.01565374620258808, -0.002873011166229844, 0.035379018634557724, + -0.025712305679917336, 0.027225548401474953, -0.011701708659529686, + -0.020186487585306168, -0.013381940312683582, 0.044779565185308456, + 0.027129901573061943, -0.03770675137639046, -0.06656532734632492, + -0.04852313920855522, -0.07922673970460892, 0.042464420199394226, + 0.08760115504264832, -0.01756269298493862, 0.025902874767780304, + -0.049739014357328415, 0.015325409360229969, 0.04406426474452019, + 0.012947683222591877, -0.022557666525244713, -0.033376943320035934, + -0.12034522742033005, 0.019998058676719666, 0.04397791251540184, + 0.024618806317448616, -0.013922464102506638, 0.031511057168245316, + 0.03906194865703583, 0.011382625438272953, -0.027103818953037262, + 0.04971625655889511, 0.051205385476350784, -0.08501561731100082, + 0.011972213163971901, -0.018331818282604218, -0.00884521659463644, + -0.0015008534537628293, 0.0827648937702179, -0.03979771211743355, + 0.0015674568712711334, -0.014266063459217548, -0.03932151570916176, + 0.04269920662045479, -0.059784602373838425, 0.01841970533132553, + 0.06251460313796997, -0.02819698490202427, -0.040344759821891785, + 0.0010407248046249151, -0.034333355724811554, -0.029237672686576843, + -0.0001084851028281264, 0.06710729002952576, 0.019469408318400383, + -0.01640215329825878, 0.019526075571775436, 0.007778842933475971, + 0.03379968926310539, 0.030870657414197922, -0.059688691049814224, + -0.05436835065484047, 0.053333111107349396, 0.004061849322170019, + -0.08632408827543259, 0.014255499467253685, -0.05555962026119232, + 0.010840730741620064, -0.05179913341999054, -0.007342956960201263, + 0.0011719957692548633, 0.022990427911281586, 0.013041576370596886, + -0.026316920295357704, -0.022087475284934044, -0.02786962315440178, + 0.013592005707323551, 0.021783264353871346, -0.059460774064064026, + -0.029133567586541176, -0.06166587024927139, -0.055512115359306335, + 0.004256486427038908, 0.0341678261756897, 0.011773993261158466, + -0.029188869521021843, -0.021346861496567726, -0.036212995648384094, + 0.025272972881793976, 0.02215636521577835, -0.03782811760902405, + 0.01701144315302372, -0.05073560029268265, -0.06574195623397827, + 0.012947561219334602, 0.003303218400105834, -0.05047185719013214, + 0.010198806412518024, -0.04323785379528999, -0.04194899648427963, + 0.02726336568593979, 
-0.015397109091281891, -0.02849482372403145, + 0.058862827718257904, -0.0026129265315830708, 0.006432596128433943, + 0.04382907226681709, -0.05114292353391647, -0.02147330716252327, + -0.05826929211616516, 0.046473387628793716, -0.09205549210309982, + 0.04540986940264702, -0.006234755739569664, -0.05360054224729538, + -0.012155161239206791, -0.030249077826738358, 0.02822766825556755, + 0.013851269148290157, 0.027002329006791115, 0.09613272547721863, + 0.035666726529598236, 0.03504599630832672, -0.00038134161150082946, + -0.06922309845685959, 0.016433153301477432, -0.031455833464860916, + -0.018132444471120834, -0.02008064091205597, -0.015955988317728043, + -0.04022971913218498, -0.00230028061196208, -0.06941505521535873, + 0.0230435561388731, -0.026967540383338928, 0.0354134738445282, + -0.08307641744613647, 0.055718302726745605, 0.0012352125486359, + 0.017340224236249924, 0.02709241770207882, -0.009195402264595032, + 0.020474854856729507, 0.0016901030903682113, 0.05093026161193848, + -0.02238425612449646, 0.011796950362622738, -0.007241291459649801, + -0.0334753580391407, -0.04778272658586502, -0.030247407034039497, + -0.012979192659258842, 0.004056413192301989, 0.015001167543232441, + 0.06737781316041946, 0.028164206072688103, 0.0028011424001306295, + -0.049282923340797424, 0.06260383874177933, 0.04237203299999237, + -0.026161646470427513, 0.02427232824265957, 0.021224258467555046, + 0.002963172970339656, -0.049155037850141525, 0.033326923847198486, + 0.07168576121330261, -0.04409810155630112, -0.012802177108824253, + 0.011941076256334782, 0.005057428497821093, -0.04857957363128662, + -0.011230324395000935, 0.009986268356442451, 0.010389930568635464, + -0.013448627665638924, -0.04319113492965698, -0.02839748188853264, + 0.011157489381730556, 0.015462666749954224, -0.014774681068956852, + -0.035400133579969406, 0.003983446396887302, -0.06980624049901962, + -0.0019868735689669847, -0.0014860559022054076, -0.017261963337659836, + 0.03138411417603493, -0.07367079704999924, 0.025024767965078354, + 0.037335801869630814, 0.04612639173865318, -0.018027080222964287, + -0.015578734688460827, -0.05632679536938667, -0.01690700650215149, + 0.023824671283364296, 0.003364108270034194, -0.0478903129696846, + 0.014160525053739548, 0.0023307709489017725, 0.028807908296585083, + 0.0053710732609033585, -0.007223619148135185, -0.09570229798555374, + 0.013001752085983753, -0.03882845118641853, -0.018106481060385704, + -0.08351759612560272, -0.01296163722872734, -0.0017098417738452554, + -0.042986027896404266, 0.02120766043663025, -0.00032761419424787164, + 0.059994783252477646, 0.00795682892203331, 0.025746053084731102, + 0.026430919766426086, 0.10314885526895523, -0.042013708502054214, + -0.01044819038361311, -0.06457454711198807, 0.04287077486515045, + -0.0233222134411335, -0.011595506221055984, 0.008520099334418774, + 0.021662304177880287, 0.04874734953045845, 0.03213977813720703, + -0.03502868860960007, -0.013689175248146057, 0.007175855804234743, + -0.06394322961568832, -0.03230760619044304, -0.0520993173122406, + 0.03424723073840141, -0.01675051636993885, -0.04967552423477173, + 0.03324288874864578, -0.03822193667292595, -0.015012933872640133, + -0.02746376395225525, -0.015637405216693878, 0.040449269115924835, + -0.0027442676946520805, 0.008192671462893486, 0.013573664240539074, + -0.0065663764253258705, 0.07001614570617676, -0.00289558875374496, + 0.004224210977554321, -0.05637960880994797, -0.010168599896132946, + -0.02271331660449505, 0.0014612390659749508, 0.06994854658842087, + 
-0.00733678275719285, -0.0025255896616727114, 0.03514084219932556, + 0.02634606696665287, -0.016171403229236603, -0.02692556194961071, + 0.015410004183650017, 0.07382199913263321, 0.01444800104945898, + -0.020071715116500854, 0.030701540410518646, 0.0056877885945141315, + 0.011047931388020515, -0.05641033127903938, 0.03570398688316345, + -0.06379767507314682, 0.09488129615783691, 0.015704551711678505, + -0.0008733674185350537, 0.009907273575663567, 0.004910382442176342, + 0.050873469561338425, 0.01800096221268177, -0.027450138702988625, + -0.001498246449045837, -0.027504686266183853, -0.019632702693343163 + ], + "content_embedding": [ + -0.0417482890188694, 0.0512668639421463, 0.0012354102218523622, + -0.035204555839300156, 0.028333576396107674, 0.006138786673545837, + 0.017678435891866684, 0.004378852900117636, -0.022564459592103958, + -0.03274708241224289, -0.06855575740337372, 0.03446828946471214, + 0.03136003389954567, -0.016096506267786026, -0.007832110859453678, + 0.01546874176710844, 0.025302864611148834, -0.01542437169700861, + 0.009685760363936424, 0.025153761729598045, 0.01136286836117506, + -0.03678102046251297, 0.01742858625948429, -0.04800569638609886, + 0.052324045449495316, -0.0188713688403368, -0.017203466966748238, + 0.04401639476418495, -0.05147295445203781, -0.005816930439323187, + 0.04151167348027229, 0.0020627069752663374, 0.008849645033478737, + -0.03293370082974434, -0.030744211748242378, -0.025762831792235374, + 0.07024409621953964, -0.029683783650398254, -0.02081390842795372, + 0.034864746034145355, 0.057659171521663666, 0.009455090388655663, + -0.001964752795174718, -0.028249403461813927, 0.045469045639038086, + 0.010203365236520767, 0.039163172245025635, -0.01693413034081459, + 0.03357663378119469, -0.016916338354349136, 0.007125346455723047, + -0.02135808765888214, -0.007920235395431519, -0.014657854102551937, + -0.0023566402960568666, 0.026274284347891808, -0.0449351891875267, + 0.006130301393568516, 0.0021915079560130835, -0.05063489079475403, + 0.010083623230457306, 0.03967271372675896, -0.047972869127988815, + 0.011878297664225101, -0.02869013138115406, -0.06947814673185349, + 0.012776396237313747, -0.022227533161640167, -0.021391209214925766, + -0.0071424199268221855, -0.010884602554142475, 0.0022353651002049446, + 0.04208262637257576, 0.04585080221295357, -0.028864840045571327, + 0.014383035711944103, 0.0006865983596071601, -0.003945623058825731, + -0.024596840143203735, 0.02039221115410328, 0.05236830934882164, + -0.06231372430920601, 0.0006878590793348849, 0.005045242141932249, + 0.04543100297451019, -0.022787010297179222, -0.0323825404047966, + 0.0060617332346737385, -0.0009496629354543984, 0.1132081151008606, + 0.021422259509563446, -0.008516624569892883, 0.011941758915781975, + 0.06050655618309975, -0.06464048475027084, 0.0715012326836586, + -0.04892478510737419, -0.014262699522078037, 0.02197115309536457, + -0.02258905954658985, -0.03329572454094887, 0.0733470693230629, + 0.01521797850728035, -0.02922399342060089, 0.05403874069452286, + -0.0024076823610812426, -0.005156014114618301, -0.0004758739669341594, + -0.0009397549438290298, 0.022768890485167503, -0.06273472309112549, + -0.013565277680754662, -0.038060612976551056, 0.03901419788599014, + -0.025413114577531815, -0.031085047870874405, 0.062427643686532974, + -0.05666875094175339, 0.018170330673456192, 0.03758049011230469, + -0.005046131554991007, -0.03363005071878433, 0.0071977670304477215, + -0.007294844835996628, 0.04950850084424019, 0.05829211696982384, + -0.028599455952644348, 
-0.00011273028212599456, -0.027114247903227806, + -0.04813091829419136, 0.03546503558754921, -0.0017865434056147933, + -0.06174362823367119, 0.015936153009533882, 0.05498664081096649, + 0.06208323314785957, -0.06043750047683716, -0.07075081020593643, + 0.03265148773789406, 0.01779918558895588, -0.004657578654587269, + 0.013401461765170097, -0.031561195850372314, -0.010674675926566124, + 0.02138788439333439, -0.059565648436546326, 0.003320328425616026, + -0.0016824831254780293, -0.021733451634645462, 0.048551496118307114, + -0.003053524298593402, 0.011647860519587994, -0.0014629715587943792, + 0.059308186173439026, 0.0077448501251637936, -0.01239799801260233, + -0.039145924150943756, 0.016731932759284973, 0.062229979783296585, + -0.029277512803673744, 0.05666857957839966, 0.021947506815195084, + -0.027742277830839157, 0.05703498050570488, -0.02114000730216503, + -0.0011631653178483248, 0.04833010211586952, 0.013655254617333412, + 0.042764052748680115, 0.04422000050544739, -0.010796190239489079, + -0.0081519465893507, 0.0005064443103037775, -0.007894535548985004, + 0.01271637249737978, 0.0280605535954237, 0.023104701191186905, + -0.05545410141348839, 0.03579716384410858, -0.01674344576895237, + 0.011995082721114159, -0.04967891052365303, 0.018647905439138412, + -0.0025427585933357477, -0.05248319357633591, -0.004207089077681303, + 0.0029677890706807375, -0.08436138927936554, 0.011933421716094017, + -0.046401966363191605, 0.004982754122465849, -0.03336072713136673, + 0.007464535068720579, -0.02536672353744507, -0.02103051170706749, + -0.0247516930103302, 0.03470923379063606, 0.008188062347471714, + 0.04575216770172119, -0.04027656093239784, -0.028462760150432587, + -0.00641157990321517, -0.1032537892460823, 0.015407266095280647, + -0.017259350046515465, 0.057880233973264694, 0.02970932051539421, + 0.003135938895866275, 0.04052228853106499, 0.006307818461209536, + 0.09373948723077774, 0.012201530858874321, 0.01518191210925579, + 0.005055180750787258, -0.00017229207151103765, -0.008860277943313122, + -0.0009321855613961816, -0.024702103808522224, -0.02220877818763256, + 0.018036337569355965, 0.0461902916431427, -2.3178456103778444e-5, + -0.021639293059706688, -0.009496558457612991, -0.0069047678261995316, + -0.005369818769395351, -0.038412243127822876, 0.0376049242913723, + -0.02614714205265045, 0.010913437232375145, -0.02533271722495556, + -0.08659890294075012, 0.05744393169879913, 0.012141053564846516, + 0.060547053813934326, -0.0005550469504669309, -0.01619824767112732, + -0.0022558700293302536, 0.01814994402229786, 0.06237058714032173, + 0.055474210530519485, -0.02512912079691887, 0.010455053299665451, + -0.023948650807142258, -0.07459914684295654, -0.006999264471232891, + -0.006154322065412998, -0.014305580407381058, -0.042501892894506454, + 0.04605546593666077, -0.007378050591796637, 0.013837042264640331, + 0.005601659417152405, -0.02454686351120472, 0.0228840634226799, + -0.010892537422478199, 0.0011435768101364374, 0.027678560465574265, + 0.015353331342339516, -0.03731193020939827, 0.05862969905138016, + -0.02842552959918976, 0.03124571032822132, 0.02315538190305233, + 0.012950807809829712, 0.026965327560901642, 0.009465894661843777, + -0.010829408653080463, -0.008594458922743797, 0.014982074499130249, + 0.021298887208104134, -0.018343189731240273, 0.01739460788667202, + 0.07865084707736969, 0.02205476351082325, 0.015017225407063961, + -0.011981618590652943, -0.02248695306479931, 0.017631210386753082, + -0.02025180496275425, -0.05385996773838997, -0.05477667227387428, + 
-0.042989905923604965, 0.004830287769436836, 0.03188111260533333, + 0.048253823071718216, 0.0027890182100236416, -0.01684093475341797, + 0.029284454882144928, -0.014463928528130054, 0.029999280348420143, + 0.013334669172763824, -0.030123639851808548, 0.007939296774566174, + -0.015909308567643166, 0.03652086481451988, 0.043923694640398026, + -0.03349898010492325, 0.016639679670333862, 0.007404185365885496, + -0.023147881031036377, 0.004568914417177439, 0.008112411946058273, + -0.021877270191907883, -0.0072467140853405, -0.024027734994888306, + 0.022522028535604477, 0.03248016908764839, 0.04624137282371521, + 0.03288194164633751, -0.0706077441573143, 0.00647892989218235, + -0.013711459934711456, -0.00910367164760828, -0.06070556864142418, + 0.013195404782891273, 0.02949078381061554, -0.04314878582954407, + 0.03952472656965256, -0.039313577115535736, 0.01958983577787876, + -0.04745025932788849, 0.011169768869876862, -0.07735665887594223, + 0.012919869273900986, 0.08162245899438858, 0.04961969330906868, + 0.02261139266192913, -0.01081178616732359, -0.022089937701821327, + 0.036029793322086334, 0.07065453380346298, -0.050287678837776184, + 0.009584897197782993, -0.005743148736655712, -0.03252799063920975, + -0.029911693185567856, -0.0031824579928070307, -0.0022875897120684385, + 0.010553253814578056, -0.005088122095912695, -0.019103137776255608, + -0.029758833348751068, -0.03040270134806633, -0.05643913522362709, + -0.0183008573949337, 0.0036066959146410227, -0.010227258317172527, + 0.03830184414982796, 0.008860573172569275, -0.04022029787302017, + 0.016092464327812195, -0.007906369864940643, -0.0206406619399786, + -0.01545781921595335, 0.011720928363502026, 0.030331697314977646, + 0.020348263904452324, 0.013491041027009487, 0.015015012584626675, + 0.0757412239909172, -0.013692211359739304, 0.0554184690117836, + -0.03535052016377449, 0.027659131214022636, 0.062012042850255966, + 0.05365491285920143, 0.02611374668776989, 0.03400697186589241, + -0.0187185350805521, 0.030734656378626823, -0.04378894716501236, + 0.04222285747528076, 0.06321597844362259, 0.0926889032125473, + 0.06395434588193893, -0.045033425092697144, -0.02227518893778324, + -0.018914448097348213, -0.024137776345014572, 0.06653360277414322, + 0.03000609017908573, 0.016536613926291466, -0.05106441304087639, + 0.009556908160448074, -0.003165673930197954, -0.02989509329199791, + -0.008909299969673157, -0.002428715117275715, -0.038857024163007736, + -0.014716073870658875, -0.02291145734488964, -0.03815469145774841, + 0.018349675461649895, -0.001724440953694284, 0.024225711822509766, + -0.038882117718458176, 0.013145080767571926, 0.013105038553476334, + 0.033219680190086365, -0.04639777913689613, -0.044315461069345474, + -0.012929159216582775, 0.003259071381762624, 0.012331360019743443, + -1.7462354662711732e-5, -0.02317662350833416, -0.042660780251026154, + 0.001802539685741067, -0.041100095957517624, 0.04925210401415825, + 0.047337062656879425, -0.01313596311956644, -0.048633869737386703, + 0.06100405752658844, -0.024509647861123085, 0.06903672963380814, + 0.026338376104831696, -0.05955340713262558, -0.013524221256375313, + -0.007072206120938063, -0.0004094979085493833, 0.02331911027431488, + 0.006079655606299639, -0.027727166190743446, 0.01562763936817646, + 0.011910749599337578, -0.010385152883827686, 0.02091721072793007, + -0.030102524906396866, -0.014945127069950104, 0.007444288115948439, + -0.009210431948304176, 0.01587914675474167, -0.07968660444021225, + 0.034870292991399765, 0.04423568770289421, 0.05101220682263374, + 
-0.0018310192972421646, 0.04378198832273483, 0.008875945582985878, + -0.018744593486189842, -0.010748499073088169, -0.05976865068078041, + -0.024797234684228897, 0.02921747788786888, 0.08715134114027023, + -0.014189728535711765, -0.05772070586681366, -0.00013612159818876535, + 0.034182313829660416, 0.02940666675567627, -0.007551911287009716, + 0.005196248646825552, 0.09129910916090012, 0.03463520109653473, + -0.028487645089626312, -0.054952532052993774, -0.019425109028816223, + 0.04267658665776253, -0.010463536716997623, -0.022979862987995148, + 0.003282969817519188, 0.032446060329675674, -0.03184691444039345, + -0.01494336687028408, 0.027125416323542595, -0.03301938623189926, + -0.021615097299218178, 0.01919432356953621, -0.018361948430538177, + 0.0440165251493454, 0.018785251304507256, 0.05379289388656616, + -0.06103529781103134, -0.04040123522281647, 0.06767034530639648, + -0.04255857691168785, 0.059002116322517395, 0.06269264966249466, + 0.04158494248986244, 0.016211502254009247, -0.046843864023685455, + -0.028105739504098892, 0.007073850836604834, 0.052667438983917236, + 0.00735336821526289, 0.017733542248606682, -0.023568013682961464, + -0.007077949587255716, 0.01566276140511036, 0.048224493861198425, + -0.0003875133115798235, -0.046327680349349976, -0.08656812459230423, + -0.025123324245214462, -0.034193720668554306, 0.03014206700026989, + 0.05021859332919121, -0.0026385232340544462, 0.023799851536750793, + -0.009769299067556858, -0.01290298905223608, 0.004491783678531647, + -0.012223453260958195, -0.00033618492307141423, 0.0233011394739151, + -0.08696971833705902, 0.058488454669713974, 0.000664825493004173, + 0.039359770715236664, 0.014214815571904182, 0.03424450755119324, + 0.05592956021428108, 0.016471324488520622, -0.059732481837272644, + 0.06536833196878433, 0.024387361481785774, -0.10856911540031433, + 0.06828989833593369, 0.0036337117198854685, 0.05830007046461105, + 0.016170067712664604, 0.013002794235944748, -0.11607159674167633, + 0.0019640743266791105, 0.026027854532003403, -0.028382647782564163, + 0.041647832840681076, -0.005042455159127712, -0.0010717103723436594, + 0.09709432721138, 0.018342554569244385, -0.03699033707380295, + 0.03425338864326477, -0.07419072836637497, -0.05410637706518173, + 0.013680101372301579, -0.007827416993677616, -0.007252392824739218, + 0.016606653109192848, 0.015743359923362732, -0.007168450392782688, + 0.030557913705706596, 0.010715801268815994, -0.03387424722313881, + -0.059598296880722046, 0.061636339873075485, -0.024311335757374763, + -0.08930302411317825, 0.04300369694828987, -0.052911426872015, + 0.048922792077064514, -0.07488273829221725, 0.0253959558904171, + -0.057005614042282104, -0.010324039496481419, -0.03382004797458649, + 0.01331509929150343, -0.0060559725388884544, 0.021830739453434944, + 0.0004554805636871606, 0.06132755056023598, -0.04885099083185196, + 0.01681993156671524, -0.09306737780570984, -0.03891037777066231, + 0.03394221141934395, 0.03513973951339722, 9.119489550357684e-5, + -0.009680265560746193, -0.018936453387141228, 0.002022465690970421, + 0.03725491091609001, -0.007916543632745743, -0.05493376404047012, + 0.06674706935882568, -0.04586830735206604, -0.05310272425413132, + -0.002019708277657628, -0.03419820964336395, -0.08405481278896332, + 0.044505130499601364, -0.022271662950515747, 0.008551442995667458, + 0.024632176384329796, -0.057307109236717224, -0.025764044374227524, + 0.05102856457233429, -0.01996302232146263, -0.003182733431458473, + 0.010233199223876, -0.005380541551858187, -0.033068619668483734, + 
-0.038329556584358215, 0.041149478405714035, -0.038474202156066895, + 0.03263046592473984, 0.043984752148389816, -0.06405626237392426, + -0.04378855600953102, -0.017724154517054558, -0.00023550254991278052, + -0.006340715568512678, 0.008379276841878891, 0.06068692356348038, + -0.023048071190714836, 0.04665880277752876, -0.026433007791638374, + -0.04106089845299721, 0.008102682419121265, -0.02919689752161503, + -0.002803279785439372, 0.00115284975618124, -0.007610488682985306, + -0.009425876662135124, 0.014759095385670662, -0.07407978177070618, + 0.040522702038288116, -0.028428586199879646, -0.015484650619328022, + -0.08971428871154785, 0.04541322588920593, 0.01523630227893591, + -0.02159152925014496, 0.06348283588886261, -0.0762605369091034, + 0.008550439029932022, -0.0010396456345915794, 0.09191705286502838, + 0.01919129304587841, 0.012690366245806217, -0.032078325748443604, + -0.03879883140325546, -0.06354136019945145, -0.016241934150457382, + -0.013353055343031883, 0.013797549530863762, 0.03027600795030594, + 0.05205754190683365, 0.018223397433757782, -0.02529638260602951, + -0.012619049288332462, 0.05183516442775726, 0.04441876709461212, + -0.0014240458840504289, -0.004662310238927603, 0.007740246132016182, + -0.023739585652947426, 0.008351752534508705, 0.04186442866921425, + 0.06846421957015991, -0.03302106633782387, -0.019061105325818062, + 0.03688846528530121, 0.027123648673295975, -0.008548760786652565, + 0.006452383007854223, -0.05057734623551369, 0.009094422683119774, + -0.003088460536673665, -0.01042612362653017, 0.03579631447792053, + -0.008917502127587795, 0.010444638319313526, -0.023657843470573425, + -0.03254014626145363, -0.0009533764678053558, 0.00684812106192112, + 0.01948300190269947, 0.00943666510283947, -0.010625068098306656, + 0.02385423146188259, -0.05145318806171417, -0.03215208277106285, + 0.007343036122620106, 0.01264273189008236, 0.036680057644844055, + 0.022073568776249886, -0.06296181678771973, -0.008569572120904922, + -0.012322318740189075, 0.021164294332265854, -0.051289938390254974, + 0.0010486009996384382, 0.0021613994613289833, 0.030476249754428864, + 0.01092084776610136, -0.054112132638692856, -0.06015515327453613, + 0.023149874061346054, 0.03427460417151451, -0.019571471959352493, + -0.07272381335496902, 0.009794066660106182, -0.04319072142243385, + -0.04802769050002098, -0.0024639740586280823, 0.01276618055999279, + 0.030480578541755676, -0.007069519720971584, 0.026940204203128815, + -0.013154259882867336, 0.05308559536933899, 0.0008981192368082702, + -0.012286764569580555, -0.010251149535179138, 0.056114789098501205, + -0.027719540521502495, -0.06385437399148941, -0.01707690954208374, + 0.03182663023471832, 0.04629168286919594, 0.003105542855337262, + -0.035991836339235306, 0.030695278197526932, -0.01389816403388977, + 0.005694018676877022, 0.024141885340213776, -0.056052565574645996, + 0.07325056940317154, -0.052376989275217056, -0.0827232152223587, + 0.07083716243505478, -0.011363365687429905, -0.049301743507385254, + 0.01282532885670662, 0.029815899208188057, 0.0025471607223153114, + 0.014735412783920765, -0.017929038032889366, -0.017711512744426727, + -0.03859850764274597, 0.020923320204019547, -0.024983150884509087, + -0.009905354119837284, -0.033428773283958435, 0.0033264297526329756, + -0.057740144431591034, 0.0011588952038437128, 0.05510108917951584, + -0.042752135545015335, 0.00023805272940080613, 0.02830038219690323, + -0.0023612258955836296, 0.004450241569429636, -0.09065061807632446, + 0.05099336430430412, 0.050836946815252304, 
0.002225160365924239, + -0.02620827779173851, -0.0017080202233046293, -0.006798254791647196, + -0.06735426932573318, -0.012160968966782093, 0.0198799017816782, + -0.005785979796200991, 0.030539529398083687, 0.03791653737425804, + -0.01715696230530739, -0.013931870460510254, -0.026593970134854317, + 0.015033211559057236, 0.04166087508201599, -0.0009548550006002188, + 0.002680635079741478, -0.005521025042980909, -0.005426781252026558 + ], + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/customer_support", + "title": "Customer Support", + "content": "Help your customer support team instantly answer any question across your entire product.\n\nAI Enabled Support\nCustomer support agents have one of the highest breadth jobs. They field requests that cover the entire surface area of the product and need to help your users find success on extremely short timelines. Because they're not the same people who designed or built the system, they often lack the depth of understanding needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team optimize the speed and quality of these critical customer-facing interactions.\n\nThe Importance of Context\nThere are two critical components of AI copilots for customer support. The first is that the AI system needs to be connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as pull requests in a code repository. The second critical component is the ability of the AI system to break down difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able to chat back and forth with the system to build a better understanding.\n\nDanswer takes care of both of these. 
The system connects up to over 30+ different applications and the knowledge is pulled in constantly so that the information access is always up to date.", + "title_embedding": [ + 0.029702378436923027, 0.04087577760219574, 0.011759690940380096, + -0.015867559239268303, 0.051922496408224106, 0.04818926751613617, + -0.051036059856414795, -0.000768028199672699, -0.011680016294121742, + -0.04731091484427452, -0.06819964945316315, 0.02706378884613514, + 0.012011447921395302, -0.0162456426769495, 0.00561815220862627, + -0.060200855135917664, 0.011203722096979618, 0.011659571900963783, + 0.005986262112855911, 0.010378050617873669, -0.02058245800435543, + -0.007857420481741428, -0.011501888744533062, -0.06029190123081207, + -0.005981787107884884, 0.02316387929022312, -0.003978169988840818, + 0.014874234795570374, -0.02605351060628891, 0.015183809213340282, + 0.024635987356305122, 0.009090029634535313, 0.02292451448738575, + -0.051132138818502426, -0.01627650111913681, -0.04312199354171753, + 0.055974528193473816, -0.007541665807366371, 0.05875417962670326, + 0.014909300021827221, 0.04143260419368744, 0.013145966455340385, + -0.0019000619649887085, -0.014630978927016258, 0.036174625158309937, + -0.009382152929902077, 0.0129568912088871, -0.01105422992259264, + 0.05389830842614174, -0.05418487638235092, -0.019787268713116646, + 0.021062593907117844, -0.004908672533929348, -0.05889728665351868, + 0.005012272857129574, -0.032561108469963074, 0.045741673558950424, + 0.0023587732575833797, -0.029787305742502213, -0.016032742336392403, + -7.657184141862672e-6, 0.06451895087957382, -0.061427876353263855, + 0.05008486285805702, 0.013032016344368458, -0.008349837735295296, + -0.023183466866612434, 0.028445789590477943, 0.013328451663255692, + 0.002017433987930417, 0.08471205830574036, 0.02884836308658123, + 0.03370589017868042, 0.02926166169345379, -0.019738871604204178, + -0.06608780473470688, -0.024426797404885292, -0.008213629014790058, + -0.017787031829357147, 0.02045559138059616, 0.04079979658126831, + -0.03456271439790726, 0.011362932622432709, -0.022804994136095047, + 0.06335528194904327, 0.007143533322960138, -0.038967471569776535, + 0.01895124651491642, -0.021929487586021423, 0.020517535507678986, + -0.005601715296506882, 0.048027630895376205, 0.05165044218301773, + 0.021509770303964615, -0.05171488970518112, 0.022959010675549507, + 0.008313633501529694, -0.033466871827840805, -0.00873673614114523, + -0.07275433838367462, -0.01826190948486328, -0.0008458571974188089, + 0.03251243755221367, 0.0027185927610844374, 0.03351648896932602, + 0.050253089517354965, 0.03931482136249542, 0.011611105874180794, + 0.0006847226177342236, -0.018391452729701996, -0.04876922070980072, + 0.0032669915817677975, 0.002616048092022538, 0.018911289051175117, + 0.0035516773350536823, 0.04444553330540657, 0.0874137431383133, + -0.06154795363545418, -0.0007547208806499839, 0.05003364384174347, + 0.047423556447029114, 0.010614278726279736, -0.002085448009893298, + 0.028551766648888588, -0.00320938928052783, -0.0028788738418370485, + -0.032166119664907455, 0.032534729689359665, 0.05165233090519905, + -0.02726086415350437, 0.032850414514541626, 0.021426480263471603, + 0.008582738228142262, -0.05970913544297218, -0.013117690570652485, + 0.0327497161924839, -0.04084235802292824, -0.034347862005233765, + 0.08484583348035812, -0.007416227832436562, 0.0817202478647232, + 0.013180759735405445, 0.01752362586557865, -0.011748716235160828, + 0.006734360009431839, -0.05940733850002289, -0.01464597787708044, + 0.021172411739826202, 
0.015961064025759697, 0.013145568780601025, + -0.03292446956038475, 0.05433695763349533, -0.04581886902451515, + -0.011024394072592258, -0.013903305865824223, -0.04525483399629593, + -0.009453397244215012, -0.01541796326637268, 0.024864252656698227, + 0.016035286709666252, 0.04684724658727646, 0.04711056500673294, + -0.08100881427526474, 0.07197079807519913, -0.00942996796220541, + -0.04369058832526207, 0.02740531787276268, 0.027486257255077362, + 0.035378992557525635, 0.05205152928829193, 0.000229495475650765, + -0.015225332230329514, 0.018225882202386856, 0.019075268879532814, + 0.05653514340519905, 0.00206256122328341, 0.04172705486416817, + 0.025263279676437378, 0.02995399944484234, -0.02344629354774952, + -0.00580610940232873, -0.03100346215069294, 0.024410588666796684, + 0.05101815611124039, -0.044721707701683044, -0.03469102457165718, + 0.015877151861786842, -0.05606666952371597, -0.04264648258686066, + 8.914931095205247e-5, 0.005455580540001392, 0.02580810897052288, + 0.03810019791126251, -0.005504349246621132, -0.03873325511813164, + -0.06938246637582779, 0.06987633556127548, -0.0071804821491241455, + 0.025614110752940178, -0.04443173483014107, 0.014129945077002048, + 0.04964412376284599, -0.06445024162530899, 0.03494735807180405, + 0.00042216022848151624, 0.03607922047376633, 0.014481625519692898, + -0.03348603844642639, 0.04130083695054054, 0.03306325525045395, + 0.0670546144247055, -0.004632187075912952, -0.02899966388940811, + 0.028892826288938522, -0.02801397442817688, -0.009044334292411804, + -0.0299741979688406, 0.006851669866591692, -0.051097121089696884, + 0.041360609233379364, 0.040426693856716156, -0.04066675901412964, + 0.01568625681102276, -0.016009517014026642, -0.004742924124002457, + 0.048653047531843185, -0.010993007570505142, 0.039591673761606216, + 0.0007891886634752154, 0.0280364528298378, -0.029024146497249603, + -0.07559369504451752, 0.020083241164684296, -0.02160985954105854, + 0.00466573191806674, 0.017442384734749794, -0.02421753853559494, + 0.05211988091468811, 0.0016645005671307445, 0.08051992207765579, + 0.02786155417561531, 0.022991932928562164, 0.04568661376833916, + -0.03650546818971634, -0.061528630554676056, 0.026744728907942772, + -0.029581749811768532, -0.04499091953039169, 0.002152943518012762, + -0.015043909661471844, 0.047530874609947205, 0.041445743292570114, + 0.041881438344717026, 0.03457123413681984, -0.023201758041977882, + -0.05317879468202591, -0.020216727629303932, 0.050812073051929474, + 0.0008769077248871326, -0.01832399144768715, 0.026449931785464287, + -0.00376958679407835, 0.04373340308666229, -0.015004824846982956, + 0.02940281480550766, -0.028869349509477615, -0.02610083483159542, + -0.0077619897201657295, 0.03147227317094803, -0.0032539069652557373, + 0.02559952437877655, 0.02357475273311138, 0.0001173858399852179, + 0.047926079481840134, 0.03721074387431145, -0.019753489643335342, + 0.017013119533658028, -0.028762344270944595, -0.005848998669534922, + -0.006997200194746256, 0.05184704810380936, -0.05036364868283272, + 0.002436417154967785, 0.0003822402795776725, 0.003277599113062024, + 0.03293520584702492, -0.0045876270160079, -0.004722336772829294, + -0.019277948886156082, 0.0012148021487519145, -0.023630889132618904, + -0.011700155213475227, -0.006263254676014185, 0.02274380624294281, + 0.004808057565242052, 0.03601561859250069, 0.1043647825717926, + -0.014201398938894272, -0.016617566347122192, -0.02272864058613777, + -0.030905725434422493, 0.0010793384863063693, -0.049122776836156845, + -0.005753105040639639, 
0.01338914968073368, 0.027050666511058807, + 0.04214894399046898, -0.023005545139312744, 0.031917206943035126, + 0.015109232626855373, -0.07634111493825912, 0.024369796738028526, + -0.00647472171112895, 0.043127499520778656, -0.0673207938671112, + 0.0822305828332901, 0.06597486138343811, 0.004127069376409054, + 0.06724239140748978, 0.004546293988823891, 0.03501193970441818, + -0.03256544470787048, 0.02815675362944603, -0.018141930922865868, + 0.008695757016539574, 0.030166303738951683, -0.009897452779114246, + 0.03613714128732681, -0.013987270183861256, -0.02539311721920967, + -0.06444346904754639, -0.01528739370405674, -0.06564117968082428, + -0.029204169288277626, 0.03283213824033737, -0.018580380827188492, + 0.026825398206710815, -0.012654704041779041, -0.0018552436958998442, + -0.01754637062549591, 0.08191259950399399, 0.0534062460064888, + -0.027856973931193352, -0.053807660937309265, -0.02944841794669628, + -0.014591488987207413, -0.0004364093765616417, -0.01691609062254429, + 0.0792316198348999, 0.004102316685020924, -0.03579891845583916, + -0.0108651639893651, -0.00966869480907917, -0.00033933919621631503, + -0.026145832613110542, -0.016428470611572266, 0.030308935791254044, + -0.02421530708670616, -0.01272093690931797, 0.0026039716321974993, + 0.029183251783251762, -0.015125368721783161, 0.07109745591878891, + 0.02079625427722931, 0.018054857850074768, -0.00201214081607759, + 0.028579678386449814, -0.011699595488607883, 0.010970978997647762, + -0.008872047066688538, 0.03169122710824013, -0.06434084475040436, + 0.03283708542585373, 0.002920384518802166, 0.1117773950099945, + 0.0207917969673872, -0.07546871900558472, -0.0013152466854080558, + -0.009336701594293118, 0.00034176796907559037, 0.06051360070705414, + -0.0378379300236702, -0.01082307007163763, -0.009045763872563839, + -0.042135000228881836, -0.04675054922699928, 0.0008745589875616133, + 0.022651556879281998, 0.016891758888959885, -0.06758315861225128, + 0.03011692874133587, -0.0689280554652214, -0.0287728663533926, + -0.016613522544503212, 0.009947648271918297, -0.009978469461202621, + 0.016907479614019394, 0.00691134762018919, 0.04193537309765816, + -0.0010126983979716897, -0.01135191135108471, -0.04884914308786392, + 0.05164073780179024, 0.06193321943283081, -0.01663290709257126, + 0.0512203685939312, 0.00277346046641469, -0.0766502171754837, + 0.0011106275487691164, -0.008470023050904274, 0.03819086030125618, + -0.006837388966232538, -0.03457418084144592, -0.06866854429244995, + 0.05483240634202957, -0.016624240204691887, 0.028569897636771202, + -0.01400308683514595, 0.0378500260412693, 0.007686481345444918, + 0.030080482363700867, -0.05881612002849579, 0.015524756163358688, + 0.030225753784179688, -0.02950134687125683, 0.01465617585927248, + 0.0009167538373731077, 0.056613512337207794, -0.02706410363316536, + -0.0473414771258831, -0.006343611981719732, 0.011811802163720131, + 0.007573770359158516, 0.021041858941316605, -0.014327406883239746, + 0.01859954372048378, 0.06863977015018463, -0.002199358306825161, + -0.03532129153609276, 0.009050965309143066, 0.02409159019589424, + -0.0025098449550569057, -0.00499211298301816, 0.00033862097188830376, + -0.059937484562397, 0.10898157954216003, 0.0318506620824337, + 0.0058680190704762936, -0.03671310096979141, -0.03349997103214264, + -0.0349581353366375, -0.023411044850945473, -0.011138128116726875, + 0.00608166866004467, 0.08696430921554565, -0.008161027915775776, + 0.045368045568466187, -0.01864445023238659, 0.035301174968481064, + 0.003979773260653019, 0.016739632934331894, 
0.011675872839987278, + 0.025817174464464188, 0.03272102400660515, 0.013721601106226444, + -0.04690241813659668, 0.05665350705385208, -0.05676185339689255, + 0.013739561662077904, 0.020678944885730743, -0.03532474488019943, + 0.013464651070535183, 0.04246523231267929, 0.017674343660473824, + -0.005077862646430731, -0.019556084647774696, -0.009594413451850414, + -0.04825031757354736, 0.00016230896289926022, 0.003143883077427745, + 0.031157106161117554, -0.0332491435110569, -0.010140872560441494, + -0.04249225929379463, -0.028256090357899666, -0.008995918557047844, + 0.021160980686545372, 0.03130994364619255, -0.001629085629247129, + 0.02819039300084114, 0.009931858628988266, 0.05051739886403084, + 0.006850008387118578, -0.018732454627752304, -0.09201951324939728, + -0.042829085141420364, -0.054845187813043594, 0.021562401205301285, + 0.05139476805925369, -0.011137720197439194, 0.021749140694737434, + -0.01837606355547905, 0.017083071172237396, 0.023444999009370804, + -0.009212017990648746, -0.011377239599823952, -0.018595905974507332, + -0.09953112155199051, 0.0019816216081380844, -0.004408092238008976, + -0.027098996564745903, -0.002341427141800523, 0.029412943869829178, + 0.041135817766189575, 0.015357858501374722, 0.0017108122119680047, + 0.028364799916744232, 0.004185053985565901, -0.04822831228375435, + 0.0948278158903122, -0.0414741188287735, 0.03572544828057289, + 0.04517536610364914, 0.009942572563886642, -0.025769881904125214, + -0.017749540507793427, -0.03137620911002159, -0.00617972994223237, + 0.04517003893852234, -0.03762981668114662, 0.0358721986413002, + 0.11737086623907089, 0.0004563555121421814, -0.06275733560323715, + -0.011418106034398079, -0.08354005962610245, -0.048093460500240326, + -0.030154218897223473, -0.010961515828967094, 0.007697841618210077, + -0.08613990992307663, 0.01947987824678421, 0.017181523144245148, + 0.02698543854057789, 0.040134966373443604, -0.03223738074302673, + -0.03745822235941887, 0.054219458252191544, -0.03571298345923424, + -0.035921428352594376, 0.04604002833366394, -0.04030536487698555, + -0.032255616039037704, -0.06959861516952515, 0.032114237546920776, + -0.027767114341259003, 0.019928939640522003, 0.022700339555740356, + -0.04375129193067551, -0.05712258815765381, -0.02559071220457554, + 0.006574000231921673, -0.025430524721741676, -0.028035728260874748, + -0.04453514888882637, -0.10015997290611267, -0.0672021210193634, + 0.012007188983261585, 0.014830735512077808, 0.00970692653208971, + -0.051091041415929794, -0.031854890286922455, -0.025864001363515854, + -0.016826078295707703, -0.0026011785957962275, -0.05528291314840317, + 0.04440443217754364, -0.03783581778407097, -0.09876326471567154, + 0.029767369851469994, -0.0023010883014649153, -0.05689188838005066, + 0.007344242185354233, 0.009551416151225567, -0.08018877357244492, + 0.007934950292110443, -0.03663663938641548, -0.0009275389602407813, + 0.026911364868283272, -0.001246148720383644, 0.054533813148736954, + 0.009721122682094574, 0.005318093113601208, -0.00535608222708106, + 0.008591657504439354, 0.04166155681014061, -0.03877246752381325, + 0.01399280782788992, 0.01457316055893898, -0.02407732978463173, + -0.006206234451383352, 0.07004162669181824, 0.01619933731853962, + -0.004103302024304867, 0.045894261449575424, 0.03686122968792915, + 0.04804258793592453, 0.05579492822289467, -0.0030228029936552048, + -0.07259590923786163, 0.023546412587165833, -0.0011336577590554953, + 0.04003886505961418, -0.0139979999512434, -0.00017333473078906536, + -0.05454326048493385, 
-0.021969035267829895, -0.03901325911283493, + 0.012417349964380264, -0.05304381251335144, 0.004690664820373058, + -0.06269649416208267, 0.037584088742733, -0.039430856704711914, + 0.01138926949352026, 0.024504829198122025, -0.023247476667165756, + -0.001942053553648293, 0.01666364073753357, 0.005435148254036903, + -0.026095639914274216, -0.002955301431939006, -0.043733760714530945, + -0.06700831651687622, -0.06828623265028, 0.045247308909893036, + -0.02567214146256447, -0.03503000736236572, 0.0028738975524902344, + 0.007734893821179867, 0.041422292590141296, -0.01760552078485489, + -0.016545895487070084, -0.007150533143430948, 0.02324298955500126, + 0.009319701232016087, 0.003866465063765645, 0.025515582412481308, + 0.03634219616651535, -0.033345021307468414, 0.020966341719031334, + 0.07540836185216904, -0.0002573730598669499, 0.05161430686712265, + 0.0003421941655687988, -0.016416313126683235, -0.018457459285855293, + -0.04053647443652153, -0.008631067350506783, 0.011850157752633095, + 0.014364494942128658, -0.029469167813658714, -0.0497945100069046, + 0.008882390335202217, 0.04006727412343025, 0.010511534288525581, + -0.03620539978146553, -0.030285054817795753, -0.10150802135467529, + -0.01594187133014202, -0.05266118794679642, -0.017643682658672333, + 0.017300395295023918, -0.080828458070755, 0.027883851900696754, + 0.0260021835565567, -0.013791227713227272, 0.01261923462152481, + 0.02038010023534298, -0.04244862496852875, 0.011010567657649517, + 0.011758117005228996, 0.0030098427087068558, 0.003606629790738225, + 0.0020418025087565184, -0.0004243037255946547, 0.03312380611896515, + 0.02103457599878311, 0.03972248733043671, -0.07605717331171036, + -0.039676181972026825, -0.07193399965763092, -0.030211182311177254, + -0.0786738321185112, -0.025149084627628326, -0.0017661137972027063, + -0.017345670610666275, 0.027870142832398415, -0.003350367769598961, + 0.060104407370090485, 0.020051708444952965, 0.014986025169491768, + 0.0056610992178320885, 0.0593392550945282, -0.0053368182852864265, + -0.025954807177186012, -0.07451668381690979, -0.0021227921824902296, + -0.03457536920905113, 0.010495556518435478, -0.0450577586889267, + -0.016477687284350395, 0.05702868103981018, 0.025161782279610634, + 0.016737932339310646, -0.00856244657188654, 0.01180358324199915, + -0.04093103110790253, -0.03350433334708214, -0.01662720926105976, + 0.002310116309672594, 0.0026625224854797125, -0.04226106405258179, + 0.02971433289349079, -0.07220850884914398, -0.0009123267373070121, + -0.02786707505583763, -0.025111757218837738, 0.033243875950574875, + 0.03572067618370056, -0.0019114067545160651, -0.030592206865549088, + -0.021260922774672508, 0.06805034726858139, 0.00013317271077539772, + 0.006557960994541645, -0.0329759456217289, 0.006780629511922598, + 0.00866342056542635, -0.01449753437191248, 0.028198137879371643, + 0.019570309668779373, -0.036116212606430054, 0.058676715940237045, + 0.021564209833741188, -0.026323653757572174, -0.02302497997879982, + 0.02744974195957184, 0.05593085661530495, 0.02073318511247635, + -0.074001245200634, -0.0184424240142107, 0.031868427991867065, + 0.018198778852820396, -0.0450170636177063, 0.030543111264705658, + -0.06377965956926346, 0.04590768367052078, -0.005907150451093912, + -0.01748581975698471, 0.006950956769287586, 0.05506323277950287, + 0.04594920575618744, -0.015593858435750008, -0.017087753862142563, + 0.029356854036450386, -0.06531023979187012, 0.005519233178347349 + ], + "content_embedding": [ + 0.019554156810045242, 0.04886673390865326, 0.00547438021749258, + 
-0.02931770123541355, 0.024625789374113083, 0.008072949014604092, + 0.03052533231675625, -0.006179450079798698, -0.0333777479827404, + -0.021061548963189125, -0.05983448028564453, 0.025203319266438484, + 0.032834798097610474, 0.013029776513576508, -0.011046950705349445, + -0.031452618539333344, -8.028985030250624e-5, 0.0007977305795066059, + -0.009920830838382244, -0.0018231356516480446, -0.006695937365293503, + -0.014926698990166187, -0.009614776819944382, -0.05784115567803383, + 0.02651236765086651, -0.011027690954506397, -0.009276495315134525, + 0.04284600168466568, -0.05643690750002861, 0.04219788312911987, + 0.05048283189535141, 0.004557965788990259, -0.01679980754852295, + -0.040684137493371964, -0.044776126742362976, -0.018855446949601173, + 0.06871335208415985, -0.014481916092336178, -0.024082450196146965, + 0.04807353392243385, 0.04378245398402214, -0.0010759941069409251, + 0.014099782332777977, -0.037625934928655624, 0.05659622326493263, + 0.01868855021893978, 0.025368744507431984, -0.03537469357252121, + 0.014213587157428265, -0.0705343633890152, -0.016253503039479256, + -0.005150542128831148, -0.017522070556879044, -0.03862348571419716, + 0.00953003577888012, 0.016891248524188995, -0.007589798420667648, + 0.006262748036533594, 0.01169696543365717, -0.05435675010085106, + 0.03128333017230034, 0.07449059188365936, -0.043514277786016464, + 0.022657765075564384, 0.01074683852493763, -0.051405169069767, + 0.00816179346293211, -0.014555123634636402, -0.01839461177587509, + -0.00691940588876605, 0.009614893235266209, -0.0071630412712693214, + 0.02593475580215454, 0.07137756794691086, -0.049324654042720795, + -0.04460940882563591, -0.004007663112133741, -0.018305329605937004, + -0.029537511989474297, 0.03624692186713219, 0.04646339640021324, + -0.053126320242881775, -0.008358806371688843, -0.001911070430651307, + 0.05718495324254036, -0.02093559131026268, -0.028956729918718338, + 0.005082732532173395, -0.028617851436138153, 0.1116873174905777, + 0.006924519315361977, 0.05388922244310379, 0.04239538311958313, + 0.03629518672823906, -0.07756507396697998, 0.08572934567928314, + -0.034708015620708466, -0.052686456590890884, -0.03376411274075508, + -0.011243571527302265, -0.028565097600221634, 0.03532436862587929, + 0.025328388437628746, -0.009712263941764832, 0.0416463203728199, + 0.02975877933204174, 0.04073808714747429, 0.045180853456258774, + 0.01522457879036665, 0.0021614122670143843, -0.0602865032851696, + -0.024015003815293312, -0.032889459282159805, 0.020978014916181564, + -0.04493942856788635, 0.012820002622902393, 0.0644337609410286, + -0.019941547885537148, -0.004083186853677034, 0.018924430012702942, + -0.017330998554825783, 0.0008878704975359142, 0.02702835015952587, + -0.026265576481819153, 0.04109559580683708, -0.009575187228620052, + -0.020085688680410385, -5.433974365587346e-5, 0.018562229350209236, + -0.02393198385834694, -0.030048735439777374, 0.01821220852434635, + -0.029501279816031456, -0.01944204978644848, -0.01614498719573021, + 0.03840102627873421, -0.04210539907217026, -0.050642915070056915, + 0.026918280869722366, -0.008575397543609142, 0.011357792653143406, + 0.015665695071220398, 0.01620817743241787, -0.0165872685611248, + 0.028748027980327606, -0.10036404430866241, -0.04525408893823624, + 0.010306733660399914, -0.04815903678536415, 0.025287121534347534, + -0.010369544848799706, 0.023846469819545746, 0.016379185020923615, + 0.019874077290296555, 0.004489564802497625, -0.009458004496991634, + -0.032719556242227554, -0.0319439135491848, 0.06740261614322662, + 
0.005906871519982815, 0.024023521691560745, 0.04657802730798721, + -0.08182766288518906, 0.062149085104465485, -0.0061394101940095425, + 0.014895983040332794, 0.029691752046346664, 0.0031379619613289833, + 0.04791149124503136, 0.06459061801433563, -0.012314707040786743, + 0.016569096595048904, -0.01054114755243063, -0.027581606060266495, + 0.02896907925605774, 0.0048512346111238, 0.04427911341190338, + -0.026665540412068367, 0.07613496482372284, -0.023571502417325974, + -0.004409146960824728, -0.036679890006780624, 0.0016537840710952878, + 0.024527747184038162, -0.02983722649514675, 0.0021592022385448217, + 0.005430649966001511, -0.08198896050453186, -0.017340485006570816, + -0.014370240271091461, 0.012875599786639214, -0.008910057134926319, + -0.013228043913841248, -0.0023926664143800735, -0.015292157419025898, + -0.03927253186702728, 0.07715654373168945, -0.012554320506751537, + 0.03217530623078346, -0.042463112622499466, -0.03743144869804382, + 0.008893481455743313, -0.0666876956820488, 0.02818153239786625, + -0.009054000489413738, 0.03337392210960388, 0.03328379616141319, + 0.009759150445461273, 0.01269217487424612, -0.02173757553100586, + 0.07456912100315094, 0.006997218355536461, 0.007162875030189753, + 0.040701836347579956, -0.04538433253765106, -0.0032951829489320517, + 0.002400761004537344, -0.04169681295752525, -0.05523253232240677, + 0.06444490700960159, 0.03810356929898262, -0.035072412341833115, + 0.04415622353553772, -0.02322838269174099, -0.009917438961565495, + 0.008140898309648037, -0.02388846129179001, 0.0034457307774573565, + -0.0054973880760371685, 0.024084730073809624, -0.007739664521068335, + -0.06571771949529648, 0.07359053194522858, -0.011194998398423195, + 0.0211270023137331, 0.017875710502266884, -0.01821357198059559, + 0.03745369240641594, 0.023267023265361786, 0.07556653767824173, + 0.042081139981746674, 0.01543054636567831, 0.038234904408454895, + -0.019599558785557747, -0.08676281571388245, -0.013498742133378983, + -0.011986842378973961, -0.014071784913539886, -0.03772296756505966, + 0.009192361496388912, -0.020469751209020615, -0.004528891295194626, + 0.003913218155503273, 0.013755199499428272, 0.009065939113497734, + -0.049990858882665634, 0.030110390856862068, 0.05044790729880333, + 0.020461000502109528, -0.036168310791254044, 0.04875757917761803, + -0.023571951314806938, 0.04313709959387779, 0.02649555914103985, + 0.0055029122158885, 0.0138795655220747, -0.0009625149541534483, + -0.019915221258997917, -0.0024100886657834053, -0.00426845159381628, + 0.0008668623049743474, 0.014360995963215828, 0.01953921653330326, + 0.05843087658286095, 0.010600714012980461, -0.01941727101802826, + 0.016723858192563057, -0.027911949902772903, 0.011009825393557549, + -0.0070054735988378525, -0.02280472218990326, -0.0902462899684906, + -0.029843643307685852, -0.005726281087845564, -0.007387514691799879, + 0.05724000930786133, -0.016834275797009468, 0.015216480940580368, + 0.02209043875336647, 0.004264513496309519, 0.019342858344316483, + 0.018849363550543785, -0.04794900864362717, 0.02397482842206955, + -0.007828679867088795, 0.020298736169934273, 0.09897984564304352, + -0.04056645929813385, -0.011319068260490894, 0.0034366592299193144, + -0.0073711141012609005, -0.005176732316613197, 0.022680383175611496, + -0.01522906869649887, 0.002589346142485738, -0.016521241515874863, + 0.021019242703914642, -0.004480182658880949, 0.009419859386980534, + 0.013903859071433544, -0.05053260177373886, 0.0012486587511375546, + 0.017442021518945694, -0.016688739880919456, 
-0.034772295504808426, + 0.03298048675060272, 0.04604269936680794, -0.008270222693681717, + 0.024096740409731865, 0.022777941077947617, 0.03941553831100464, + -0.030161242932081223, -0.01781023107469082, -0.03533001244068146, + 0.02088671736419201, 0.0896298736333847, 0.0061553712002933025, + 0.054883576929569244, -0.04094908386468887, 0.04382561892271042, + -0.003113290062174201, 0.015607516281306744, -0.04469290003180504, + -0.013229981996119022, -0.0026423041708767414, -0.017652839422225952, + 0.005086386110633612, 0.035375673323869705, 9.725322161102667e-5, + -0.026332346722483635, 0.01958891935646534, 0.041558533906936646, + -0.02570311166346073, -0.04806942865252495, -0.035452235490083694, + -0.015692519024014473, -0.0027702273800969124, 0.022831231355667114, + 0.05247980356216431, 0.0010726081673055887, -0.025438379496335983, + 0.006880710367113352, 0.009276037104427814, -0.021758491173386574, + -0.008571256883442402, 0.003312689019367099, 0.032590776681900024, + 0.02640446089208126, 0.0034450399689376354, 0.028627941384911537, + 0.07457757741212845, -0.012303084135055542, 0.045139290392398834, + -3.2679530704626814e-5, 0.024825602769851685, 0.04402294382452965, + 0.025363540276885033, 0.03840187191963196, 0.019677763804793358, + -0.03521053493022919, 0.036523785442113876, -0.06001857668161392, + 0.006586587987840176, 0.022668125107884407, 0.06063239648938179, + 0.015586800873279572, -0.08306828886270523, -0.04289577156305313, + -0.0050475094467401505, -0.0309798214584589, 0.05810924991965294, + -0.017818300053477287, -0.01088999304920435, -0.017884155735373497, + 0.04356890171766281, -0.023094575852155685, -0.04477296397089958, + -0.0007801170577295125, 0.018146106973290443, -0.05633535981178284, + 0.006709645502269268, -0.037334758788347244, -0.055724598467350006, + 0.00994165986776352, -0.009849119931459427, -0.027259083464741707, + -0.02565668150782585, 0.005235382355749607, 0.016267497092485428, + 0.00393668282777071, -0.05648971349000931, -0.05955129489302635, + 0.026542942970991135, 0.040565431118011475, -0.02225298061966896, + 0.0017030639573931694, -0.02689032256603241, -0.029025251045823097, + 0.030817490071058273, -0.007936912588775158, 0.05566547438502312, + 0.0174697358161211, -0.014709461480379105, -0.07380940765142441, + 0.07026955485343933, -0.024563433602452278, 0.05333513021469116, + 0.020963717252016068, -0.015575055032968521, -0.04304461553692818, + 0.00822180975228548, -0.013204170390963554, -0.0028262599371373653, + 0.015431943349540234, -0.025627007707953453, 0.0006762628327123821, + -0.02078782208263874, 0.009704814292490482, -0.006950112525373697, + -0.020425891503691673, 0.044901806861162186, 0.020927794277668, + 0.009534145705401897, 0.004958992823958397, -0.037563592195510864, + 0.03806327283382416, 0.0783824622631073, 0.011150919832289219, + -0.024385575205087662, 0.03461897745728493, 0.02127663977444172, + -0.012272517196834087, -0.01546854991465807, -0.06705902516841888, + -0.01649612747132778, 0.06068763509392738, 0.07799869775772095, + 0.0014717800077050924, -0.04836009815335274, -0.026833070442080498, + 0.013509837910532951, 0.04280327260494232, 0.009658309631049633, + -0.007854060269892216, 0.09166036546230316, -0.00760420598089695, + 0.024465130642056465, -0.041591379791498184, 0.007116211112588644, + 0.05567977577447891, 0.01807284727692604, 0.028304288163781166, + -0.014866949990391731, 0.026369474828243256, 0.008983064442873001, + -0.02317068539559841, 0.03937782347202301, -0.01901034638285637, + -0.0021325594279915094, 0.030924763530492783, 
-0.020755570381879807, + 0.030001072213053703, 0.03809978440403938, 0.0334426648914814, + -0.042968180030584335, -0.012311535887420177, 0.03697645664215088, + -0.041293930262327194, 0.01696925237774849, 0.03560850769281387, + 0.03847989812493324, -0.037730954587459564, -0.05352506786584854, + -0.02746652066707611, -0.05294184759259224, -0.017248092219233513, + -0.005418767221271992, 0.01951681822538376, -0.017932193353772163, + -0.007422131486237049, 0.03827866166830063, 0.05701953545212746, + -0.02210610918700695, 0.010034722276031971, -0.07954911887645721, + -0.0485968291759491, -0.028629625216126442, 0.009054362773895264, + 0.02398092858493328, -0.009973667562007904, 0.0011409823782742023, + -0.011182617396116257, 0.0020028105936944485, 0.006942914333194494, + -0.039862822741270065, -0.0066703930497169495, -0.004236259963363409, + -0.1050848662853241, 0.025538505986332893, 0.029989799484610558, + 0.01211432833224535, 0.04559238627552986, 0.050545401871204376, + 0.05476491525769234, 0.01163802482187748, -0.054837070405483246, + 0.07410066574811935, 0.0024028397165238857, -0.10432479530572891, + 0.07078853249549866, -0.012614017352461815, 0.06030529364943504, + 0.054839566349983215, 0.014005501754581928, -0.1118561178445816, + 0.009275965392589569, -0.023663705214858055, -0.028527697548270226, + 0.0584726445376873, -0.045622922480106354, 0.03320262208580971, + 0.11574956774711609, 0.007655338849872351, -0.07191741466522217, + 0.03859880194067955, -0.06297247856855392, -0.018405890092253685, + -0.028816718608140945, -0.0126464469358325, 0.0202946774661541, + -0.03483844920992851, 0.024250855669379234, -0.006263037212193012, + 0.02315174601972103, -0.0037849699147045612, -0.0359908789396286, + -0.037620242685079575, 0.05056930333375931, -0.006831671576946974, + -0.10494183748960495, 0.07645728439092636, -0.028021620586514473, + 0.009692930616438389, -0.10232461243867874, 0.05202733352780342, + -0.05352945625782013, 0.02073156274855137, -0.017188169062137604, + 0.0020886484999209642, -0.04133611172437668, -0.044309552758932114, + 0.018564041703939438, 0.026704275980591774, -0.055197130888700485, + 0.007287430576980114, -0.1245347261428833, -0.037194203585386276, + 0.06665091216564178, 0.061706364154815674, 0.01804385520517826, + -0.013962237164378166, -0.024994580075144768, -0.011294208467006683, + -0.009245212189853191, 0.029879143461585045, -0.046995047479867935, + 0.02158532105386257, -0.002750945743173361, -0.08373189717531204, + 0.006320012733340263, -0.029947226867079735, -0.09511808305978775, + 0.03186006098985672, 0.0028628590516746044, -0.02251911163330078, + 0.03152289241552353, -0.07237773388624191, 0.013269931077957153, + 0.09650041908025742, -0.045900944620370865, -0.006541605107486248, + 0.006844623479992151, 0.03238837793469429, -0.020313216373324394, + -0.013520441018044949, 0.000403873244067654, -0.024447409436106682, + -0.014962681569159031, 0.006884200032800436, -0.04582136869430542, + -0.02838127687573433, 0.02936256304383278, 0.04077419266104698, + 0.02266734093427658, 0.013416043482720852, 0.0345231331884861, + 0.052821315824985504, 0.028492338955402374, -0.01292923279106617, + -0.03745116665959358, 0.0028044944629073143, 0.006625712383538485, + 0.017212992534041405, -0.00239459122531116, -0.002529381774365902, + -0.029911531135439873, 0.029142336919903755, -0.07905209809541702, + 0.01719854585826397, -0.03854485601186752, -0.0116807846352458, + -0.046873632818460464, 0.05275706946849823, -0.04152297228574753, + -0.03727864846587181, 0.048614900559186935, 
-0.010790652595460415, + -0.0024877521209418774, 0.012286793440580368, 0.08292551338672638, + 0.03208302706480026, 0.02867130935192108, -0.039545897394418716, + -0.02800118923187256, -0.07673710584640503, 0.005896291229873896, + -0.030191265046596527, -0.0187264122068882, -0.03647911176085472, + 0.0371234230697155, 0.05115009844303131, -0.04013253375887871, + -0.03304912894964218, 0.05173036456108093, 0.04860515147447586, + -0.00717319268733263, 0.045631736516952515, 0.04671349748969078, + 0.004948618821799755, 0.009837846271693707, 0.04381090775132179, + 0.0918705016374588, -0.005869758781045675, 0.02249985560774803, + 0.010875782929360867, 0.02956228516995907, 0.0036174776032567024, + 0.011353887617588043, -0.05186513066291809, 0.0173268411308527, + 0.011808891780674458, -0.00798418466001749, -0.00929324608296156, + -0.004763087723404169, 0.010464239865541458, -0.001494695316068828, + -0.024331238120794296, -0.042497288435697556, -0.049635156989097595, + 0.006581253372132778, -0.05040008947253227, -0.01882144808769226, + -0.02419630065560341, -0.06420613825321198, 0.020046783611178398, + 0.024303266778588295, -0.009806456044316292, -0.003666533390060067, + 0.0018573087872937322, -0.02440156601369381, 0.0059090061113238335, + -0.03926969692111015, 0.011777290143072605, -0.032393939793109894, + -0.0015500712906941772, 0.019816264510154724, 0.0037668957374989986, + -0.0033931683283299208, -0.01164526678621769, -0.07331964373588562, + -0.008798380382359028, -0.016916625201702118, -0.034663647413253784, + -0.05818398669362068, 0.015174336731433868, -0.010697754099965096, + -0.04179975762963295, 0.0064012641087174416, 0.01987038180232048, + 0.019733907654881477, -0.0013441460905596614, 0.05315450206398964, + -0.021319502964615822, 0.05351290851831436, 0.052106473594903946, + -0.010152475908398628, -0.017445174977183342, 0.030323222279548645, + 0.02796917036175728, 0.0026626174803823233, -0.03001641295850277, + 0.014342408627271652, 0.03061019256711006, -0.00294340867549181, + -0.018154315650463104, 0.02443081885576248, 0.003663803683593869, + -0.023388244211673737, -0.0018819351680576801, 0.0010939103085547686, + 0.03557095676660538, -0.053037356585264206, -0.06498610228300095, + 0.04878298565745354, -0.03413922339677811, -0.02763182483613491, + 0.0009715812630020082, 0.00486012501642108, 0.03292006626725197, + 0.023539533838629723, 0.011763699352741241, -0.026951594278216362, + -0.03602864220738411, 0.04788520559668541, -0.04133124649524689, + -0.013166938908398151, -0.037222955375909805, 0.014670289121568203, + -0.056680355221033096, -0.008314837701618671, 0.04227377846837044, + -0.04986898973584175, -0.03443481773138046, -0.00174588686786592, + 0.0027100981678813696, -0.001079584937542677, -0.06199319660663605, + 0.03100454993546009, 0.06752201914787292, 0.028809374198317528, + -0.06289442628622055, -0.005715612787753344, -0.052428603172302246, + -0.01548265665769577, -0.048164043575525284, 0.04651368409395218, + 0.00703784916549921, 0.0493292361497879, 0.024252086877822876, + -0.03811171278357506, 0.030049748718738556, 0.02749229036271572, + -0.0005660666502080858, 0.009821311570703983, 0.01306783128529787, + 0.01775788515806198, -0.051085181534290314, 0.028779184445738792 + ], + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/sales", + "title": "Sales", + "content": "Keep your team up to date on every conversation and update so they can close.\n\nRecall Every Detail\nBeing able to instantly revisit every detail of any call without reading transcripts is helping 
Sales teams provide more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through hours of transcripts in preparation for a call, your team can now ask Danswer \"What specific features was ACME interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building topics by asking \"What rapport building topic did we chat about in the last call with ACME\".\n\nKnow Every Product Update\nIt is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live on call because of how fast accessing information becomes. A question like \"Have we shipped the Microsoft AD integration yet?\" can now be answered in seconds meaning that prospects can get answers while on the call instead of asynchronously and sales cycles are reduced as a result.", + "title_embedding": [ + 0.008453648537397385, 0.049128592014312744, 0.0009390072082169354, + -0.011420674622058868, 0.009472657926380634, 0.05824451148509979, + -0.04129518195986748, -0.018892904743552208, -0.007598293945193291, + -0.03804052621126175, -0.003747896058484912, 0.005537204910069704, + 0.026590371504426003, -0.02672540210187435, 0.02651272714138031, + -0.033856429159641266, 0.03809495270252228, -0.013283955864608288, + -0.00013869917893316597, 0.020309027284383774, -0.011650309897959232, + -0.02103874459862709, 0.01066586747765541, -0.057243604212999344, + 0.031903959810733795, -0.012924387119710445, -0.012852400541305542, + 0.01951044611632824, 0.018149282783269882, -0.01314238179475069, + 0.01411629281938076, -0.009169652126729488, 0.017607972025871277, + -0.0685962364077568, -0.03812728449702263, -0.06783904880285263, + 0.009403989650309086, 0.0073580555617809296, 0.024488259106874466, + 0.0005618860013782978, 0.0418679341673851, 0.01953919045627117, + -0.05312833935022354, -0.024998614564538002, 0.017944931983947754, + -0.004367160610854626, 0.023977765813469887, 0.012059491127729416, + 0.049591515213251114, -0.05067070946097374, 0.001997043378651142, + 0.0024496091064065695, -0.029425427317619324, -0.01783224567770958, + 0.0016368982614949346, 0.006662128958851099, 0.0406189002096653, + 0.011542750522494316, 0.023075561970472336, -0.012605642899870872, + 0.010134617798030376, 0.054372530430555344, -0.008662662468850613, + 0.016197390854358673, 0.015719175338745117, -0.024816671386361122, + -0.006610441952943802, 0.015418685972690582, -0.011234346777200699, + -0.003028685925528407, 0.09540237486362457, 0.026426734402775764, + 0.03904794156551361, 0.025213684886693954, -0.01716603711247444, + -0.03701924905180931, 0.0097318384796381, 0.003443458816036582, + -0.022713838145136833, 0.029555244371294975, -0.0012461059959605336, + -0.02693314291536808, 0.027585584670305252, 0.00919498410075903, + 0.0784342810511589, -0.004147294908761978, 0.0058822669088840485, + -0.03257093206048012, -0.0194808728992939, -0.015468079596757889, + 
-0.007020206656306982, 0.017711102962493896, 0.09179206192493439, + 0.07245390862226486, -0.08327075839042664, -0.01777547225356102, + -0.0036551833618432283, 0.005220785271376371, -0.013074155896902084, + -0.05137576162815094, -0.03984086588025093, -0.001254269853234291, + 0.0388508178293705, -0.026559771969914436, 0.0941508412361145, + 0.028184799477458, 0.030329154804348946, -0.015901995822787285, + -0.05033569037914276, -0.040444258600473404, -0.08671051263809204, + 0.016047460958361626, -0.015745067968964577, 0.036006249487400055, + -0.019317714497447014, 0.0028998597990721464, 0.08916892111301422, + -0.06901372224092484, 0.05473657324910164, 0.06781236082315445, + 0.016306273639202118, 0.0011640831362456083, 0.0008445090497843921, + 0.06246241182088852, -0.020153285935521126, -0.011525464244186878, + -0.043366242200136185, 0.036528658121824265, 0.012839822098612785, + -0.0585474967956543, 0.04394524171948433, 0.017970293760299683, + 0.0273651871830225, -0.006580607499927282, 0.04960521310567856, + 0.04129025712609291, -0.039038222283124924, -0.007922167889773846, + 0.06417153030633926, 0.00870921928435564, 0.04419026896357536, + 0.03394830971956253, 0.04194091632962227, -0.009943729266524315, + -0.026762165129184723, -0.06321611255407333, 0.018493760377168655, + 0.02112971432507038, -0.008362037129700184, 0.0030741533264517784, + 0.02977512590587139, 0.007839385420084, -0.030427763238549232, + -0.005435082130134106, -0.023782288655638695, -0.06198855862021446, + -0.006325197406113148, 0.03481286019086838, 0.01096314936876297, + -0.008224114775657654, 0.016886647790670395, 0.018134206533432007, + -0.02693709172308445, 0.08969569951295853, -0.03854775056242943, + -0.036120977252721786, 0.003924157004803419, 0.05902013182640076, + 0.02604551427066326, 0.04082872346043587, -0.006722352001816034, + -0.011970511637628078, -0.014307579025626183, 0.0019104834645986557, + 0.05876787751913071, -0.002502535469830036, 0.027572669088840485, + 0.027537895366549492, -0.026239819824695587, -0.02237943559885025, + 0.026839105412364006, -0.04806261509656906, 0.022188611328601837, + 0.05301826074719429, -0.017422696575522423, -0.04489205405116081, + 0.04667934030294418, -0.03202678635716438, -0.022968970239162445, + 0.019313516095280647, -0.06724600493907928, 0.023997649550437927, + 0.02574523165822029, -0.021134337410330772, -0.018166225403547287, + -0.02959403023123741, 0.038194961845874786, 0.009773771278560162, + 0.026523802429437637, -0.014497771859169006, 0.009200031869113445, + 0.01488631684333086, -0.10185936093330383, 0.018728939816355705, + -0.01575741171836853, 0.02251303941011429, 0.02899281494319439, + -0.04970388114452362, 0.007716581225395203, 0.06781720370054245, + 0.07015375792980194, -0.02934109978377819, -0.006221897434443235, + 0.012976646423339844, -0.00737345777451992, -0.02422930672764778, + -0.03612032160162926, 0.003943525720387697, -0.02053997851908207, + 0.01488402672111988, -0.016384800896048546, -0.02631048858165741, + 0.0029128696769475937, 0.0012878051493316889, 0.029553934931755066, + 0.06647666543722153, -0.014463554136455059, 0.04724975302815437, + 0.01416710950434208, 0.0259545985609293, -0.019878843799233437, + -0.04123354330658913, -0.003678504843264818, -0.015237071551382542, + -0.037017468363046646, -0.014126508496701717, 0.037044424563646317, + 0.041153766214847565, 0.034035731106996536, 0.0518031120300293, + -0.004720885772258043, -0.01163511723279953, 0.07213476300239563, + -0.06036211922764778, -0.03499453514814377, -0.006943386513739824, + 
-0.06392820924520493, -0.013479998335242271, 0.01668090932071209, + 0.030153054744005203, 0.04787909612059593, 0.042566943913698196, + 0.01869821362197399, 0.05060578137636185, -0.005738548934459686, + 0.0004935020115226507, -0.04920157790184021, 0.05485580489039421, + -0.02717270515859127, -0.026000261306762695, 0.017413917928934097, + 0.00194182014092803, 0.06368009001016617, -0.021680809557437897, + 0.011896755546331406, 0.008441813290119171, -0.009322874248027802, + 0.004058067686855793, 0.003404452698305249, -0.0070596663281321526, + -0.01350175030529499, 0.027827585116028786, -0.017853371798992157, + 0.05100760981440544, -0.01331804133951664, -0.021219315007328987, + 0.012195413932204247, -0.04513333737850189, 0.022477995604276657, + 0.004410626832395792, 0.033304695039987564, 0.023220136761665344, + 0.00041832958231680095, 0.007724999450147152, 0.0359807088971138, + 0.010411631315946579, 0.0007441110792569816, -0.018354782834649086, + -0.030612032860517502, 0.04444800317287445, -0.004541076719760895, + -0.012099254876375198, 0.03223736584186554, -0.017639242112636566, + 0.012390367686748505, 0.055463794618844986, 0.09133724123239517, + -0.028237899765372276, -0.026783155277371407, -0.029024433344602585, + 0.014482105150818825, 0.05629871413111687, -0.03724139928817749, + 0.008170249871909618, 0.06597549468278885, 0.051776643842458725, + 0.042193781584501266, -0.01338224858045578, 0.03543481230735779, + 0.0065676262602210045, -0.04679378867149353, 0.048750247806310654, + -0.01348006259649992, 0.06560897082090378, -0.10058096796274185, + 0.06226775795221329, 0.06525543332099915, -0.020321687683463097, + 0.05926727131009102, -0.023439910262823105, 0.00998155027627945, + -0.04136430844664574, 0.04513855278491974, -0.07410337775945663, + -0.0032536713406443596, 0.022534336894750595, -0.0035887588746845722, + 0.018703486770391464, -0.023037323728203773, -0.03570957109332085, + -0.03149940446019173, 0.01058092713356018, -0.08196881413459778, + -0.012937279418110847, 0.02611234411597252, 0.03242015466094017, + 0.00964296329766512, -0.03003847971558571, -0.02878165803849697, + 0.005552087444812059, 0.11100566387176514, 0.006707212887704372, + 0.007847320288419724, -0.04757271707057953, -0.010918735526502132, + 0.007332456298172474, -0.04022597521543503, -0.03945135325193405, + 0.08289318531751633, -0.049061018973588943, -0.04947024583816528, + -0.030500037595629692, -0.03648613020777702, 0.007221090141683817, + -0.023051844909787178, 0.06497090309858322, 0.024345578625798225, + -0.0074218385852873325, -0.04062318801879883, -0.020212918519973755, + -0.009461181238293648, -0.04491201415657997, 0.05126942694187164, + -0.005242756102234125, 0.024492694064974785, -0.02291315235197544, + 0.06517285853624344, -0.006112807895988226, 0.004548671655356884, + 0.009358521550893784, 0.0066603804007172585, -0.005717182531952858, + 0.046729590743780136, 0.04008319228887558, 0.09077014029026031, + 0.03511488437652588, -0.05282759666442871, 0.020438214763998985, + -0.01378707680851221, -0.005117158405482769, 0.07433145493268967, + 0.0034097072202712297, -0.011192821897566319, 0.0009265196276828647, + -0.03159809112548828, -0.033578045666217804, -0.012528836727142334, + -0.006292750593274832, 0.0452519953250885, -0.025647340342402458, + -0.026395585387945175, -0.044332105666399, -0.0012870433274656534, + 0.016866305842995644, -0.00726186903193593, -0.014325585216283798, + 0.02150380238890648, 0.05446008965373039, 0.01817481219768524, + 0.017272990196943283, 7.192481280071661e-5, -0.021787019446492195, + 
0.03518282249569893, 0.04129958152770996, 0.005599076859652996, + 0.07016170769929886, 0.0068466924130916595, -0.038150086998939514, + 0.022464951500296593, -0.007263584528118372, 0.04023060202598572, + -0.006662019528448582, -0.03398700803518295, -0.027063554152846336, + -0.014334858395159245, 0.00031888100784271955, 0.03320762887597084, + -0.0263507217168808, -0.01863865926861763, 0.018559059128165245, + 0.06845609098672867, -0.0037615702021867037, 0.023087816312909126, + -0.019276361912488937, -0.03351914510130882, 0.021971892565488815, + 0.041575655341148376, 0.05621027201414108, -0.003078967332839966, + -0.06297048181295395, -0.05009821802377701, -0.026463210582733154, + 0.0035874273162335157, 0.021911393851041794, -0.074904665350914, + -0.0012849566992372274, 0.06580246239900589, -0.0096419183537364, + -0.01183160487562418, 0.002244731178507209, -0.02129410021007061, + -0.004490557126700878, 0.006305266637355089, 0.020787451416254044, + -0.028946323320269585, 0.09907153993844986, 0.06419308483600616, + -0.018514622002840042, -0.03974919393658638, -0.055583421140909195, + -0.04144161939620972, -0.01479779276996851, -0.015063298866152763, + -0.05278000980615616, 0.056262142956256866, 0.0039010541513562202, + 0.025815758854150772, -0.01457720622420311, 0.017469312995672226, + 0.032789044082164764, 0.010338534601032734, 0.009348046034574509, + -0.002339842962101102, 0.023598607629537582, 0.01676766760647297, + -0.03503762558102608, 0.033228978514671326, -0.03216487169265747, + -0.027531251311302185, 0.05846886709332466, -0.00979926623404026, + 0.011551604606211185, 0.026247017085552216, 0.00776244793087244, + -0.042052820324897766, 0.003744697431102395, -0.013622709549963474, + -0.021054048091173172, -0.022621311247348785, 0.03077824041247368, + 0.019676415249705315, -0.02765408344566822, -0.013561422936618328, + -0.015634974464774132, -0.03614448755979538, 0.014710169285535812, + 0.00825627613812685, 0.014769040048122406, 0.006793464533984661, + -0.010395821183919907, -0.0022474846336990595, 0.033902380615472794, + 0.003390782279893756, -0.04533020406961441, -0.09353062510490417, + -0.027594659477472305, -0.026893651112914085, 0.00822615996003151, + 0.03448451682925224, -0.013120760209858418, 0.03301888331770897, + -0.03717275336384773, -0.017613839358091354, 0.03131122142076492, + 0.01355862244963646, -0.016891464591026306, -0.005221182014793158, + -0.09999044984579086, -0.0072242445312440395, 0.023685455322265625, + -0.006333169527351856, 0.05389386788010597, 0.0006377844838425517, + 0.06066382676362991, -0.010544034652411938, -0.03069271147251129, + 0.046539101749658585, 0.04315992072224617, -0.04527072235941887, + 0.08369841426610947, -0.05231470242142677, -0.02663319930434227, + -0.017284002155065536, 0.05720992013812065, -0.02959314174950123, + -0.053442198783159256, -0.055300384759902954, -0.0034046657383441925, + 0.058179739862680435, -0.0067557781003415585, 0.048700254410505295, + 0.06980213522911072, 0.0022220234386622906, -0.02891203574836254, + -0.03402455151081085, -0.07090416550636292, -0.02885468490421772, + -0.033339668065309525, 0.057542525231838226, 0.035563718527555466, + -0.0376402772963047, 0.01505962759256363, 0.025728864595294, + 0.001696597901172936, 0.04947248846292496, -0.0798964574933052, + -0.02692596986889839, -0.012759744189679623, -0.02195296436548233, + -0.014371627941727638, 0.02381875552237034, -0.01423177681863308, + -0.014195146039128304, -0.09804418683052063, -0.0008884363924153149, + -0.0418919213116169, 0.015419455245137215, -0.0015723679680377245, 
+ -0.008167393505573273, -0.027652231976389885, -0.01364823617041111, + 0.042369287461042404, -0.020887810736894608, -0.01855718344449997, + -0.030453767627477646, -0.08889546990394592, -0.042489275336265564, + -0.003145430004224181, -0.0007042307406663895, 0.016261309385299683, + -0.04196145758032799, -0.02786160260438919, 0.00031149861752055585, + 0.0020716730505228043, -0.018168985843658447, -0.035522907972335815, + 0.06329862773418427, -0.06277810037136078, -0.04981480538845062, + 0.05988100543618202, 0.0031491960398852825, -0.03463412821292877, + -0.010109111666679382, -0.013390148058533669, -0.08232187479734421, + 0.018557677045464516, -0.023832205682992935, -0.021515224128961563, + 0.03051081858575344, -0.021489854902029037, 0.009503633715212345, + 0.025148555636405945, -0.023579541593790054, -0.035016197711229324, + -0.022730164229869843, 0.04465099051594734, -0.04341805726289749, + 0.011980813927948475, 0.024123655632138252, -0.026239709928631783, + -0.017752202227711678, 0.027042675763368607, 0.023839112371206284, + 0.01306204218417406, 0.039557792246341705, 0.07731491327285767, + 0.02771804668009281, 0.07320678234100342, -0.008236434310674667, + -0.025150621309876442, 0.0035144551657140255, -0.045307569205760956, + -0.004989498760551214, 0.006890833377838135, 0.013798183761537075, + -0.04717986658215523, 0.00230599008500576, -0.06974467635154724, + 0.013648996129631996, -0.05875125527381897, -0.0020281318575143814, + -0.07060255855321884, 0.04474693909287453, -0.010507912375032902, + 0.01920556277036667, 0.037952445447444916, -0.04831290617585182, + -0.030323892831802368, 0.017083611339330673, 0.01788332499563694, + -0.019379939883947372, 0.0296696275472641, -0.0202578566968441, + -0.05725785344839096, -0.07489712536334991, 0.023742130026221275, + -0.07134415209293365, -0.011462788097560406, 0.0046195476315915585, + 0.04435937851667404, 0.01344655267894268, -0.003911314997822046, + -0.03020038641989231, -0.0032732610125094652, 0.03007005713880062, + 0.006368617527186871, -0.03210403770208359, 0.00835089199244976, + 0.05988067761063576, -0.03537531942129135, 0.05247778445482254, + 0.03723180294036865, -0.008693824522197247, 0.04847349226474762, + 0.016112500801682472, 0.011540782637894154, -0.0065521071664988995, + -0.03243750333786011, -0.011966057121753693, 0.017163656651973724, + -0.0029253605753183365, -0.053153038024902344, 0.0018134346464648843, + 0.01733018085360527, 0.029417017474770546, 0.030433885753154755, + 0.0021621473133563995, -0.027712296694517136, -0.05925380811095238, + -0.022185055539011955, -0.0350322499871254, -0.02007930353283882, + 0.010399214923381805, -0.055177975445985794, 0.0007819311576895416, + 0.024769598618149757, 0.03780986741185188, 0.03521614894270897, + -0.01817735843360424, -0.08278614282608032, -0.021156983450055122, + 0.03359638899564743, -0.023659229278564453, -0.007879458367824554, + 0.0292595736682415, -0.035273004323244095, 0.032482825219631195, + 0.02688293345272541, 0.023407144472002983, -0.047480449080467224, + 0.0006359686376526952, -0.04895651713013649, 0.011627614498138428, + -0.07718108594417572, 0.010565578006207943, -0.01866811513900757, + 0.01029923651367426, -0.023772811517119408, -0.032370492815971375, + 0.05088132247328758, 0.011682837270200253, 0.03289812430739403, + 0.017765464261174202, 0.0604407899081707, -0.03875206410884857, + -0.05453289672732353, -0.05849386751651764, -0.008108421228826046, + -0.036137521266937256, -0.003845603670924902, -0.010756440460681915, + 0.01515593845397234, 0.09156721830368042, 
0.02919408679008484, + 0.024247899651527405, -0.020837178453803062, 0.04395196586847305, + -0.10095755755901337, -0.07707840204238892, -0.04705304652452469, + 0.007648217957466841, -0.015342561528086662, -0.02814168483018875, + 0.039529476314783096, -0.06335531175136566, -0.03782089054584503, + -0.032349780201911926, -0.011073637753725052, 0.03126451373100281, + 0.01734590344130993, -0.0038809722755104303, 0.013081631623208523, + -0.03124905936419964, 0.0832752138376236, -0.007435368373990059, + 0.00989855732768774, 0.004071374889463186, -0.021534224972128868, + -0.022376252338290215, 0.0033940861467272043, 0.01537957414984703, + -0.01530678290873766, -0.006626737304031849, 0.0722239539027214, + 0.023105483502149582, -0.048958491533994675, -0.031564872711896896, + 0.020468993112444878, 0.02797403372824192, 0.02208004891872406, + -0.06905028969049454, -0.0069040716625750065, 0.05185015872120857, + 0.020353762432932854, -0.059334978461265564, 0.019217371940612793, + -0.06793943047523499, 0.052697695791721344, 0.039136022329330444, + -0.03286914899945259, 0.0017800497589632869, 0.031667277216911316, + 0.0455632358789444, -0.028096599504351616, 0.007715262472629547, + 0.005349942483007908, -0.051207322627305984, -0.04513049125671387 + ], + "content_embedding": [ + 0.018702084198594093, 0.03602918982505798, -0.0030462138820439577, + -0.044632311910390854, -0.00032779801404103637, 0.013867323286831379, + 0.028261501342058182, -0.0397375151515007, -0.04266185685992241, + -0.01854686811566353, -0.00980929471552372, 0.005383333191275597, + 0.025814494118094444, 0.020457584410905838, -0.01165740005671978, + -0.02068958804011345, 0.024463720619678497, -0.029769178479909897, + 0.00032462665694765747, 0.020778311416506767, -0.009875921532511711, + -0.03926593065261841, -0.007392906118184328, -0.02128470689058304, + 0.020180456340312958, -0.007395976223051548, -0.01573384366929531, + 0.01813557744026184, -0.04103250429034233, 0.025176111608743668, + 0.07708748430013657, -0.03301733359694481, -0.00411647092550993, + -0.03946784511208534, -0.06054544821381569, -0.040751539170742035, + 0.01922212913632393, -0.027164554223418236, -0.07051867246627808, + 0.03071393072605133, 0.07258772104978561, -0.026042146608233452, + 0.00869719311594963, -0.028085211291909218, 0.0623227022588253, + -0.018381644040346146, 0.07613946497440338, -0.037470318377017975, + 0.022420862689614296, -0.05576684698462486, 0.008394862525165081, + -0.032719686627388, -0.01822705753147602, -0.0010964440880343318, + 0.01320287398993969, 0.03199688717722893, 0.02982492372393608, + 0.013676099479198456, 0.04192396625876427, -0.007906809449195862, + 0.009779189713299274, 0.014214487746357918, -0.0091244550421834, + 0.0358707420527935, 0.003965431824326515, -0.10266417264938354, + 0.007566090207546949, 0.001176235033199191, -0.027059122920036316, + -0.011146960780024529, 0.010613090358674526, 0.0269278846681118, + 0.00038031316944397986, 0.024177612736821175, -0.004191671498119831, + 0.005235857795923948, 0.018077049404382706, -0.04018911346793175, + -0.05125276744365692, 0.01798614114522934, 0.022944867610931396, + -0.04374289512634277, 0.003367446828633547, 0.026103869080543518, + 0.03640212118625641, -0.006004476919770241, 0.011275619268417358, + 0.016332507133483887, 0.0004818506713490933, 0.04315895587205887, + 0.022076765075325966, -0.004544341471046209, 0.03210305795073509, + 0.0906452164053917, -0.07215604186058044, 0.08494149893522263, + 0.006179131101816893, -0.004107883665710688, -0.012479269877076149, + -0.034221433103084564, 
-0.017944667488336563, 0.04593302682042122, + 0.019292891025543213, -0.0031508891843259335, 0.06966886669397354, + 0.062235988676548004, 0.01879720948636532, -6.745052814949304e-5, + 0.013703186996281147, -0.003644032636657357, -0.04458186402916908, + -0.0017342075007036328, -0.033848460763692856, 0.019645417109131813, + -0.02943187765777111, 0.001105084316805005, 0.11609244346618652, + -0.05156521871685982, 0.009876714088022709, 0.005161592271178961, + -0.032977450639009476, -0.04834079369902611, 0.021944768726825714, + -0.012638481333851814, 0.04569210857152939, 0.015415391884744167, + -0.03170562908053398, -0.0031294531654566526, 0.015164556913077831, + -0.034650497138500214, 0.006696060299873352, 0.006991597358137369, + -0.05405446141958237, 0.002478727139532566, 0.03736428543925285, + 0.006255546119064093, -0.023612871766090393, -0.04719111695885658, + 0.019092371687293053, -0.007389509119093418, 0.005412441678345203, + 0.0032002630177885294, 0.014097358100116253, 0.0011166664771735668, + -0.0012068386422470212, -0.0596468411386013, 0.0030182863119989634, + 0.018532730638980865, 0.020043276250362396, -0.0014203430619090796, + 0.03718654438853264, -0.04137871786952019, 0.003067273646593094, + 0.060497768223285675, -0.002445906400680542, -0.05149608850479126, + -0.011358898133039474, -0.0057965232990682125, 0.04786103963851929, + 0.05521485581994057, 0.03300704434514046, 0.01832137256860733, + -0.03220272436738014, 0.05887257307767868, -0.04280361905694008, + 0.030713768675923347, 0.031198250129818916, 0.018273506313562393, + 0.021060051396489143, 0.018141141161322594, -0.01044323481619358, + 0.012220986187458038, 0.011005178093910217, -0.008985857479274273, + 0.05407913029193878, -0.010373812168836594, 0.015498371794819832, + -0.04114103317260742, 0.02436467818915844, -0.033720944076776505, + 0.04162474721670151, -0.03209234029054642, 0.007773025427013636, + 0.03150646388530731, -0.042852289974689484, -0.0062582893297076225, + 0.04668346792459488, -0.06638985872268677, -0.005832660011947155, + -0.022002393379807472, -0.015554124489426613, -0.017163358628749847, + -0.04198216274380684, -0.01709570921957493, 0.026661567389965057, + -0.049418482929468155, 0.06740570068359375, 0.0159238763153553, + 0.0023050543386489153, -0.031238939613103867, -0.03321292996406555, + -0.004760078154504299, -0.07937376946210861, 0.03254229202866554, + -0.023319443687796593, 0.04906806722283363, 0.05458753556013107, + -0.013899387791752815, 0.03574313595890999, 0.011882249265909195, + 0.11678190529346466, 0.0007563747349195182, 0.025212422013282776, + 0.023760458454489708, -0.021716047078371048, -0.017915191128849983, + -0.02478560246527195, -0.028406206518411636, -0.06178540363907814, + 0.044959306716918945, -0.003325885394588113, -0.02079332433640957, + -0.010518986731767654, -0.00242405547760427, -0.0030814141500741243, + 0.0010505993850529194, 0.0034556719474494457, 0.005322635173797607, + -0.02364535629749298, -0.0034255431964993477, -0.04123266786336899, + -0.09191295504570007, 0.03907715901732445, 0.017849568277597427, + 0.003202608088031411, -0.009892004542052746, 0.00447180075570941, + 0.02633223496377468, 0.010955878533422947, 0.08653970807790756, + 0.020712584257125854, 0.0020712309051305056, 0.02159838192164898, + -0.03148637339472771, -0.1106131300330162, -0.0034493962302803993, + 0.008495570160448551, 0.025898300111293793, -0.01585080474615097, + 0.047963947057724, 0.03191608935594559, 0.05672791600227356, + 0.017725899815559387, -0.011000119149684906, 0.05266193300485611, + 
-0.02026527188718319, -0.0076444013975560665, 0.04474569857120514, + 0.0030594514682888985, -0.04981522262096405, -0.0012618869077414274, + -0.028191188350319862, 0.06203592196106911, -0.04548441618680954, + -0.01024117786437273, 0.012013573199510574, -0.03531227633357048, + -0.0303136445581913, 0.01046642567962408, -0.029064299538731575, + -0.015392802655696869, 0.02021191082894802, 0.015328207053244114, + 0.07215247303247452, -0.024603676050901413, -0.0021844934672117233, + 0.01121720764786005, -0.03952696546912193, 0.057082369923591614, + -0.007885781116783619, -0.05230427160859108, -0.0490812249481678, + -0.031762074679136276, -0.009603463113307953, -0.008093117736279964, + 0.030981115996837616, -0.0013626269064843655, -0.019603300839662552, + 0.025847315788269043, 0.012290321290493011, 0.007945788092911243, + 0.011359087191522121, 0.01893901824951172, 0.03544235974550247, + 0.01802144944667816, 0.07854204624891281, 0.07568025588989258, + -0.05122705176472664, 0.008560816757380962, -0.03897644579410553, + 0.024606050923466682, 0.03792334720492363, 0.01617903634905815, + -0.04735874757170677, 0.003156541381031275, -0.011881450191140175, + -0.026287615299224854, 0.030718199908733368, 0.04659629613161087, + -0.006789658684283495, -0.03779527172446251, -0.0030539771541953087, + -0.05919982120394707, -0.03511202707886696, -0.079665407538414, + 0.0580061711370945, 0.07525473088026047, 0.02381243370473385, + 0.04075026884675026, 0.011406723409891129, 0.020479097962379456, + -0.004844597075134516, -0.012301536276936531, -0.07029860466718674, + 0.0063995844684541225, 0.03451430797576904, 0.023998068645596504, + 0.03119623102247715, -0.0481903962790966, 0.03359334170818329, + 0.02563287690281868, 0.04894624277949333, -0.05896732583642006, + -0.020169634371995926, 0.01319917943328619, 0.00783664919435978, + 0.0051937587559223175, 0.0014421058585867286, -0.026831623166799545, + -0.0031287523452192545, 0.024796785786747932, -0.0008400659426115453, + -0.03314085304737091, -0.038561608642339706, -0.013552311807870865, + -0.012201554141938686, -0.010810038074851036, -0.025452986359596252, + 0.058491192758083344, -0.017493925988674164, -0.04302553832530975, + 0.00978845451027155, 0.0001076174812624231, -0.042208705097436905, + -0.011103725992143154, 0.019692320376634598, 0.035941820591688156, + 0.02046988718211651, -0.013415461406111717, 0.013622494414448738, + 0.03867186978459358, -0.019314907491207123, 0.018686568364501, + -0.026298167183995247, 0.052582357078790665, 0.027494613081216812, + 0.046435534954071045, 0.03811647742986679, 0.0486551970243454, + 0.0019721186254173517, -0.017199190333485603, -0.006901210639625788, + 0.025136850774288177, 0.0804467722773552, 0.061168037354946136, + 0.017717817798256874, -0.06603220850229263, -0.05064086616039276, + 0.039705704897642136, -0.024581512436270714, 0.09781734645366669, + -0.030609596520662308, 0.006824797950685024, -0.004317844286561012, + 0.0027715987525880337, 0.000943489489145577, -0.013181749731302261, + -0.025556521490216255, -0.028432460501790047, -0.03751988708972931, + -0.019560452550649643, -0.0136410528793931, -0.024382753297686577, + 0.02857314422726631, -0.011391760781407356, -0.0005634031840600073, + 0.03159620240330696, -0.005198314320296049, -0.006265239790081978, + 0.025611309334635735, -0.058613672852516174, -0.04532675817608833, + -0.021873218938708305, 0.02903951145708561, -0.0121288001537323, + 0.04538734629750252, -0.027941465377807617, -0.07353822141885757, + 0.03773269057273865, -0.00512319291010499, 0.028662901371717453, + 
0.04792957380414009, -0.01053295936435461, -0.035430196672677994, + 0.009273026138544083, 0.004318219143897295, 0.08100441098213196, + 0.014380274340510368, -0.03378414362668991, -0.01987980492413044, + 0.021573858335614204, 0.02855539321899414, -0.007542841136455536, + 0.004633236676454544, 0.008639613166451454, 0.006394797004759312, + 0.019281607121229172, -0.015256315469741821, -0.00148143304977566, + -0.02985287643969059, 0.025533605366945267, -0.011111553758382797, + 0.01765139028429985, 0.05442074313759804, -0.09088895469903946, + 0.032542143017053604, 0.0574481226503849, 0.027539772912859917, + 0.028780700638890266, 0.04139337316155434, -0.014601831324398518, + -0.04883953556418419, -0.015341846272349358, -0.03965975344181061, + -0.000157194648636505, 0.0576823353767395, 0.0886307954788208, + 0.008708767592906952, -0.03148962929844856, 0.00118768191896379, + -0.009285139851272106, 0.0349595844745636, 0.00010961518273688853, + -0.007748626638203859, 0.08073285222053528, -0.026761949062347412, + 0.008831962943077087, -0.04131530225276947, -0.00733856251463294, + 0.06809361279010773, 0.03708426281809807, 0.06835067272186279, + 0.006261076312512159, 0.015920374542474747, -0.004048558417707682, + -0.02677253447473049, 0.07650309801101685, -0.023148853331804276, + -0.057494066655635834, 0.014478741213679314, -0.007499868981540203, + 0.02524508722126484, 0.04229635000228882, 0.017856169492006302, + -0.048948079347610474, -0.014559978619217873, 0.05829133093357086, + -0.007394919637590647, 0.008901085704565048, 0.03540206328034401, + 0.011715879663825035, 0.03447958827018738, -0.05490283668041229, + -0.0033728398848325014, -0.029942180961370468, -0.0025663028936833143, + 0.02124219387769699, 0.02587033249437809, -0.03495795652270317, + -0.01766275428235531, 0.05129474401473999, 0.050688110291957855, + -0.04483504965901375, -0.01242926623672247, -0.10075337439775467, + -0.039148375391960144, -0.01708081364631653, 0.022980742156505585, + 0.0012766321888193488, -0.05624091997742653, 0.02920171432197094, + -0.0004628047754522413, -0.012556084431707859, -0.004911895841360092, + -0.006203844211995602, 0.011994684115052223, -0.005147894844412804, + -0.0653131902217865, 0.021533435210585594, 0.027332814410328865, + 0.010824107564985752, 0.06882979720830917, 0.002402055310085416, + 0.06736285239458084, 0.007376556750386953, -0.09907388687133789, + 0.0738959014415741, 0.011251496151089668, -0.089520663022995, + 0.07383604347705841, -0.02708776667714119, 0.039623651653528214, + 0.001983445603400469, 0.034104056656360626, -0.10747380554676056, + -0.01417585276067257, -0.04512251541018486, 0.001210794085636735, + 0.05437818914651871, -0.004397509153932333, 0.03336326405405998, + 0.06369500607252121, 0.014758906327188015, -0.009938295930624008, + 0.027304060757160187, -0.025614865124225616, -0.019298158586025238, + -0.03774742782115936, -0.021663375198841095, 0.004094315692782402, + -0.05966729298233986, 0.025774789974093437, 0.004207789432257414, + 0.03211497142910957, 0.0222456231713295, -0.07142850011587143, + -0.02816791646182537, 0.008917749859392643, -0.03328888863325119, + -0.04815923050045967, 0.055692847818136215, 0.005043115001171827, + 0.007406118791550398, -0.10431766510009766, 0.003935595508664846, + -0.07654271274805069, 0.018002459779381752, -0.026796353980898857, + -0.01480060163885355, -0.003071046667173505, -0.031164491549134254, + -0.004783581010997295, 0.028996651992201805, -0.00432590302079916, + 0.03827083855867386, -0.10670920461416245, -0.009844367392361164, + 0.05061553791165352, 
0.044921379536390305, -0.021305931732058525, + -0.010570063255727291, -0.037161700427532196, 0.03244778513908386, + -0.026579100638628006, -0.021481862291693687, -0.0803975760936737, + 0.0602957159280777, -0.030482472851872444, -0.016915978863835335, + -0.03414126858115196, -0.07902093976736069, -0.05861777812242508, + 0.02578902617096901, -0.006971773691475391, 0.021272379904985428, + 0.004250700585544109, -0.04617677628993988, -0.025576869025826454, + 0.09362083673477173, -0.02747775986790657, -0.010427952744066715, + -0.005847673863172531, 0.03786874935030937, -0.023494398221373558, + -0.03599749505519867, 0.06915943324565887, -0.0005259242025204003, + -0.020210636779665947, 0.012060794048011303, -0.03892034292221069, + -0.03120974451303482, 0.03798247501254082, 0.08222217112779617, + -0.03596770018339157, -0.001334474771283567, 0.06480739265680313, + 0.016418466344475746, 0.017459729686379433, -0.05252225697040558, + -0.05017365887761116, -0.013104243203997612, -0.023724595084786415, + 0.006762322038412094, -0.005946440156549215, 0.006083739455789328, + -0.014293180778622627, 0.041052985936403275, -0.02417348138988018, + 0.03206375241279602, -0.05389661714434624, 0.0052835363894701, + -0.038879118859767914, 0.0735245794057846, -0.004067298024892807, + -0.007775747217237949, 0.03442619740962982, -0.013299554586410522, + -0.01858234964311123, 0.015830783173441887, 0.050536882132291794, + 0.039121512323617935, 0.025488585233688354, -0.083254374563694, + -0.0376444049179554, -0.03673558309674263, 0.02858356013894081, + -0.0017373028676956892, -0.029856612905859947, -0.020456591621041298, + 0.014330082572996616, 0.026495426893234253, -0.029473086819052696, + -0.05005846545100212, 0.036553170531988144, 0.03604103624820709, + -0.014557436108589172, 0.0075491974130272865, 0.02816123701632023, + 0.022223982959985733, -0.010599660687148571, 0.0738152265548706, + 0.09043020755052567, -0.0195071529597044, 0.03981706127524376, + 0.04508437216281891, 0.03942303732037544, 0.016333166509866714, + -0.007340291049331427, -0.041841596364974976, -0.014305119402706623, + -0.005970897153019905, -0.04999639838933945, -0.00753607414662838, + -0.026936067268252373, 0.02390979416668415, -0.02593693509697914, + -0.015126893296837807, -0.035575494170188904, -0.01576480083167553, + 0.01482314057648182, 0.01960604451596737, 0.012122674845159054, + 0.018187053501605988, -0.04843643680214882, -0.032472606748342514, + -0.002006485592573881, -0.003907614853233099, 0.012371492572128773, + -0.03494970500469208, -0.04294227808713913, 0.004812099505215883, + -0.02547234669327736, 0.028849929571151733, -0.021960295736789703, + -0.0013683459255844355, 0.030571121722459793, -0.04714812710881233, + -0.0034763696603477, -0.03908781707286835, -0.04101671278476715, + 0.013097001239657402, 0.004385901615023613, -0.033829864114522934, + -0.04254792630672455, 0.015273491851985455, -0.040665704756975174, + -0.00920754112303257, -0.041413065046072006, -0.013588172383606434, + -0.0017439010553061962, -0.030785854905843735, 0.07103034108877182, + -0.01929519884288311, 0.025452272966504097, -0.022381870076060295, + -0.028560174629092216, -0.011340905912220478, -0.008002392016351223, + -0.013147140853106976, 0.021906575188040733, -0.04703265056014061, + 0.00766343716531992, 0.06170996278524399, -0.004122643731534481, + -0.01931242272257805, 0.03596718981862068, 0.0322248749434948, + -0.026486199349164963, -0.03052559122443199, -0.03129229322075844, + 0.024310404434800148, -0.029317326843738556, -0.07851212471723557, + 0.04514205455780029, 
-0.020682433620095253, -0.04681077226996422, + 0.008951415307819843, 0.04007868468761444, 0.028472354635596275, + 0.009118284098803997, -0.013431325554847717, -0.020488806068897247, + -0.027376288548111916, 0.035840798169374466, -0.03870074450969696, + -0.002007996430620551, -0.017398731783032417, 0.031902845948934555, + -0.07956399768590927, -0.04125808924436569, 0.01867605932056904, + -0.08004764467477798, -0.005094117484986782, 0.03083234466612339, + -0.01335862372070551, -0.04482260346412659, -0.033836718648672104, + 0.02282416820526123, 0.06287918239831924, 0.010162614285945892, + -0.05158773064613342, 0.0008301119669340551, -0.03881796821951866, + 0.002004651352763176, -0.022358564659953117, 0.022438282147049904, + 0.04948568344116211, 0.03556858375668526, 0.04151606187224388, + -0.03065376915037632, 0.0035080660600215197, -0.004210236947983503, + -0.02430005557835102, 0.02775806188583374, 0.0647825375199318, + 0.003446039743721485, -0.015345090068876743, -0.004865385591983795 + ], + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/operations", + "title": "Operations", + "content": "Double the productivity of your Ops teams like IT, HR, etc.\n\nAutomatically Resolve Tickets\nModern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to do the real impactful work of landing star candidates or improving your internal processes.\n\nAI Aided Onboarding\nOne of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help of Generative AI. 
This lets your team feel more empowered and gives time back to the more seasoned members of your team to focus on moving the needle.", + "title_embedding": [ + 0.010730741545557976, 0.018373621627688408, -0.0013679212424904108, + 0.0001091610174626112, 0.034599218517541885, 0.038814906030893326, + -0.03269535303115845, -0.015120825730264187, -0.011999477632343769, + -0.023377608507871628, -0.003536652075126767, 0.019120972603559494, + 0.032052282243967056, -0.03316797688603401, 0.002971385605633259, + -0.06660863012075424, 0.015637297183275223, -0.004499480128288269, + -0.01167437992990017, 0.024017684161663055, 0.011193061247467995, + 0.02649473212659359, 0.04822992533445358, -0.022897351533174515, + 0.04229900613427162, 0.004849326331168413, -0.0013035786105319858, + -0.022214103490114212, -0.03940191864967346, -0.010781657882034779, + 0.049329955130815506, -0.010857120156288147, 0.04519270732998848, + -0.025993647053837776, -0.03483803570270538, -0.08306694775819778, + 0.023532472550868988, 0.01766788400709629, 0.07221196591854095, + -0.001823332509957254, 0.003104567062109709, 0.05640452727675438, + -0.04483344778418541, -0.04334229975938797, 0.019754929468035698, + -0.011862428858876228, 0.03863349184393883, 0.01501200906932354, + 0.03863223269581795, -0.05570034310221672, -0.0267505943775177, + 0.03793827444314957, -0.015061624348163605, -0.025455573573708534, + 0.015559123829007149, -0.01849287934601307, 0.04038143530488014, + 0.026783155277371407, 0.0023170525673776865, -0.005963196512311697, + 0.04137548804283142, 0.01764686405658722, -0.02246052585542202, + -0.012901525013148785, -0.017714571207761765, -0.01027537789195776, + -0.016164785251021385, -0.007979510352015495, 0.021984701976180077, + -0.009949913248419762, 0.08931540697813034, 0.019962448626756668, + 0.03341870754957199, 0.028858954086899757, 0.0011826930567622185, + -0.00010822620242834091, 0.021924806758761406, -0.009182920679450035, + -0.018256863579154015, 0.04718794673681259, 0.020425673574209213, + 0.0002538118860684335, 0.05454721301794052, 0.019339540973305702, + 0.09129136055707932, -0.010062575340270996, 0.011977903544902802, + -0.022689297795295715, -0.02022380940616131, 0.013067576102912426, + 0.005217134952545166, 0.037848640233278275, 0.09128513187170029, + 0.052562836557626724, -0.08711232244968414, -0.0015501821180805564, + 0.00441542686894536, -0.025076182559132576, -0.014110158197581768, + -0.06030100956559181, -0.0010345132322981954, 0.018056612461805344, + 0.03524528071284294, -0.01293126679956913, 0.04700767621397972, + 0.06564126163721085, 0.010113431140780449, 0.018003467470407486, + -0.05646204203367233, -0.02047823928296566, -0.048007529228925705, + 0.05380301922559738, -0.01607179455459118, 0.0008504731231369078, + -0.015249709598720074, -0.005599239841103554, 0.054663073271512985, + -0.036346085369586945, 0.024314023554325104, 0.0768347755074501, + 0.03234424442052841, 0.008295328356325626, -0.0015371616464108229, + 0.043760448694229126, -0.0018124273046851158, -0.017262862995266914, + -0.042646538466215134, 0.025395702570676804, 0.03709862381219864, + -0.03921937197446823, 0.050630342215299606, 0.024703586474061012, + 0.022064829245209694, -0.008263661526143551, 0.03165263310074806, + 0.017121389508247375, -0.06672775000333786, -0.06984685361385345, + 0.11498068273067474, -0.02806312032043934, 0.025353405624628067, + 0.006591377779841423, -0.011244958266615868, -0.014589745551347733, + 0.031125560402870178, -0.06834094226360321, -0.015911657363176346, + 0.0176913570612669, 
-0.022801805287599564, 0.01242455281317234, + 0.001349485362879932, 0.05671858787536621, -0.024975799024105072, + -0.019439268857240677, -0.0038488772697746754, -0.057438429445028305, + -0.004414469934999943, 0.018468070775270462, 0.006111294496804476, + 0.018633801490068436, 0.04350016638636589, 0.04317209869623184, + -0.058885347098112106, 0.06605420261621475, -0.060788847506046295, + -0.059009939432144165, 0.003782198065891862, 0.04133265092968941, + 0.019440937787294388, 0.0046022264286875725, 0.010991339571774006, + -0.0554840974509716, 0.04083068668842316, 0.007237149402499199, + 0.04218628630042076, 0.015044232830405235, 0.0565854087471962, + 0.03397437930107117, 0.017036354169249535, -0.02862199954688549, + -0.0019827275536954403, -0.04851892590522766, 0.020672423765063286, + 0.029321348294615746, -0.04698231443762779, -0.06201909855008125, + 0.01672869734466076, -0.06657315790653229, -0.061089128255844116, + -0.006803641561418772, -0.041214216500520706, 0.025210469961166382, + 0.03659403696656227, -0.056864507496356964, -0.017966432496905327, + -0.04572960361838341, 0.03334927558898926, 0.0010855993023142219, + 0.009142755530774593, -0.045417286455631256, 0.0508459098637104, + -0.0020350103732198477, -0.08134196698665619, -0.014234581962227821, + -0.018346119672060013, 0.027286984026432037, 0.01077864971011877, + 0.025067729875445366, -0.0010993028990924358, 0.055933188647031784, + 0.08128975331783295, -0.00964485201984644, 0.02629624865949154, + 0.02482009492814541, -0.004299841821193695, -0.02121540531516075, + -0.012340475805103779, -0.010211183689534664, -0.033044300973415375, + 0.01627231389284134, 0.0025289515033364296, -0.024867739528417587, + -0.02439618855714798, 0.031935419887304306, 0.015503033064305782, + 0.058931007981300354, 0.0018315770430490375, 0.05652814358472824, + -0.01652582921087742, -0.009255263954401016, -0.03914093226194382, + -0.015466556884348392, 0.04188372567296028, -0.027017222717404366, + -0.022300412878394127, 0.016426892951130867, 0.017171800136566162, + 0.07701553404331207, 0.043874118477106094, 0.07433117181062698, + 0.026014234870672226, 0.013816924765706062, 0.014636811800301075, + -0.036465343087911606, -0.03994021564722061, 0.003269175998866558, + -0.03902558237314224, -0.0039056213572621346, 0.01340826041996479, + -0.012667474336922169, 0.013740241527557373, 0.014225244522094727, + -0.01617649756371975, 0.045204829424619675, 0.0021076719276607037, + -0.03156042471528053, -0.051285773515701294, 0.050252512097358704, + -0.03781714290380478, 0.017089596018195152, 0.028835022822022438, + 0.005575904157012701, 0.05654308199882507, -0.004466162994503975, + 0.05050184950232506, 0.011318539269268513, -0.049033407121896744, + 0.022067567333579063, 0.011711984872817993, 0.005116294138133526, + -0.02025405503809452, -0.010977067984640598, -0.030916348099708557, + 0.05374428257346153, 0.00808737613260746, -0.044140078127384186, + 0.030092595145106316, -0.01702306419610977, -0.020538438111543655, + 0.00427399855107069, 0.002780117094516754, -0.005801517982035875, + -0.04212724789977074, 0.024279015138745308, -6.510222738143057e-5, + -0.020652174949645996, -0.006957880686968565, 0.0016846335493028164, + -0.0663430467247963, -0.0335371308028698, 2.869481068046298e-5, + -0.037348829209804535, 0.039201609790325165, -0.000545984017662704, + 0.02819826453924179, 0.04861782118678093, 0.08528425544500351, + 0.012239249423146248, 0.014359706081449986, -0.019882500171661377, + -0.03053932823240757, 0.04383694753050804, -0.05820803344249725, + 
-0.0014366158284246922, 0.06986244767904282, 0.06611118465662003, + 0.024454524740576744, -0.02059408277273178, -0.016752762719988823, + -0.01459463406354189, -0.02889612317085266, 0.04970743879675865, + -0.028303874656558037, 0.03978912904858589, -0.07312510907649994, + 0.05429210141301155, 0.00571118388324976, -0.02056923136115074, + 0.053339049220085144, -0.018249599263072014, -0.021281961351633072, + -0.05355891212821007, 0.0478244312107563, -0.06067035347223282, + -0.0008842060924507678, 0.04420148581266403, 0.000588231545407325, + 0.056371595710515976, 0.014279269613325596, -0.05001707002520561, + -0.037117116153240204, 0.030044561251997948, -0.05617158114910126, + 0.02152038738131523, 0.017715860158205032, 0.02034214325249195, + 0.021688375622034073, -0.024712584912776947, -0.03572659194469452, + -0.03006441332399845, 0.11338905245065689, 0.029146766290068626, + -0.01698526367545128, -0.05355465039610863, -0.0036588346119970083, + -0.0032888834830373526, -0.022690337151288986, -0.05653419718146324, + 0.053955987095832825, -0.010936236009001732, -0.05121518298983574, + -0.04233774170279503, -0.0650610476732254, -0.009094701148569584, + -0.00337587739340961, 0.05269891023635864, 0.05860234051942825, + 0.015521558932960033, -0.038103096187114716, -0.029688136652112007, + -0.011934547685086727, -0.05070113763213158, 0.025371648371219635, + 0.03601797670125961, 0.021862944588065147, -0.020703352987766266, + 0.05289195850491524, -0.004169228952378035, -0.03900706395506859, + -0.0029100535903126, 0.003359412308782339, -0.03696342185139656, + 0.007729679811745882, 0.005611894652247429, 0.04680318757891655, + 0.002203753450885415, -0.057359859347343445, 0.0586426742374897, + 0.027003217488527298, -0.030223455280065536, 0.06817735731601715, + 0.03476058319211006, -0.011411039158701897, -5.9986756241414696e-5, + -0.03633617237210274, -0.016408616676926613, 0.00833839550614357, + 0.0022074412554502487, 0.048158638179302216, -0.031786687672138214, + -0.028205247595906258, -0.056731242686510086, -0.023744143545627594, + -0.00782334990799427, 0.03123593144118786, -0.016950296238064766, + 0.01794753596186638, 0.04291892051696777, 0.0099559361115098, + -0.0012914348626509309, -0.004629518836736679, -0.05153423175215721, + 0.058259084820747375, 0.056788213551044464, -0.0333746112883091, + 0.030548732727766037, 0.01471715047955513, -0.002818142296746373, + -0.013207555748522282, 0.022568998858332634, 0.025705108419060707, + -0.014197800308465958, -0.02527414821088314, 0.0009442049195058644, + 0.005861984565854073, 0.00919109396636486, 0.012627449817955494, + 0.017443764954805374, -0.0162491612136364, 0.05474800989031792, + 0.02047792077064514, 0.0003552198759280145, 0.0005291366251185536, + 0.003957713954150677, -0.073692187666893, 0.044466596096754074, + 0.02759671024978161, 0.03048691712319851, 0.024890316650271416, + -0.04379572719335556, -0.004758962895721197, -0.012311465106904507, + 0.003943922929465771, 0.035470347851514816, -0.06701556593179703, + -0.0010001214686781168, 0.06592956185340881, 0.008649672381579876, + 0.005259859841316938, -0.00453479727730155, -0.026313234120607376, + 0.0128184137865901, -0.04696577414870262, 0.0357656292617321, + -0.007445288822054863, 0.10806939005851746, 0.0789022147655487, + -0.01642726920545101, -0.042041581124067307, -0.015044954605400562, + -0.020660532638430595, -0.01043805480003357, -0.011654903180897236, + -0.05754747614264488, 0.06964936852455139, 0.03990045189857483, + -0.0017825361574068666, -0.0382373072206974, 0.005986799951642752, + 
0.022415796294808388, -0.011907496489584446, -0.015296644531190395, + 0.005779241677373648, 0.051009099930524826, -0.013554011471569538, + -0.036914244294166565, -0.018791811540722847, -0.03514847159385681, + -0.050969723612070084, 0.07429437339305878, -0.014117387123405933, + 0.01858820766210556, 0.029953083023428917, 0.013503451831638813, + 0.024872425943613052, -0.01447504572570324, -0.03305073082447052, + -0.03757826238870621, 0.009820172563195229, 0.004212466534227133, + 0.009773447178304195, -0.03176327049732208, -0.00981978140771389, + -0.018437808379530907, -0.016843365505337715, 0.04063236713409424, + 0.03664008155465126, -0.017759712412953377, 0.017491042613983154, + 0.017961829900741577, -0.007718573324382305, 0.08410634845495224, + 0.04713455215096474, -0.030763784423470497, -0.07868245244026184, + -0.04293506219983101, -0.02663402259349823, 0.06502995640039444, + 0.04134368151426315, -0.0075789024122059345, 0.037559330463409424, + -0.022960234433412552, -0.03559660166501999, 0.02433147467672825, + 0.004160139709711075, -0.006933990400284529, -0.018853498622775078, + -0.07674053311347961, -0.015214351005852222, -0.0031465317588299513, + -0.0032662826124578714, 0.020626401528716087, -0.023296812549233437, + 0.01533068809658289, -0.018831759691238403, -0.013347934931516647, + 0.05832105129957199, 0.016617508605122566, -0.06054726615548134, + 0.020946228876709938, -0.05081603676080704, 0.0005762121290899813, + -0.013293956406414509, 0.05424816533923149, -0.028400346636772156, + -0.008744322694838047, -0.027219830080866814, -0.009028765372931957, + 0.08349941670894623, -0.02220912277698517, -0.0035634897649288177, + 0.039803411811590195, -0.009209544397890568, -0.05272921174764633, + -0.03965644910931587, -0.05518830195069313, -0.0076417475938797, + 0.011989743448793888, 0.07631900161504745, 0.03853122144937515, + -0.03482687100768089, 0.010916730388998985, -0.012828757055103779, + 0.04919871687889099, 0.05019025877118111, -0.04743821546435356, + -0.024848056957125664, 0.03214584290981293, -0.03514641523361206, + -0.030496522784233093, -0.005360030569136143, -0.03538995608687401, + -0.016368992626667023, -0.07339470833539963, 0.022023534402251244, + -0.012556820176541805, 0.018365638330578804, 0.025001555681228638, + -0.029118210077285767, -0.046666670590639114, 0.008231466636061668, + 0.015543444082140923, -0.018029138445854187, 0.007146908901631832, + -0.009827123954892159, -0.09648268669843674, -0.03867226839065552, + -0.0066386335529387, -0.03292228281497955, 0.022209322080016136, + -0.048995133489370346, -0.03118632733821869, -0.017475144937634468, + 0.005314778070896864, -0.010616753250360489, -0.04724809527397156, + 0.027169331908226013, 0.003503959160298109, -0.06886278837919235, + 0.022409209981560707, 0.008452179841697216, -0.024392019957304, + -0.020817982032895088, -0.004606961738318205, -0.08660633116960526, + 0.031076667830348015, 0.020726440474390984, -0.011223231442272663, + 0.05015091598033905, 0.021451715379953384, 0.049609262496232986, + 0.05560477077960968, -0.05192512646317482, -0.016848105937242508, + 0.009753277525305748, 0.03756638243794441, -0.03334583342075348, + 0.040159404277801514, 0.01897178590297699, -0.056337252259254456, + 0.018127072602510452, 8.343596709892154e-5, 0.02721443772315979, + -0.037742555141448975, 0.01802193559706211, 0.09791397303342819, + -0.025166938081383705, 0.07462649047374725, -0.027766922488808632, + -0.06332777440547943, -0.005818391218781471, -0.018811773508787155, + -0.03425326570868492, 0.003983109258115292, 
-0.034356050193309784, + -0.04613350331783295, 0.006219684612005949, -0.02750561013817787, + 0.03812394291162491, -0.029214290902018547, -0.008362852968275547, + -0.046344004571437836, 0.015400147996842861, -0.027044160291552544, + 0.019668098539114, 0.01860121265053749, -0.02408520132303238, + -0.07096672058105469, -0.0003313044144306332, 0.024360060691833496, + -0.02565479651093483, 0.033272501081228256, -0.009740769863128662, + -0.07754906266927719, -0.005022569093853235, 0.01834244467318058, + -0.0670025572180748, 0.0031950040720403194, 0.016223441809415817, + 0.05652153119444847, 0.05896124988794327, -0.05402825400233269, + -0.007856715470552444, 0.010221654549241066, -0.005885730497539043, + 0.01112558413296938, -0.03978840634226799, -0.008038174360990524, + 0.048503343015909195, -0.04076062887907028, 0.05717281252145767, + 0.02310008741915226, -0.015716947615146637, 0.00578808831050992, + 0.005534487310796976, 0.01627301797270775, 0.012666025198996067, + -0.037932366132736206, -0.02889685146510601, 0.00509311119094491, + -0.015608384273946285, -0.059495046734809875, -0.02494220808148384, + 0.024446364492177963, 0.03732331097126007, -0.006206210236996412, + 0.03822293132543564, -0.030338769778609276, -0.10873781144618988, + -0.021372010931372643, -0.04941859468817711, -0.0004265901807229966, + -0.011848388239741325, -0.040232446044683456, 0.02904931642115116, + -0.0006163326324895024, 0.04501201957464218, -0.0009105035569518805, + -0.034277040511369705, -0.028672119602560997, 0.0012552812695503235, + 0.04327942058444023, 0.0058600720949471, 0.007959491573274136, + -0.0052106245420873165, -0.01613856852054596, 0.029123952612280846, + 0.007203509099781513, 0.006210403982549906, -0.053780049085617065, + -0.003420298220589757, -0.03657878562808037, 0.019055450335144997, + -0.07376986742019653, -0.019189076498150826, -0.007890082895755768, + -0.022230826318264008, 0.006225301884114742, 0.01237472239881754, + 0.06595351547002792, 0.04492981359362602, 0.01869170181453228, + 0.012387770228087902, 0.05166162550449371, -0.06800039112567902, + -0.03918451815843582, -0.056165844202041626, 0.02326592244207859, + -0.015541363507509232, 0.033293239772319794, -0.007216084748506546, + -0.012664951384067535, 0.05342225730419159, 0.009652439504861832, + 0.027964351698756218, -0.016909338533878326, 0.03330600634217262, + -0.060126710683107376, -0.07517267763614655, -0.025813739746809006, + -0.024271255359053612, 0.011216769926249981, -0.020943764597177505, + 0.01686793565750122, -0.06828945130109787, -0.018023250624537468, + -0.004829781129956245, -0.022270847111940384, 0.030936313793063164, + -0.013894669711589813, 0.0368037149310112, -0.05242981016635895, + -0.05051284283399582, 0.06604990363121033, 0.009301775135099888, + -0.014957845211029053, -0.04281012713909149, -0.0006833449588157237, + -0.005769087467342615, -0.010068363510072231, 0.058508969843387604, + 0.01898365654051304, -0.018636951223015785, 0.04689044877886772, + 0.056091975420713425, -0.03881967067718506, -0.019658103585243225, + -0.0003990831028204411, 0.07142409682273865, 0.012017408385872841, + -0.04087359085679054, 0.03531723469495773, 0.030207011848688126, + 0.005139552056789398, -0.07084347307682037, 0.009759706445038319, + -0.074960857629776, 0.05481172725558281, 0.028797954320907593, + -0.016217226162552834, 0.03127933293581009, 0.06848310679197311, + 0.0331764742732048, -0.007261150050908327, -0.023036431521177292, + -0.015215406194329262, -0.056041885167360306, -0.05146646127104759 + ], + "content_embedding": [ + 
-0.035467296838760376, 0.05443365499377251, 0.004722591955214739, + -0.022551164031028748, 0.013528825715184212, -0.01865273527801037, + 0.02473974972963333, -0.0093984454870224, -0.04820428788661957, + -0.011309967376291752, -0.041853927075862885, 0.02753269486129284, + 0.023483864963054657, 0.0086299953982234, -0.004760670010000467, + -0.011422916315495968, 0.02231433242559433, -0.018113020807504654, + 0.027575815096497536, 0.008628769777715206, 0.031078867614269257, + -0.022500742226839066, 0.011097921058535576, -0.020555853843688965, + 0.041947007179260254, 0.007781036198139191, -0.03356451168656349, + 0.05067972093820572, -0.05009249970316887, 0.013387891463935375, + 0.04737459868192673, -0.02956528402864933, -0.012283756397664547, + -0.02492530830204487, -0.051903702318668365, -0.020648062229156494, + 0.07008657604455948, -0.05525917932391167, -0.005968133453279734, + 0.042117420583963394, 0.04600752145051956, -0.023857053369283676, + 0.024955181404948235, -0.019478371366858482, 0.027009692043066025, + -0.0035888778511434793, 0.050264179706573486, -0.026319395750761032, + 0.02974606677889824, -0.03574950620532036, -0.0011404261458665133, + 0.00908538419753313, -0.026853032410144806, -0.01625720039010048, + -0.011155915446579456, 0.012969470582902431, -0.03395452722907066, + 0.004224491771310568, 0.027397319674491882, -0.02826162986457348, + 0.042576149106025696, 0.04274202510714531, -0.023343440145254135, + 0.031393349170684814, 0.003865004051476717, -0.068922258913517, + 0.021687647327780724, -0.006593589670956135, 0.014760294929146767, + -0.007734894752502441, -0.0031233499757945538, -0.014138679951429367, + 0.01479700393974781, 0.05785622447729111, -0.04781193286180496, + -0.02295715920627117, -0.02882302924990654, -0.018427176401019096, + -0.018964029848575592, 0.06096252053976059, 0.05383418872952461, + -0.0870966985821724, 0.0031813366804271936, 0.01873805560171604, + 0.046315208077430725, -0.016668183729052544, 0.007771935313940048, + -0.008389935828745365, -0.03101789578795433, 0.12752030789852142, + 0.03457779064774513, 0.03240315988659859, 0.048814207315444946, + 0.05700814723968506, -0.06329526007175446, 0.057182129472494125, + -0.027157682925462723, -0.008035550825297832, -0.02922128140926361, + -0.033899255096912384, -0.013119292445480824, 0.05552409961819649, + 0.04266372323036194, -0.003065067809075117, 0.03790399059653282, + 0.01804651878774166, 0.018934324383735657, 0.04061003401875496, + 0.03233874961733818, 0.017353640869259834, -0.045026157051324844, + 0.0002423059631837532, 0.00812580157071352, 0.017658300697803497, + -0.007964730262756348, 0.01015512179583311, 0.04046032205224037, + -0.06913582980632782, 0.004168798215687275, 0.03317571058869362, + 0.012463781051337719, -0.020389260724186897, -0.022882815450429916, + -0.015693804249167442, 0.0500093474984169, 0.05442529916763306, + -0.05275300145149231, -0.0008568991324864328, -0.009641895070672035, + -0.003380047157406807, -0.019793419167399406, 0.0063031697645783424, + -0.03327865153551102, 0.016263391822576523, 0.03218098729848862, + 0.022281551733613014, -0.06236790120601654, -0.06712637841701508, + 0.013925489969551563, 0.01688770391047001, 0.01467123068869114, + 0.029035737738013268, -0.013510127551853657, -0.0371820330619812, + 0.01489016879349947, -0.05226032808423042, -0.021125929430127144, + -0.002014611614868045, -0.05400311201810837, 0.019783688709139824, + -0.0014786357060074806, -0.01895768567919731, 0.01083068735897541, + -0.005890357308089733, 0.003382777562364936, -0.04273455590009689, + 
-0.0478648841381073, 0.006365248002111912, 0.04027433693408966, + 0.022969869896769524, 0.06722807139158249, 0.02113204449415207, + -0.03740633279085159, 0.0656556561589241, -0.014887429773807526, + 0.022357333451509476, 0.036207813769578934, 0.009992213919758797, + 0.03665810078382492, 0.07260408997535706, -0.005702183116227388, + -0.00880548357963562, 0.033025965094566345, -0.017093362286686897, + 0.039024271070957184, 0.04158668965101242, 0.008214588277041912, + -0.017436640337109566, 0.05074054002761841, -0.021646566689014435, + 0.0577220693230629, -0.06182146817445755, 0.03490613400936127, + 0.009574614465236664, -0.05135552957653999, -0.016593866050243378, + 0.001374077401123941, -0.0582745335996151, 0.009521838277578354, + -0.04114346206188202, 0.05998831242322922, -0.017294712364673615, + -0.017998410388827324, -0.036417942494153976, -0.014111478812992573, + -0.035168007016181946, 0.04580182209610939, 0.006420421414077282, + 0.012049577198922634, -0.03691839054226875, -0.041549112647771835, + -0.02593182772397995, -0.07608001679182053, -0.004325924441218376, + -0.029293090105056763, 0.05871257558465004, 0.04561365023255348, + -0.018353786319494247, 0.018815817311406136, -0.02768997848033905, + 0.10190171003341675, 0.05626858025789261, -0.006779504008591175, + 0.005354198161512613, -0.053908295929431915, -0.03453999012708664, + 0.02781379222869873, -0.04989396408200264, -0.03950505331158638, + 0.02282608300447464, 0.006389955058693886, -0.007375086657702923, + -0.016243990510702133, 0.028544900938868523, -0.020701216533780098, + 0.012176213786005974, -0.019373498857021332, 0.023940887302160263, + -0.03986368328332901, -0.0040043736808001995, -0.02535220980644226, + -0.08186554163694382, 0.07860240340232849, 0.004978376906365156, + 0.06400016695261002, -0.02991490252315998, -0.03288387134671211, + 0.03286135569214821, 0.0247455183416605, 0.08134172111749649, + 0.062203265726566315, 0.017080431804060936, -0.003969072364270687, + -0.024894852191209793, -0.07480036467313766, 0.03882874175906181, + -0.0074541885405778885, -0.011322571896016598, -0.06285038590431213, + 0.004618136677891016, -0.019962741062045097, 0.01853892207145691, + 0.002305575180798769, 0.023541608825325966, 0.017216432839632034, + -0.044929757714271545, -0.022532327100634575, 0.05122198164463043, + 0.0008766956743784249, -0.025474006310105324, 0.04030180349946022, + -0.013362268917262554, 0.049265045672655106, 0.001589711057022214, + 0.014848759397864342, 0.0022126054391264915, -0.028914116322994232, + -0.009881545789539814, -0.010318109765648842, 0.006385906133800745, + 0.010556558147072792, 0.007666149642318487, 0.016665013507008553, + 0.09090837836265564, 0.008256189525127411, -0.006583006586879492, + 0.0044986652210354805, -0.0336960032582283, 0.047732625156641006, + -0.007069372106343508, -0.044969744980335236, -0.0907006487250328, + -0.04223865643143654, 0.007469010539352894, 0.006147805601358414, + 0.04827409237623215, 0.03519561514258385, 0.032267000526189804, + 0.05173507332801819, -0.016001909971237183, 0.034578241407871246, + -0.02854917198419571, -0.01587686315178871, 0.03134807571768761, + -0.010930678807199001, 0.044245973229408264, 0.1186295673251152, + -0.031642355024814606, 0.01669827103614807, -0.026775898411870003, + -0.002936996053904295, -0.013815718702971935, -0.009453569538891315, + -0.035879991948604584, -0.02232815884053707, -0.009286822751164436, + -0.01117252279073, 0.014098073355853558, 0.023366371169686317, + 0.022420832887291908, -0.029833031818270683, 0.0013570807641372085, + 
-0.0211170744150877, -0.027633074671030045, -0.02915397845208645, + 0.023663034662604332, 0.04199281334877014, -0.0311698317527771, + 0.025238486006855965, -0.00992826372385025, 0.005087476689368486, + -0.050041183829307556, -0.018602682277560234, -0.06774407625198364, + -0.019117988646030426, 0.08245334029197693, 0.030311768874526024, + 0.020432988181710243, -0.03738946095108986, 0.04251522198319435, + 0.002886879490688443, 0.0938342958688736, -0.05836429446935654, + -0.0232597254216671, 0.0074686286970973015, -0.0020157117396593094, + -0.013439277186989784, 0.02590363286435604, 0.0034541902132332325, + 0.002955070696771145, -0.0020802158396691084, -0.011827156879007816, + -0.06622112542390823, -0.05226997658610344, -0.031827233731746674, + 0.0012936017010360956, 0.01702217012643814, -0.016136569902300835, + 0.04939497634768486, 0.006943605840206146, -0.05098084360361099, + 0.03143058344721794, -0.004094736184924841, -0.011557313613593578, + 0.000757173984311521, -0.01120754610747099, 0.036905039101839066, + 0.02395678497850895, 0.009881307370960712, 0.05312298238277435, + 0.05778184533119202, 0.02520277164876461, 0.020175758749246597, + -0.025740133598446846, 0.04891965910792351, 0.05155428871512413, + 0.04089348018169403, 0.06249197572469711, 0.02368168905377388, + -0.03247880935668945, 0.019493652507662773, -0.06181112304329872, + 0.019057979807257652, 0.037210095673799515, 0.02910085767507553, + 0.02495957538485527, -0.08179862797260284, -0.007498551160097122, + 0.036072876304388046, -0.016061626374721527, 0.0725645050406456, + 0.005645937751978636, 0.0032921379897743464, -0.029446475207805634, + 0.020205944776535034, 0.002388844033703208, -0.029442399740219116, + -0.031475961208343506, 0.024486446753144264, -0.038434699177742004, + -0.05131153389811516, 0.00129043054766953, -0.01692604087293148, + -0.007422945462167263, -0.02433120459318161, 0.004650108516216278, + -0.04251663386821747, -0.009143602102994919, 0.017507102340459824, + 0.042100246995687485, -0.06103592365980148, -0.0508011095225811, + -0.000937039265409112, 0.025160834193229675, -0.048878248780965805, + 0.010422220453619957, -0.014773974195122719, -0.06574267894029617, + 0.0027621325571089983, -0.0019821953028440475, 0.006184928119182587, + 0.024707674980163574, -0.022308405488729477, -0.06509386748075485, + 0.04186487942934036, 0.0023416660260409117, 0.0650840550661087, + 0.03807358071208, -0.024585191160440445, -0.017596496269106865, + 0.005341595038771629, 0.03675152733922005, 0.06293662637472153, + 0.010854244232177734, -0.050199203193187714, -0.037359531968832016, + 0.017929432913661003, 0.023822667077183723, 0.019726712256669998, + -0.00759292533621192, 0.043509598821401596, -0.0014670701930299401, + -0.0006681938539259136, -0.0055070724338293076, -0.07182206958532333, + 0.037307076156139374, 0.06350742280483246, 0.049223095178604126, + 0.017340589314699173, 0.05529596656560898, 0.023639194667339325, + -0.02478986792266369, -0.02248029224574566, -0.042737238109111786, + -0.0018032155930995941, 0.05576873943209648, 0.12722158432006836, + 0.004959811456501484, -0.033451229333877563, -0.007337609305977821, + 0.018852578476071358, 0.031502317637205124, 0.013375848531723022, + -0.0066598327830433846, 0.07771285623311996, -0.017693838104605675, + -0.030503049492836, -0.04393269121646881, 0.013323146849870682, + 0.010107941925525665, 0.02004137821495533, 0.0377974770963192, + 0.013478322885930538, 0.024949608370661736, -0.01634461060166359, + -0.015226340852677822, 0.013924108818173409, 0.0038409747648984194, + 
-0.00358059024438262, -0.005057516973465681, -0.008272752165794373, + 0.04373026266694069, -0.0014998909318819642, 0.009924792684614658, + -0.040317751467227936, -0.04541180655360222, 0.06625904142856598, + -0.028080880641937256, 0.04732294753193855, 0.0047000702470541, + 0.02857903018593788, 0.004553706850856543, -0.04188435524702072, + 0.023083623498678207, -0.060619525611400604, 0.01963491179049015, + -0.008608276024460793, 0.0034778753761202097, -0.016133952885866165, + 0.009059683419764042, -0.0009118590969592333, 0.08675801753997803, + 0.004535067826509476, -0.021998068317770958, -0.0789642184972763, + -0.033289894461631775, -0.04486677423119545, 0.014945252798497677, + 0.04513613134622574, -0.05311649665236473, 0.009399711154401302, + 0.004345519933849573, -0.004021052736788988, 0.01709410734474659, + -0.020986782386898994, -0.011685016565024853, -0.02048366330564022, + -0.08524532616138458, -0.004114300478249788, 0.040531281381845474, + -0.0005771859432570636, 0.02984555996954441, -0.0002479814866092056, + 0.04756562039256096, -0.013039377517998219, -0.09513615071773529, + 0.07444311678409576, 0.0044719018042087555, -0.09768522530794144, + 0.04403488337993622, 0.013910059817135334, 0.06657753884792328, + 0.026994489133358, 0.03657658398151398, -0.11561834812164307, + 0.02878704108297825, -0.012593223713338375, -0.01532658003270626, + 0.06045927479863167, -0.04569881781935692, -0.0029045850969851017, + 0.06762480735778809, 0.012874988839030266, -0.011422640644013882, + 0.025211291387677193, -0.07570745050907135, -0.018061399459838867, + -0.033531878143548965, -0.010049374774098396, 0.02582205832004547, + -0.015443898737430573, 0.029427431523799896, -0.02071801945567131, + 0.02054932527244091, 0.017246615141630173, -0.07276910543441772, + -0.053859222680330276, 0.060189153999090195, -0.04336293414235115, + -0.06396458297967911, 0.08002400398254395, -0.041166432201862335, + 0.000430541840614751, -0.10547704994678497, -0.014112395234405994, + -0.06070064380764961, 0.01796649396419525, -0.045275188982486725, + -0.0018861661665141582, -0.0022482818458229303, 0.004191190470010042, + 6.745498831151053e-5, 0.07350871711969376, -0.01797996647655964, + 0.03183342143893242, -0.10409794747829437, -0.0291685052216053, + 0.02071727253496647, 0.021267961710691452, -0.021560702472925186, + -0.05026571452617645, -0.013422243297100067, -0.0011607048800215125, + 0.016948888078331947, -0.01588856242597103, -0.05063013359904289, + 0.05952488258481026, -0.05575632303953171, -0.06906414031982422, + -0.004353572614490986, -0.02773641049861908, -0.043097492307424545, + 0.03103402815759182, 0.02164989709854126, 0.0013185666175559163, + 0.02606332302093506, -0.059723641723394394, -0.008657965809106827, + 0.06580374389886856, -0.020771000534296036, -0.022305399179458618, + 0.008068420924246311, -0.004975682124495506, -0.033461254090070724, + -0.040884874761104584, 0.052932899445295334, -0.0054899416863918304, + -0.03155453875660896, 0.002439886098727584, -0.0361575223505497, + -0.03652369976043701, -0.010043974034488201, 0.01681465655565262, + 3.9381829992635176e-5, 0.011527255177497864, 0.06904088705778122, + -0.005501871462911367, 0.0259085800498724, -0.021282166242599487, + -0.03796657174825668, -0.002881726250052452, -0.018672630190849304, + -0.003463461296632886, -0.008101037703454494, -0.019035371020436287, + -0.0025111068971455097, 0.03926572576165199, -0.0513470396399498, + 0.04829537495970726, -0.001188569120131433, -0.0121685229241848, + -0.059901442378759384, 0.07364466786384583, 
0.006562754046171904, + 0.00707247294485569, 0.028408123180270195, -0.02494397945702076, + -0.04187498614192009, 0.0066386316902935505, 0.06244710460305214, + 0.02900586649775505, 0.04932861402630806, -0.04402685537934303, + -0.006739918142557144, -0.0502609983086586, -0.0015801729168742895, + -0.026301531121134758, -0.024203499779105186, -0.019028285518288612, + 0.055076178163290024, 0.009030332788825035, -0.04907704144716263, + -0.019399652257561684, 0.009713590145111084, 0.05042042210698128, + -0.00020382895309012383, -0.010405965149402618, 0.01872927136719227, + 0.002546734409406781, 0.020958390086889267, 0.0634453296661377, + 0.04931068792939186, -0.014592095278203487, 0.0075549716129899025, + 0.02017839439213276, 0.03344761207699776, -0.005005223676562309, + 0.01818416453897953, -0.05528895929455757, 0.03879536688327789, + 0.018610917031764984, -0.029319677501916885, -0.00493574095889926, + 0.01762193627655506, 0.008898349478840828, -0.017192110419273376, + -0.03400791808962822, -0.026591692119836807, -0.03768239915370941, + 0.007602880708873272, -0.020310858264565468, -0.0036565649788826704, + -0.00616755336523056, -0.057577136904001236, 0.008387535810470581, + 0.021555650979280472, -0.01923108845949173, -0.019822189584374428, + -0.03861076384782791, -0.04258895292878151, 0.0005390863516367972, + -0.009946192614734173, 0.04911184310913086, -0.05009220167994499, + 0.00297548552043736, 0.019344164058566093, 0.005506082437932491, + 0.018321573734283447, -0.027131471782922745, -0.052699681371450424, + -0.02292790077626705, 0.0568309910595417, -0.008538461290299892, + -0.05772045999765396, 0.018903164193034172, -0.03690820932388306, + -0.034110669046640396, -0.008381159976124763, 0.03926640748977661, + 0.04370100051164627, -0.007475440856069326, 0.06952399015426636, + -0.0031064660288393497, 0.040785301476716995, -0.008955440483987331, + -0.016698094084858894, -0.007412049453705549, 0.022290483117103577, + 0.006028760224580765, -0.019992463290691376, -0.04154061898589134, + -0.023284243419766426, 0.04972238838672638, 0.0008079080143943429, + -0.0057194954715669155, 0.037808094173669815, -0.00983867421746254, + -0.030191699042916298, 0.014910571277141571, 0.0004001195775344968, + 0.08586109429597855, -0.014850648120045662, -0.07815773040056229, + 0.05393945425748825, -0.019953783601522446, 0.0016224493738263845, + 0.018219690769910812, 0.014311570674180984, 0.05567210912704468, + 0.004363455809652805, 0.01872050203382969, -0.05933142080903053, + -0.04872509092092514, 0.02652469463646412, -0.04346488043665886, + -0.027931908145546913, -0.03355146571993828, 0.004589339718222618, + -0.05587214604020119, -0.015419036149978638, 0.04789341986179352, + -0.059666525572538376, 0.00552733987569809, 0.012681040912866592, + 0.007240649312734604, -0.0045613935217261314, -0.060716547071933746, + 0.03223521634936333, 0.024270178750157356, -0.025784391909837723, + -0.01736401580274105, -0.0014227400533854961, -0.011367680504918098, + -0.035415612161159515, -0.01793254353106022, 0.033805977553129196, + -0.0080083217471838, 0.021929381415247917, 0.012236963026225567, + 0.002641203347593546, 0.0067292568273842335, -0.007680798415094614, + -0.02231515571475029, 0.023524953052401543, 0.010132606141269207, + 0.0030864113941788673, -0.03816894069314003, -0.0007171767647378147 + ], + "chunk_ind": 0 + } +] diff --git a/backend/danswer/seeding/initial_docs_cohere.json b/backend/danswer/seeding/initial_docs_cohere.json new file mode 100644 index 00000000000..cb8331b6e43 --- /dev/null +++ 
b/backend/danswer/seeding/initial_docs_cohere.json @@ -0,0 +1,44 @@ +[ + { + "url": "https://docs.danswer.dev/more/use_cases/overview", + "title": "Use Cases Overview", + "content": "How to leverage Danswer in your organization\n\nDanswer Overview\nDanswer is the AI Assistant connected to your organization's docs, apps, and people. Danswer makes Generative AI more versatile for work by enabling new types of questions like \"What is the most common feature request we've heard from customers this month\". Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\nDanswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\nCommon Getting Started Questions:\n\nWhy are these docs connected in my Danswer deployment?\nAnswer: This is just an example of how connectors work in Danswer. You can connect up your own team's knowledge and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date and in sync with your connected applications.\n\nIs my data being sent anywhere when I connect it up to Danswer?\nAnswer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know exactly what is going on with their data. By default all of the document processing happens within Danswer. The only time it is sent outward is for the GenAI call to generate answers.\n\nWhere is the feature for auto sync-ing document level access permissions from all connected sources?\nAnswer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. If you are on Danswer Cloud, you have access to them by default. If you're running it yourself, reach out to the Danswer team to receive access.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/enterprise_search", + "title": "Enterprise Search", + "content": "Value of Enterprise Search with Danswer\n\nWhat is Enterprise Search and why is it Important?\nAn Enterprise Search system gives team members a single place to access all of the disparate knowledge of an organization. Critical information is saved across a host of channels like call transcripts with prospects, engineering design docs, IT runbooks, customer support email exchanges, project management tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\nSince it quickly becomes infeasible to check across every source, decisions get made on incomplete information, employee satisfaction decreases, and the most valuable members of your team are tied up with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this problem by letting anyone on the team access all of the knowledge across your organization in a permissioned and secure way. Users can ask questions in natural language and get back answers and documents across all of the connected sources instantly.\n\nWhat's the real cost?\nA typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of incomplete or incorrect information can be extremely high. 
Customer support/success that isn't able to find the reference to similar cases could cause hours or even days of delay leading to lower customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar feature had previously been built could result in weeks of wasted development time and tech debt with duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark - inefficient and mistake prone.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/enterprise_search", + "title": "Enterprise Search", + "content": "More than Search\nWhen analyzing the entire corpus of knowledge within your company is as easy as asking a question in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial to identify where knowledge is well documented and where it is lacking. Team members who are centers of knowledge can begin to effectively document their expertise since it is no longer being thrown into a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\nWith Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even the most junior members can understand it. This in turn lets them give the most holistic and technically accurate response possible to your customers. On the other end, even the super stars of your sales team will not be able to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it in mere seconds and give crucial context to help your team close.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/ai_platform", + "title": "AI Platform", + "content": "Build AI Agents powered by the knowledge and workflows specific to your organization.\n\nBeyond Answers\nAgents enabled by generative AI and reasoning capable models are helping teams to automate their work. Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\nDanswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\nFlexibility and Extensibility\nDanswer is open source and completely whitebox. This not only gives transparency to what happens within the system but also means that your team can directly modify the source code to suit your unique needs.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/customer_support", + "title": "Customer Support", + "content": "Help your customer support team instantly answer any question across your entire product.\n\nAI Enabled Support\nCustomer support agents have one of the highest breadth jobs. They field requests that cover the entire surface area of the product and need to help your users find success on extremely short timelines. Because they're not the same people who designed or built the system, they often lack the depth of understanding needed - resulting in delays and escalations to other teams. 
Modern teams are leveraging AI to help their CS team optimize the speed and quality of these critical customer-facing interactions.\n\nThe Importance of Context\nThere are two critical components of AI copilots for customer support. The first is that the AI system needs to be connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as pull requests in a code repository. The second critical component is the ability of the AI system to break down difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able to chat back and forth with the system to build a better understanding.\n\nDanswer takes care of both of these. The system connects up to over 30+ different applications and the knowledge is pulled in constantly so that the information access is always up to date.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/sales", + "title": "Sales", + "content": "Keep your team up to date on every conversation and update so they can close.\n\nRecall Every Detail\nBeing able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through hours of transcripts in preparation for a call, your team can now ask Danswer \"What specific features was ACME interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building topics by asking \"What rapport building topic did we chat about in the last call with ACME\".\n\nKnow Every Product Update\nIt is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live on call because of how fast accessing information becomes. A question like \"Have we shipped the Microsoft AD integration yet?\" can now be answered in seconds meaning that prospects can get answers while on the call instead of asynchronously and sales cycles are reduced as a result.", + "chunk_ind": 0 + }, + { + "url": "https://docs.danswer.dev/more/use_cases/operations", + "title": "Operations", + "content": "Double the productivity of your Ops teams like IT, HR, etc.\n\nAutomatically Resolve Tickets\nModern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to do the real impactful work of landing star candidates or improving your internal processes.\n\nAI Aided Onboarding\nOne of the periods where your team needs the most help is when they're just ramping up. 
Instead of feeling lost in dozens of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to focus on moving the needle.", + "chunk_ind": 0 + } +] diff --git a/backend/danswer/chat/input_prompts.yaml b/backend/danswer/seeding/input_prompts.yaml similarity index 100% rename from backend/danswer/chat/input_prompts.yaml rename to backend/danswer/seeding/input_prompts.yaml diff --git a/backend/danswer/seeding/load_docs.py b/backend/danswer/seeding/load_docs.py new file mode 100644 index 00000000000..1567f7f6bbb --- /dev/null +++ b/backend/danswer/seeding/load_docs.py @@ -0,0 +1,229 @@ +import datetime +import json +import os +from typing import cast + +from sqlalchemy.orm import Session + +from danswer.access.models import default_public_access +from danswer.configs.constants import DEFAULT_BOOST +from danswer.configs.constants import DocumentSource +from danswer.configs.constants import KV_DOCUMENTS_SEEDED_KEY +from danswer.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL +from danswer.connectors.models import Document +from danswer.connectors.models import IndexAttemptMetadata +from danswer.connectors.models import InputType +from danswer.connectors.models import Section +from danswer.db.connector import check_connectors_exist +from danswer.db.connector import create_connector +from danswer.db.connector_credential_pair import add_credential_to_connector +from danswer.db.credentials import PUBLIC_CREDENTIAL_ID +from danswer.db.document import check_docs_exist +from danswer.db.enums import AccessType +from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.db.index_attempt import mock_successful_index_attempt +from danswer.db.search_settings import get_current_search_settings +from danswer.document_index.factory import get_default_document_index +from danswer.indexing.indexing_pipeline import index_doc_batch_prepare +from danswer.indexing.models import ChunkEmbedding +from danswer.indexing.models import DocMetadataAwareIndexChunk +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.server.documents.models import ConnectorBase +from danswer.utils.logger import setup_logger +from danswer.utils.retry_wrapper import retry_builder +from danswer.utils.variable_functionality import fetch_versioned_implementation + +logger = setup_logger() + + +def _create_indexable_chunks( + preprocessed_docs: list[dict], + tenant_id: str | None, +) -> tuple[list[Document], list[DocMetadataAwareIndexChunk]]: + ids_to_documents = {} + chunks = [] + for preprocessed_doc in preprocessed_docs: + document = Document( + id=preprocessed_doc["url"], # For Web connector, the URL is the ID + # The section is not really used past this point since we have already done the other processing + # for the chunking and embedding. 
+ sections=[ + Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"]) + ], + source=DocumentSource.WEB, + semantic_identifier=preprocessed_doc["title"], + metadata={}, + doc_updated_at=None, + primary_owners=[], + secondary_owners=[], + ) + if preprocessed_doc["chunk_ind"] == 0: + ids_to_documents[document.id] = document + + chunk = DocMetadataAwareIndexChunk( + chunk_id=preprocessed_doc["chunk_ind"], + blurb=preprocessed_doc["content"] + .split(".", 1)[0] + .split("!", 1)[0] + .split("?", 1)[0], + content=preprocessed_doc["content"], + source_links={0: preprocessed_doc["url"]}, + section_continuation=False, + source_document=document, + title_prefix=preprocessed_doc["title"], + metadata_suffix_semantic="", + metadata_suffix_keyword="", + mini_chunk_texts=None, + large_chunk_reference_ids=[], + embeddings=ChunkEmbedding( + full_embedding=preprocessed_doc["content_embedding"], + mini_chunk_embeddings=[], + ), + title_embedding=preprocessed_doc["title_embedding"], + tenant_id=tenant_id, + access=default_public_access, + document_sets=set(), + boost=DEFAULT_BOOST, + ) + chunks.append(chunk) + + return list(ids_to_documents.values()), chunks + + +# Cohere is used in EE version +def load_processed_docs(cohere_enabled: bool) -> list[dict]: + initial_docs_path = os.path.join( + os.getcwd(), + "danswer", + "seeding", + "initial_docs.json", + ) + processed_docs = json.load(open(initial_docs_path)) + return processed_docs + + +def seed_initial_documents( + db_session: Session, tenant_id: str | None, cohere_enabled: bool = False +) -> None: + """ + Seed initial documents so users don't have an empty index to start + + Documents are only loaded if: + - This is the first setup (if the user deletes the docs, we don't load them again) + - The index is empty, there are no docs and no (non-default) connectors + - The user has not updated the embedding models + - If they do, then we have to actually index the website + - If the embedding model is already updated on server startup, they're not a new user + + Note that regardless of any search settings, the default documents are always loaded with + the predetermined chunk sizes and single pass embedding. 
+ + Steps are as follows: + - Check if this needs to run + - Create the connector representing this + - Create the cc-pair (attaching the public credential) and mocking values like the last success + - Indexing the documents into Postgres + - Indexing the documents into Vespa + - Create a fake index attempt with fake times + """ + logger.info("Seeding initial documents") + + kv_store = get_kv_store() + try: + kv_store.load(KV_DOCUMENTS_SEEDED_KEY) + logger.info("Documents already seeded, skipping") + return + except KvKeyNotFoundError: + pass + + if check_docs_exist(db_session): + logger.info("Documents already exist, skipping") + return + + if check_connectors_exist(db_session): + logger.info("Connectors already exist, skipping") + return + + search_settings = get_current_search_settings(db_session) + if search_settings.model_name != DEFAULT_DOCUMENT_ENCODER_MODEL and not ( + search_settings.model_name == "embed-english-v3.0" and cohere_enabled + ): + logger.info("Embedding model has been updated, skipping") + return + + document_index = get_default_document_index( + primary_index_name=search_settings.index_name, secondary_index_name=None + ) + + # Create a connector so the user can delete it if they want + # or reindex it with a new search model if they want + connector_data = ConnectorBase( + name="Sample Use Cases", + source=DocumentSource.WEB, + input_type=InputType.LOAD_STATE, + connector_specific_config={ + "base_url": "https://docs.danswer.dev/more/use_cases", + "web_connector_type": "recursive", + }, + refresh_freq=None, # Never refresh by default + prune_freq=None, + indexing_start=None, + ) + + connector = create_connector(db_session, connector_data) + connector_id = cast(int, connector.id) + + last_index_time = datetime.datetime.now(datetime.timezone.utc) + + result = add_credential_to_connector( + db_session=db_session, + user=None, + connector_id=connector_id, + credential_id=PUBLIC_CREDENTIAL_ID, + access_type=AccessType.PUBLIC, + cc_pair_name=connector_data.name, + groups=None, + initial_status=ConnectorCredentialPairStatus.PAUSED, + last_successful_index_time=last_index_time, + ) + cc_pair_id = cast(int, result.data) + processed_docs = fetch_versioned_implementation( + "danswer.seeding.load_docs", + "load_processed_docs", + )(cohere_enabled) + + docs, chunks = _create_indexable_chunks(processed_docs, tenant_id) + + index_doc_batch_prepare( + document_batch=docs, + index_attempt_metadata=IndexAttemptMetadata( + connector_id=connector_id, + credential_id=PUBLIC_CREDENTIAL_ID, + ), + db_session=db_session, + ignore_time_skip=True, # Doesn't actually matter here + ) + + # In this case since there are no other connectors running in the background + # and this is a fresh deployment, there is no need to grab any locks + logger.info( + "Indexing seeding documents into Vespa " + "(Vespa may take a few seconds to become ready after receiving the schema)" + ) + + # Retries here because the index may take a few seconds to become ready + # as we just sent over the Vespa schema and there is a slight delay + + index_with_retries = retry_builder()(document_index.index) + index_with_retries(chunks=chunks) + + # Mock a run for the UI even though it did not actually call out to anything + mock_successful_index_attempt( + connector_credential_pair_id=cc_pair_id, + search_settings_id=search_settings.id, + docs_indexed=len(docs), + db_session=db_session, + ) + + kv_store.store(KV_DOCUMENTS_SEEDED_KEY, True) diff --git a/backend/danswer/seeding/load_yamls.py 
b/backend/danswer/seeding/load_yamls.py new file mode 100644 index 00000000000..0046352679c --- /dev/null +++ b/backend/danswer/seeding/load_yamls.py @@ -0,0 +1,166 @@ +import yaml +from sqlalchemy.orm import Session + +from danswer.configs.chat_configs import INPUT_PROMPT_YAML +from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT +from danswer.configs.chat_configs import PERSONAS_YAML +from danswer.configs.chat_configs import PROMPTS_YAML +from danswer.context.search.enums import RecencyBiasSetting +from danswer.db.document_set import get_or_create_document_set_by_name +from danswer.db.input_prompt import insert_input_prompt_if_not_exists +from danswer.db.models import DocumentSet as DocumentSetDBModel +from danswer.db.models import Persona +from danswer.db.models import Prompt as PromptDBModel +from danswer.db.models import Tool as ToolDBModel +from danswer.db.persona import get_prompt_by_name +from danswer.db.persona import upsert_persona +from danswer.db.persona import upsert_prompt + + +def load_prompts_from_yaml( + db_session: Session, prompts_yaml: str = PROMPTS_YAML +) -> None: + with open(prompts_yaml, "r") as file: + data = yaml.safe_load(file) + + all_prompts = data.get("prompts", []) + for prompt in all_prompts: + upsert_prompt( + user=None, + prompt_id=prompt.get("id"), + name=prompt["name"], + description=prompt["description"].strip(), + system_prompt=prompt["system"].strip(), + task_prompt=prompt["task"].strip(), + include_citations=prompt["include_citations"], + datetime_aware=prompt.get("datetime_aware", True), + default_prompt=True, + personas=None, + db_session=db_session, + commit=True, + ) + + +def load_personas_from_yaml( + db_session: Session, + personas_yaml: str = PERSONAS_YAML, + default_chunks: float = MAX_CHUNKS_FED_TO_CHAT, +) -> None: + with open(personas_yaml, "r") as file: + data = yaml.safe_load(file) + + all_personas = data.get("personas", []) + for persona in all_personas: + doc_set_names = persona["document_sets"] + doc_sets: list[DocumentSetDBModel] = [ + get_or_create_document_set_by_name(db_session, name) + for name in doc_set_names + ] + + # Assume if user hasn't set any document sets for the persona, the user may want + # to later attach document sets to the persona manually, therefore, don't overwrite/reset + # the document sets for the persona + doc_set_ids: list[int] | None = None + if doc_sets: + doc_set_ids = [doc_set.id for doc_set in doc_sets] + else: + doc_set_ids = None + + prompt_ids: list[int] | None = None + prompt_set_names = persona["prompts"] + if prompt_set_names: + prompts: list[PromptDBModel | None] = [ + get_prompt_by_name(prompt_name, user=None, db_session=db_session) + for prompt_name in prompt_set_names + ] + if any([prompt is None for prompt in prompts]): + raise ValueError("Invalid Persona configs, not all prompts exist") + + if prompts: + prompt_ids = [prompt.id for prompt in prompts if prompt is not None] + + p_id = persona.get("id") + tool_ids = [] + if persona.get("image_generation"): + image_gen_tool = ( + db_session.query(ToolDBModel) + .filter(ToolDBModel.name == "ImageGenerationTool") + .first() + ) + if image_gen_tool: + tool_ids.append(image_gen_tool.id) + + llm_model_provider_override = persona.get("llm_model_provider_override") + llm_model_version_override = persona.get("llm_model_version_override") + + # Set specific overrides for image generation persona + if persona.get("image_generation"): + llm_model_version_override = "gpt-4o" + + existing_persona = ( + 
db_session.query(Persona).filter(Persona.name == persona["name"]).first() + ) + + upsert_persona( + user=None, + persona_id=(-1 * p_id) if p_id is not None else None, + name=persona["name"], + description=persona["description"], + num_chunks=persona.get("num_chunks") + if persona.get("num_chunks") is not None + else default_chunks, + llm_relevance_filter=persona.get("llm_relevance_filter"), + starter_messages=persona.get("starter_messages"), + llm_filter_extraction=persona.get("llm_filter_extraction"), + icon_shape=persona.get("icon_shape"), + icon_color=persona.get("icon_color"), + llm_model_provider_override=llm_model_provider_override, + llm_model_version_override=llm_model_version_override, + recency_bias=RecencyBiasSetting(persona["recency_bias"]), + prompt_ids=prompt_ids, + document_set_ids=doc_set_ids, + tool_ids=tool_ids, + builtin_persona=True, + is_public=True, + display_priority=existing_persona.display_priority + if existing_persona is not None + else persona.get("display_priority"), + is_visible=existing_persona.is_visible + if existing_persona is not None + else persona.get("is_visible"), + db_session=db_session, + ) + + +def load_input_prompts_from_yaml( + db_session: Session, input_prompts_yaml: str = INPUT_PROMPT_YAML +) -> None: + with open(input_prompts_yaml, "r") as file: + data = yaml.safe_load(file) + + all_input_prompts = data.get("input_prompts", []) + for input_prompt in all_input_prompts: + # If these prompts are deleted (which is a hard delete in the DB), on server startup + # they will be recreated, but the user can always just deactivate them, just a light inconvenience + + insert_input_prompt_if_not_exists( + user=None, + input_prompt_id=input_prompt.get("id"), + prompt=input_prompt["prompt"], + content=input_prompt["content"], + is_public=input_prompt["is_public"], + active=input_prompt.get("active", True), + db_session=db_session, + commit=True, + ) + + +def load_chat_yamls( + db_session: Session, + prompt_yaml: str = PROMPTS_YAML, + personas_yaml: str = PERSONAS_YAML, + input_prompts_yaml: str = INPUT_PROMPT_YAML, +) -> None: + load_prompts_from_yaml(db_session, prompt_yaml) + load_personas_from_yaml(db_session, personas_yaml) + load_input_prompts_from_yaml(db_session, input_prompts_yaml) diff --git a/backend/danswer/chat/personas.yaml b/backend/danswer/seeding/personas.yaml similarity index 62% rename from backend/danswer/chat/personas.yaml rename to backend/danswer/seeding/personas.yaml index d76b13fb10b..5e5d62d6d1d 100644 --- a/backend/danswer/chat/personas.yaml +++ b/backend/danswer/seeding/personas.yaml @@ -41,6 +41,15 @@ personas: icon_color: "#6FB1FF" display_priority: 0 is_visible: true + starter_messages: + - name: "Give me an overview of what's here" + message: "Sample some documents and tell me what you find." + - name: "Use AI to solve a work related problem" + message: "Ask me what problem I would like to solve, then search the knowledge base to help me find a solution." + - name: "Find updates on a topic of interest" + message: "Once I provide a topic, retrieve related documents and tell me when there was last activity on the topic if available." + - name: "Surface contradictions" + message: "Have me choose a subject. Once I have provided it, check against the knowledge base and point out any inconsistencies. For all your following responses, focus on identifying contradictions." 
- id: 1 name: "General GPT" @@ -57,6 +66,15 @@ personas: icon_color: "#FF6F6F" display_priority: 1 is_visible: true + starter_messages: + - name: "Summarize a document" + message: "If I have provided a document please summarize it for me. If not, please ask me to upload a document either by dragging it into the input bar or clicking the +file icon." + - name: "Help me with coding" + message: 'Write me a "Hello World" script in 5 random languages to show off the functionality.' + - name: "Draft a professional email" + message: "Help me craft a professional email. Let's establish the context and the anticipated outcomes of the email before proposing a draft." + - name: "Learn something new" + message: "What is the difference between a Gantt chart, a Burndown chart and a Kanban board?" - id: 2 name: "GPT Internet Search" @@ -91,3 +109,12 @@ personas: image_generation: true display_priority: 3 is_visible: false + starter_messages: + - name: "Create visuals for a presentation" + message: "Generate someone presenting a graph which clearly demonstrates an upwards trajectory." + - name: "Find inspiration for a marketing campaign" + message: "Generate an image of two happy individuals sipping on a soda drink in a glass bottle." + - name: "Visualize a product design" + message: "I want to add a search bar to my Iphone app. Generate me generic examples of how other apps implement this." + - name: "Generate a humorous image response" + message: "My teammate just made a silly mistake and I want to respond with a facepalm. Can you generate me one?" diff --git a/backend/danswer/chat/prompts.yaml b/backend/danswer/seeding/prompts.yaml similarity index 87% rename from backend/danswer/chat/prompts.yaml rename to backend/danswer/seeding/prompts.yaml index 899f6cfd7b6..200577cfa01 100644 --- a/backend/danswer/chat/prompts.yaml +++ b/backend/danswer/seeding/prompts.yaml @@ -37,19 +37,19 @@ prompts: include_citations: true - name: "ImageGeneration" - description: "Generates images based on user prompts!" + description: "Generates images from user descriptions!" system: > - You are an advanced image generation system capable of creating diverse and detailed images. + You are an AI image generation assistant. Your role is to create high-quality images based on user descriptions. - You can interpret user prompts and generate high-quality, creative images that match their descriptions. + For appropriate requests, you will generate an image that matches the user's requirements. + For inappropriate or unsafe requests, you will politely decline and explain why the request cannot be fulfilled. - You always strive to create safe and appropriate content, avoiding any harmful or offensive imagery. + You aim to be helpful while maintaining appropriate content standards. task: > - Generate an image based on the user's description. + Based on the user's description, create a high-quality image that accurately reflects their request. + Pay close attention to the specified details, styles, and desired elements. - Provide a detailed description of the generated image, including key elements, colors, and composition. - - If the request is not possible or appropriate, explain why and suggest alternatives. + If the request is not appropriate or cannot be fulfilled, explain why and suggest alternatives. datetime_aware: true include_citations: false @@ -62,6 +62,9 @@ prompts: You give concise responses to very simple questions, but provide more thorough responses to more complex and open-ended questions. 
+ Below is my employee information in Mindvalley: + + DANSWER_EMPLOYEE_REPLACEMENT You are happy to help with writing, analysis, question answering, math, coding and all sorts of other tasks. You use markdown where reasonable and also for coding. diff --git a/backend/ee/danswer/server/api_key/api.py b/backend/danswer/server/api_key/api.py similarity index 81% rename from backend/ee/danswer/server/api_key/api.py rename to backend/danswer/server/api_key/api.py index c7353f055fb..cc21af616c7 100644 --- a/backend/ee/danswer/server/api_key/api.py +++ b/backend/danswer/server/api_key/api.py @@ -3,15 +3,15 @@ from sqlalchemy.orm import Session from danswer.auth.users import current_admin_user +from danswer.db.api_key import ApiKeyDescriptor +from danswer.db.api_key import fetch_api_keys +from danswer.db.api_key import insert_api_key +from danswer.db.api_key import regenerate_api_key +from danswer.db.api_key import remove_api_key +from danswer.db.api_key import update_api_key from danswer.db.engine import get_session from danswer.db.models import User -from ee.danswer.db.api_key import ApiKeyDescriptor -from ee.danswer.db.api_key import fetch_api_keys -from ee.danswer.db.api_key import insert_api_key -from ee.danswer.db.api_key import regenerate_api_key -from ee.danswer.db.api_key import remove_api_key -from ee.danswer.db.api_key import update_api_key -from ee.danswer.server.api_key.models import APIKeyArgs +from danswer.server.api_key.models import APIKeyArgs router = APIRouter(prefix="/admin/api-key") diff --git a/backend/ee/danswer/server/api_key/models.py b/backend/danswer/server/api_key/models.py similarity index 100% rename from backend/ee/danswer/server/api_key/models.py rename to backend/danswer/server/api_key/models.py diff --git a/backend/danswer/server/auth_check.py b/backend/danswer/server/auth_check.py index 8a35a560a24..bf55d80d90e 100644 --- a/backend/danswer/server/auth_check.py +++ b/backend/danswer/server/auth_check.py @@ -6,10 +6,12 @@ from danswer.auth.users import current_admin_user from danswer.auth.users import current_curator_or_admin_user +from danswer.auth.users import current_limited_user from danswer.auth.users import current_user from danswer.auth.users import current_user_with_expired_token from danswer.configs.app_configs import APP_API_PREFIX from danswer.server.danswer_api.ingestion import api_key_dep +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop PUBLIC_ENDPOINT_SPECS = [ @@ -79,6 +81,14 @@ def check_router_auth( (1) have auth enabled OR (2) are explicitly marked as a public endpoint """ + + control_plane_dep = fetch_ee_implementation_or_noop( + "danswer.server.tenants.access", "control_plane_dep" + ) + current_cloud_superuser = fetch_ee_implementation_or_noop( + "danswer.auth.users", "current_cloud_superuser" + ) + for route in application.routes: # explicitly marked as public if is_route_in_spec_list(route, public_endpoint_specs): @@ -93,11 +103,14 @@ def check_router_auth( for dependency in route_dependant_obj.dependencies: depends_fn = dependency.cache_key[0] if ( - depends_fn == current_user + depends_fn == current_limited_user + or depends_fn == current_user or depends_fn == current_admin_user or depends_fn == current_curator_or_admin_user or depends_fn == api_key_dep or depends_fn == current_user_with_expired_token + or depends_fn == control_plane_dep + or depends_fn == current_cloud_superuser ): found_auth = True break @@ -107,5 +120,5 @@ def check_router_auth( # print(f"(\"{route.path}\", {set(route.methods)}),") raise 
RuntimeError( - f"Did not find current_user or current_admin_user dependency in route - {route}" + f"Did not find user dependency in private route - {route}" ) diff --git a/backend/danswer/server/danswer_api/ingestion.py b/backend/danswer/server/danswer_api/ingestion.py index cea3ec86575..c65c870d461 100644 --- a/backend/danswer/server/danswer_api/ingestion.py +++ b/backend/danswer/server/danswer_api/ingestion.py @@ -3,12 +3,14 @@ from fastapi import HTTPException from sqlalchemy.orm import Session +from danswer.auth.users import api_key_dep from danswer.configs.constants import DocumentSource from danswer.connectors.models import Document from danswer.connectors.models import IndexAttemptMetadata from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id from danswer.db.document import get_documents_by_cc_pair from danswer.db.document import get_ingestion_documents +from danswer.db.engine import get_current_tenant_id from danswer.db.engine import get_session from danswer.db.models import User from danswer.db.search_settings import get_current_search_settings @@ -21,7 +23,6 @@ from danswer.server.danswer_api.models import IngestionDocument from danswer.server.danswer_api.models import IngestionResult from danswer.utils.logger import setup_logger -from ee.danswer.auth.users import api_key_dep logger = setup_logger() @@ -67,6 +68,7 @@ def upsert_ingestion_doc( doc_info: IngestionDocument, _: User | None = Depends(api_key_dep), db_session: Session = Depends(get_session), + tenant_id: str = Depends(get_current_tenant_id), ) -> IngestionResult: doc_info.document.from_ingestion_api = True @@ -101,6 +103,7 @@ def upsert_ingestion_doc( document_index=curr_doc_index, ignore_time_skip=True, db_session=db_session, + tenant_id=tenant_id, ) new_doc, __chunk_count = indexing_pipeline( @@ -134,6 +137,7 @@ def upsert_ingestion_doc( document_index=sec_doc_index, ignore_time_skip=True, db_session=db_session, + tenant_id=tenant_id, ) sec_ind_pipeline( diff --git a/backend/danswer/server/documents/cc_pair.py b/backend/danswer/server/documents/cc_pair.py index 876886ca28d..55808ebcee7 100644 --- a/backend/danswer/server/documents/cc_pair.py +++ b/backend/danswer/server/documents/cc_pair.py @@ -1,4 +1,6 @@ import math +from datetime import datetime +from http import HTTPStatus from fastapi import APIRouter from fastapi import Depends @@ -10,6 +12,13 @@ from danswer.auth.users import current_curator_or_admin_user from danswer.auth.users import current_user from danswer.background.celery.celery_utils import get_deletion_attempt_snapshot +from danswer.background.celery.tasks.doc_permission_syncing.tasks import ( + try_creating_permissions_sync_task, +) +from danswer.background.celery.tasks.pruning.tasks import ( + try_creating_prune_generator_task, +) +from danswer.background.celery.versioned_apps.primary import app as primary_app from danswer.db.connector_credential_pair import add_credential_to_connector from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id from danswer.db.connector_credential_pair import remove_credential_from_connector @@ -17,7 +26,11 @@ update_connector_credential_pair_from_id, ) from danswer.db.document import get_document_counts_for_cc_pairs +from danswer.db.document import get_documents_for_cc_pair +from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR +from danswer.db.engine import get_current_tenant_id from danswer.db.engine import get_session +from danswer.db.enums import AccessType from danswer.db.enums import 
ConnectorCredentialPairStatus from danswer.db.index_attempt import cancel_indexing_attempts_for_ccpair from danswer.db.index_attempt import cancel_indexing_attempts_past_model @@ -25,17 +38,20 @@ from danswer.db.index_attempt import get_latest_index_attempt_for_cc_pair_id from danswer.db.index_attempt import get_paginated_index_attempts_for_cc_pair_id from danswer.db.models import User +from danswer.db.search_settings import get_current_search_settings +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_pool import get_redis_client from danswer.server.documents.models import CCPairFullInfo from danswer.server.documents.models import CCStatusUpdateRequest from danswer.server.documents.models import ConnectorCredentialPairIdentifier from danswer.server.documents.models import ConnectorCredentialPairMetadata +from danswer.server.documents.models import DocumentSyncStatus from danswer.server.documents.models import PaginatedIndexAttempts from danswer.server.models import StatusResponse from danswer.utils.logger import setup_logger -from ee.danswer.db.user_group import validate_user_creation_permissions +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop logger = setup_logger() - router = APIRouter(prefix="/manage") @@ -76,6 +92,7 @@ def get_cc_pair_full_info( cc_pair_id: int, user: User | None = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), + tenant_id: str | None = Depends(get_current_tenant_id), ) -> CCPairFullInfo: cc_pair = get_connector_credential_pair_from_id( cc_pair_id, db_session, user, get_editable=False @@ -106,11 +123,16 @@ def get_cc_pair_full_info( latest_attempt = get_latest_index_attempt_for_cc_pair_id( db_session=db_session, - connector_credential_pair_id=cc_pair.id, + connector_credential_pair_id=cc_pair_id, secondary_index=False, only_finished=False, ) + search_settings = get_current_search_settings(db_session) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + redis_connector_index = redis_connector.new_index(search_settings.id) + return CCPairFullInfo.from_models( cc_pair_model=cc_pair, number_of_index_attempts=count_index_attempts_for_connector( @@ -122,9 +144,11 @@ def get_cc_pair_full_info( connector_id=cc_pair.connector_id, credential_id=cc_pair.credential_id, db_session=db_session, + tenant_id=tenant_id, ), num_docs_indexed=documents_indexed, is_editable_for_current_user=is_editable_for_current_user, + indexing=redis_connector_index.fenced, ) @@ -141,6 +165,7 @@ def update_cc_pair_status( user=user, get_editable=True, ) + if not cc_pair: raise HTTPException( status_code=400, @@ -150,7 +175,6 @@ def update_cc_pair_status( if status_update_request.status == ConnectorCredentialPairStatus.PAUSED: cancel_indexing_attempts_for_ccpair(cc_pair_id, db_session) - # Just for good measure cancel_indexing_attempts_past_model(db_session) update_connector_credential_pair_from_id( @@ -159,6 +183,8 @@ def update_cc_pair_status( status=status_update_request.status, ) + db_session.commit() + @router.put("/admin/cc-pair/{cc_pair_id}/name") def update_cc_pair_name( @@ -189,6 +215,163 @@ def update_cc_pair_name( raise HTTPException(status_code=400, detail="Name must be unique") +@router.get("/admin/cc-pair/{cc_pair_id}/last_pruned") +def get_cc_pair_last_pruned( + cc_pair_id: int, + user: User = Depends(current_curator_or_admin_user), + db_session: Session = Depends(get_session), +) -> datetime | None: + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, + 
db_session=db_session, + user=user, + get_editable=False, + ) + if not cc_pair: + raise HTTPException( + status_code=400, + detail="cc_pair not found for current user's permissions", + ) + + return cc_pair.last_pruned + + +@router.post("/admin/cc-pair/{cc_pair_id}/prune") +def prune_cc_pair( + cc_pair_id: int, + user: User = Depends(current_curator_or_admin_user), + db_session: Session = Depends(get_session), + tenant_id: str | None = Depends(get_current_tenant_id), +) -> StatusResponse[list[int]]: + """Triggers pruning on a particular cc_pair immediately""" + + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, + db_session=db_session, + user=user, + get_editable=False, + ) + if not cc_pair: + raise HTTPException( + status_code=400, + detail="Connection not found for current user's permissions", + ) + + r = get_redis_client(tenant_id=tenant_id) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + if redis_connector.prune.fenced: + raise HTTPException( + status_code=HTTPStatus.CONFLICT, + detail="Pruning task already in progress.", + ) + + logger.info( + f"Pruning cc_pair: cc_pair_id={cc_pair_id} " + f"connector_id={cc_pair.connector_id} " + f"credential_id={cc_pair.credential_id} " + f"{cc_pair.connector.name} connector." + ) + tasks_created = try_creating_prune_generator_task( + primary_app, cc_pair, db_session, r, CURRENT_TENANT_ID_CONTEXTVAR.get() + ) + if not tasks_created: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, + detail="Pruning task creation failed.", + ) + + return StatusResponse( + success=True, + message="Successfully created the pruning task.", + ) + + +@router.get("/admin/cc-pair/{cc_pair_id}/sync-permissions") +def get_cc_pair_latest_sync( + cc_pair_id: int, + user: User = Depends(current_curator_or_admin_user), + db_session: Session = Depends(get_session), +) -> datetime | None: + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, + db_session=db_session, + user=user, + get_editable=False, + ) + if not cc_pair: + raise HTTPException( + status_code=400, + detail="cc_pair not found for current user's permissions", + ) + + return cc_pair.last_time_perm_sync + + +@router.post("/admin/cc-pair/{cc_pair_id}/sync-permissions") +def sync_cc_pair( + cc_pair_id: int, + user: User = Depends(current_curator_or_admin_user), + db_session: Session = Depends(get_session), + tenant_id: str | None = Depends(get_current_tenant_id), +) -> StatusResponse[list[int]]: + """Triggers permissions sync on a particular cc_pair immediately""" + + cc_pair = get_connector_credential_pair_from_id( + cc_pair_id=cc_pair_id, + db_session=db_session, + user=user, + get_editable=False, + ) + if not cc_pair: + raise HTTPException( + status_code=400, + detail="Connection not found for current user's permissions", + ) + + r = get_redis_client(tenant_id=tenant_id) + + redis_connector = RedisConnector(tenant_id, cc_pair_id) + if redis_connector.permissions.fenced: + raise HTTPException( + status_code=HTTPStatus.CONFLICT, + detail="Doc permissions sync task already in progress.", + ) + + logger.info( + f"Doc permissions sync cc_pair={cc_pair_id} " + f"connector_id={cc_pair.connector_id} " + f"credential_id={cc_pair.credential_id} " + f"{cc_pair.connector.name} connector." 
+ ) + tasks_created = try_creating_permissions_sync_task( + primary_app, cc_pair_id, r, CURRENT_TENANT_ID_CONTEXTVAR.get() + ) + if not tasks_created: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR, + detail="Doc permissions sync task creation failed.", + ) + + return StatusResponse( + success=True, + message="Successfully created the doc permissions sync task.", + ) + + +@router.get("/admin/cc-pair/{cc_pair_id}/get-docs-sync-status") +def get_docs_sync_status( + cc_pair_id: int, + _: User = Depends(current_curator_or_admin_user), + db_session: Session = Depends(get_session), +) -> list[DocumentSyncStatus]: + all_docs_for_cc_pair = get_documents_for_cc_pair( + db_session=db_session, + cc_pair_id=cc_pair_id, + ) + return [DocumentSyncStatus.from_model(doc) for doc in all_docs_for_cc_pair] + + @router.put("/connector/{connector_id}/credential/{credential_id}") def associate_credential_to_connector( connector_id: int, @@ -197,11 +380,14 @@ def associate_credential_to_connector( user: User | None = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), ) -> StatusResponse[int]: - validate_user_creation_permissions( + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=metadata.groups, - object_is_public=metadata.is_public, + object_is_public=metadata.access_type == AccessType.PUBLIC, + object_is_perm_sync=metadata.access_type == AccessType.SYNC, ) try: @@ -211,7 +397,8 @@ def associate_credential_to_connector( connector_id=connector_id, credential_id=credential_id, cc_pair_name=metadata.name, - is_public=True if metadata.is_public is None else metadata.is_public, + access_type=metadata.access_type, + auto_sync_options=metadata.auto_sync_options, groups=metadata.groups, ) diff --git a/backend/danswer/server/documents/connector.py b/backend/danswer/server/documents/connector.py index 73e28b8fb0b..9b9da834e05 100644 --- a/backend/danswer/server/documents/connector.py +++ b/backend/danswer/server/documents/connector.py @@ -9,6 +9,7 @@ from fastapi import Request from fastapi import Response from fastapi import UploadFile +from google.oauth2.credentials import Credentials # type: ignore from pydantic import BaseModel from sqlalchemy.orm import Session @@ -16,37 +17,43 @@ from danswer.auth.users import current_curator_or_admin_user from danswer.auth.users import current_user from danswer.background.celery.celery_utils import get_deletion_attempt_snapshot +from danswer.background.celery.tasks.indexing.tasks import try_creating_indexing_task +from danswer.background.celery.versioned_apps.primary import app as primary_app from danswer.configs.app_configs import ENABLED_CONNECTOR_TYPES from danswer.configs.constants import DocumentSource from danswer.configs.constants import FileOrigin -from danswer.connectors.gmail.connector_auth import delete_gmail_service_account_key -from danswer.connectors.gmail.connector_auth import delete_google_app_gmail_cred -from danswer.connectors.gmail.connector_auth import get_gmail_auth_url -from danswer.connectors.gmail.connector_auth import get_gmail_service_account_key -from danswer.connectors.gmail.connector_auth import get_google_app_gmail_cred -from danswer.connectors.gmail.connector_auth import ( - update_gmail_credential_access_tokens, +from danswer.connectors.google_utils.google_auth import ( + get_google_oauth_creds, ) -from danswer.connectors.gmail.connector_auth import ( - 
upsert_gmail_service_account_key, +from danswer.connectors.google_utils.google_kv import ( + build_service_account_creds, ) -from danswer.connectors.gmail.connector_auth import upsert_google_app_gmail_cred -from danswer.connectors.google_drive.connector_auth import build_service_account_creds -from danswer.connectors.google_drive.connector_auth import delete_google_app_cred -from danswer.connectors.google_drive.connector_auth import delete_service_account_key -from danswer.connectors.google_drive.connector_auth import get_auth_url -from danswer.connectors.google_drive.connector_auth import get_google_app_cred -from danswer.connectors.google_drive.connector_auth import ( - get_google_drive_creds_for_authorized_user, +from danswer.connectors.google_utils.google_kv import ( + delete_google_app_cred, ) -from danswer.connectors.google_drive.connector_auth import get_service_account_key -from danswer.connectors.google_drive.connector_auth import ( +from danswer.connectors.google_utils.google_kv import ( + delete_service_account_key, +) +from danswer.connectors.google_utils.google_kv import get_auth_url +from danswer.connectors.google_utils.google_kv import ( + get_google_app_cred, +) +from danswer.connectors.google_utils.google_kv import ( + get_service_account_key, +) +from danswer.connectors.google_utils.google_kv import ( update_credential_access_tokens, ) -from danswer.connectors.google_drive.connector_auth import upsert_google_app_cred -from danswer.connectors.google_drive.connector_auth import upsert_service_account_key -from danswer.connectors.google_drive.connector_auth import verify_csrf -from danswer.connectors.google_drive.constants import DB_CREDENTIALS_DICT_TOKEN_KEY +from danswer.connectors.google_utils.google_kv import ( + upsert_google_app_cred, +) +from danswer.connectors.google_utils.google_kv import ( + upsert_service_account_key, +) +from danswer.connectors.google_utils.google_kv import verify_csrf +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_TOKEN_KEY, +) from danswer.db.connector import create_connector from danswer.db.connector import delete_connector from danswer.db.connector import fetch_connector_by_id @@ -57,24 +64,29 @@ from danswer.db.connector_credential_pair import get_cc_pair_groups_for_ids from danswer.db.connector_credential_pair import get_connector_credential_pair from danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.credentials import cleanup_gmail_credentials +from danswer.db.credentials import cleanup_google_drive_credentials from danswer.db.credentials import create_credential -from danswer.db.credentials import delete_gmail_service_account_credentials -from danswer.db.credentials import delete_google_drive_service_account_credentials +from danswer.db.credentials import delete_service_account_credentials from danswer.db.credentials import fetch_credential_by_id from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed from danswer.db.document import get_document_counts_for_cc_pairs +from danswer.db.engine import get_current_tenant_id from danswer.db.engine import get_session -from danswer.db.index_attempt import create_index_attempt +from danswer.db.enums import AccessType from danswer.db.index_attempt import get_index_attempts_for_cc_pair from danswer.db.index_attempt import get_latest_index_attempt_for_cc_pair_id from danswer.db.index_attempt import get_latest_index_attempts from danswer.db.index_attempt import get_latest_index_attempts_by_status from 
danswer.db.models import IndexingStatus +from danswer.db.models import SearchSettings from danswer.db.models import User -from danswer.db.models import UserRole from danswer.db.search_settings import get_current_search_settings -from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.db.search_settings import get_secondary_search_settings from danswer.file_store.file_store import get_default_file_store +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.redis.redis_connector import RedisConnector +from danswer.redis.redis_pool import get_redis_client from danswer.server.documents.models import AuthStatus from danswer.server.documents.models import AuthUrl from danswer.server.documents.models import ConnectorCredentialPairIdentifier @@ -95,7 +107,7 @@ from danswer.server.documents.models import RunConnectorRequest from danswer.server.models import StatusResponse from danswer.utils.logger import setup_logger -from ee.danswer.db.user_group import validate_user_creation_permissions +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop logger = setup_logger() @@ -114,8 +126,8 @@ def check_google_app_gmail_credentials_exist( _: User = Depends(current_curator_or_admin_user), ) -> dict[str, str]: try: - return {"client_id": get_google_app_gmail_cred().web.client_id} - except ConfigNotFoundError: + return {"client_id": get_google_app_cred(DocumentSource.GMAIL).web.client_id} + except KvKeyNotFoundError: raise HTTPException(status_code=404, detail="Google App Credentials not found") @@ -124,7 +136,7 @@ def upsert_google_app_gmail_credentials( app_credentials: GoogleAppCredentials, _: User = Depends(current_admin_user) ) -> StatusResponse: try: - upsert_google_app_gmail_cred(app_credentials) + upsert_google_app_cred(app_credentials, DocumentSource.GMAIL) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -136,10 +148,12 @@ def upsert_google_app_gmail_credentials( @router.delete("/admin/connector/gmail/app-credential") def delete_google_app_gmail_credentials( _: User = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> StatusResponse: try: - delete_google_app_gmail_cred() - except ConfigNotFoundError as e: + delete_google_app_cred(DocumentSource.GMAIL) + cleanup_gmail_credentials(db_session=db_session) + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) return StatusResponse( @@ -152,8 +166,10 @@ def check_google_app_credentials_exist( _: User = Depends(current_curator_or_admin_user), ) -> dict[str, str]: try: - return {"client_id": get_google_app_cred().web.client_id} - except ConfigNotFoundError: + return { + "client_id": get_google_app_cred(DocumentSource.GOOGLE_DRIVE).web.client_id + } + except KvKeyNotFoundError: raise HTTPException(status_code=404, detail="Google App Credentials not found") @@ -162,7 +178,7 @@ def upsert_google_app_credentials( app_credentials: GoogleAppCredentials, _: User = Depends(current_admin_user) ) -> StatusResponse: try: - upsert_google_app_cred(app_credentials) + upsert_google_app_cred(app_credentials, DocumentSource.GOOGLE_DRIVE) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -174,10 +190,12 @@ def upsert_google_app_credentials( @router.delete("/admin/connector/google-drive/app-credential") def delete_google_app_credentials( _: User = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> StatusResponse: try: - delete_google_app_cred() - except 
ConfigNotFoundError as e: + delete_google_app_cred(DocumentSource.GOOGLE_DRIVE) + cleanup_google_drive_credentials(db_session=db_session) + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) return StatusResponse( @@ -190,8 +208,12 @@ def check_google_service_gmail_account_key_exist( _: User = Depends(current_curator_or_admin_user), ) -> dict[str, str]: try: - return {"service_account_email": get_gmail_service_account_key().client_email} - except ConfigNotFoundError: + return { + "service_account_email": get_service_account_key( + DocumentSource.GMAIL + ).client_email + } + except KvKeyNotFoundError: raise HTTPException( status_code=404, detail="Google Service Account Key not found" ) @@ -202,7 +224,7 @@ def upsert_google_service_gmail_account_key( service_account_key: GoogleServiceAccountKey, _: User = Depends(current_admin_user) ) -> StatusResponse: try: - upsert_gmail_service_account_key(service_account_key) + upsert_service_account_key(service_account_key, DocumentSource.GMAIL) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -214,10 +236,12 @@ def upsert_google_service_gmail_account_key( @router.delete("/admin/connector/gmail/service-account-key") def delete_google_service_gmail_account_key( _: User = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> StatusResponse: try: - delete_gmail_service_account_key() - except ConfigNotFoundError as e: + delete_service_account_key(DocumentSource.GMAIL) + cleanup_gmail_credentials(db_session=db_session) + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) return StatusResponse( @@ -230,8 +254,12 @@ def check_google_service_account_key_exist( _: User = Depends(current_curator_or_admin_user), ) -> dict[str, str]: try: - return {"service_account_email": get_service_account_key().client_email} - except ConfigNotFoundError: + return { + "service_account_email": get_service_account_key( + DocumentSource.GOOGLE_DRIVE + ).client_email + } + except KvKeyNotFoundError: raise HTTPException( status_code=404, detail="Google Service Account Key not found" ) @@ -242,7 +270,7 @@ def upsert_google_service_account_key( service_account_key: GoogleServiceAccountKey, _: User = Depends(current_admin_user) ) -> StatusResponse: try: - upsert_service_account_key(service_account_key) + upsert_service_account_key(service_account_key, DocumentSource.GOOGLE_DRIVE) except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) @@ -254,10 +282,12 @@ def upsert_google_service_account_key( @router.delete("/admin/connector/google-drive/service-account-key") def delete_google_service_account_key( _: User = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> StatusResponse: try: - delete_service_account_key() - except ConfigNotFoundError as e: + delete_service_account_key(DocumentSource.GOOGLE_DRIVE) + cleanup_google_drive_credentials(db_session=db_session) + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) return StatusResponse( @@ -277,13 +307,13 @@ def upsert_service_account_credential( try: credential_base = build_service_account_creds( DocumentSource.GOOGLE_DRIVE, - delegated_user_email=service_account_credential_request.google_drive_delegated_user, + primary_admin_email=service_account_credential_request.google_primary_admin, ) - except ConfigNotFoundError as e: + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) # first delete all existing 
service account credentials - delete_google_drive_service_account_credentials(user, db_session) + delete_service_account_credentials(user, db_session, DocumentSource.GOOGLE_DRIVE) # `user=None` since this credential is not a personal credential credential = create_credential( credential_data=credential_base, user=user, db_session=db_session @@ -303,13 +333,13 @@ def upsert_gmail_service_account_credential( try: credential_base = build_service_account_creds( DocumentSource.GMAIL, - delegated_user_email=service_account_credential_request.gmail_delegated_user, + primary_admin_email=service_account_credential_request.google_primary_admin, ) - except ConfigNotFoundError as e: + except KvKeyNotFoundError as e: raise HTTPException(status_code=400, detail=str(e)) # first delete all existing service account credentials - delete_gmail_service_account_credentials(user, db_session) + delete_service_account_credentials(user, db_session, DocumentSource.GMAIL) # `user=None` since this credential is not a personal credential credential = create_credential( credential_data=credential_base, user=user, db_session=db_session @@ -330,28 +360,15 @@ def check_drive_tokens( ): return AuthStatus(authenticated=False) token_json_str = str(db_credentials.credential_json[DB_CREDENTIALS_DICT_TOKEN_KEY]) - google_drive_creds = get_google_drive_creds_for_authorized_user( - token_json_str=token_json_str + google_drive_creds = get_google_oauth_creds( + token_json_str=token_json_str, + source=DocumentSource.GOOGLE_DRIVE, ) if google_drive_creds is None: return AuthStatus(authenticated=False) return AuthStatus(authenticated=True) -@router.get("/admin/connector/google-drive/authorize/{credential_id}") -def admin_google_drive_auth( - response: Response, credential_id: str, _: User = Depends(current_admin_user) -) -> AuthUrl: - # set a cookie that we can read in the callback (used for `verify_csrf`) - response.set_cookie( - key=_GOOGLE_DRIVE_CREDENTIAL_ID_COOKIE_NAME, - value=credential_id, - httponly=True, - max_age=600, - ) - return AuthUrl(auth_url=get_auth_url(credential_id=int(credential_id))) - - @router.post("/admin/connector/file/upload") def upload_files( files: list[UploadFile], @@ -476,13 +493,14 @@ def get_connector_indexing_status( get_editable: bool = Query( False, description="If true, return editable document sets" ), + tenant_id: str | None = Depends(get_current_tenant_id), ) -> list[ConnectorIndexingStatus]: indexing_statuses: list[ConnectorIndexingStatus] = [] # NOTE: If the connector is deleting behind the scenes, # accessing cc_pairs can be inconsistent and members like # connector or credential may be None. - # Additional checks are done to make sure the connector and credential still exists. + # Additional checks are done to make sure the connector and credential still exist. # TODO: make this one query ... 
possibly eager load or wrap in a read transaction # to avoid the complexity of trying to error check throughout the function cc_pairs = get_connector_credential_pairs( @@ -530,6 +548,12 @@ def get_connector_indexing_status( relationship.user_group_id ) + search_settings: SearchSettings | None = None + if not secondary_index: + search_settings = get_current_search_settings(db_session) + else: + search_settings = get_secondary_search_settings(db_session) + for cc_pair in cc_pairs: # TODO remove this to enable ingestion API if cc_pair.name == "DefaultCCPair": @@ -541,6 +565,13 @@ def get_connector_indexing_status( # This may happen if background deletion is happening continue + in_progress = False + if search_settings: + redis_connector = RedisConnector(tenant_id, cc_pair.id) + redis_connector_index = redis_connector.new_index(search_settings.id) + if redis_connector_index.fenced: + in_progress = True + latest_index_attempt = cc_pair_to_latest_index_attempt.get( (connector.id, credential.id) ) @@ -559,7 +590,7 @@ def get_connector_indexing_status( cc_pair_status=cc_pair.status, connector=ConnectorSnapshot.from_connector_db_model(connector), credential=CredentialSnapshot.from_credential_db_model(credential), - public_doc=cc_pair.is_public, + access_type=cc_pair.access_type, owner=credential.user.email if credential.user else "", groups=group_cc_pair_relationships_dict.get(cc_pair.id, []), last_finished_status=( @@ -586,6 +617,7 @@ def get_connector_indexing_status( connector_id=connector.id, credential_id=credential.id, db_session=db_session, + tenant_id=tenant_id, ), is_deletable=check_deletion_attempt_is_allowed( connector_credential_pair=cc_pair, @@ -594,6 +626,7 @@ def get_connector_indexing_status( allow_scheduled=True, ) is None, + in_progress=in_progress, ) ) @@ -624,11 +657,15 @@ def create_connector_from_model( ) -> ObjectCreationIdResponse: try: _validate_connector_allowed(connector_data.source) - validate_user_creation_permissions( + + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=connector_data.groups, - object_is_public=connector_data.is_public, + object_is_public=connector_data.access_type == AccessType.PUBLIC, + object_is_perm_sync=connector_data.access_type == AccessType.SYNC, ) connector_base = connector_data.to_connector_base() return create_connector( @@ -646,34 +683,39 @@ def create_connector_with_mock_credential( user: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), ) -> StatusResponse: - if user and user.role != UserRole.ADMIN: - if connector_data.is_public: - raise HTTPException( - status_code=401, - detail="User does not have permission to create public credentials", - ) - if not connector_data.groups: - raise HTTPException( - status_code=401, - detail="Curators must specify 1+ groups", - ) + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( + db_session=db_session, + user=user, + target_group_ids=connector_data.groups, + object_is_public=connector_data.access_type == AccessType.PUBLIC, + object_is_perm_sync=connector_data.access_type == AccessType.SYNC, + ) try: _validate_connector_allowed(connector_data.source) connector_response = create_connector( - db_session=db_session, connector_data=connector_data + db_session=db_session, + connector_data=connector_data, ) + mock_credential = CredentialBase( - credential_json={}, admin_public=True, 
source=connector_data.source + credential_json={}, + admin_public=True, + source=connector_data.source, + ) credential = create_credential( - mock_credential, user=user, db_session=db_session + credential_data=mock_credential, + user=user, + db_session=db_session, + ) + response = add_credential_to_connector( db_session=db_session, user=user, connector_id=cast(int, connector_response.id), # will always be an int credential_id=credential.id, - is_public=connector_data.is_public or False, + access_type=connector_data.access_type, cc_pair_name=connector_data.name, groups=connector_data.groups, ) @@ -692,11 +734,14 @@ def update_connector_from_model( ) -> ConnectorSnapshot | StatusResponse[int]: try: _validate_connector_allowed(connector_data.source) - validate_user_creation_permissions( + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=connector_data.groups, - object_is_public=connector_data.is_public, + object_is_public=connector_data.access_type == AccessType.PUBLIC, + object_is_perm_sync=connector_data.access_type == AccessType.SYNC, ) connector_base = connector_data.to_connector_base() except ValueError as e: @@ -746,7 +791,13 @@ def connector_run_once( run_info: RunConnectorRequest, _: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), + tenant_id: str = Depends(get_current_tenant_id), ) -> StatusResponse[list[int]]: + """Used to trigger indexing on a set of cc_pairs associated with a + single connector.""" + + r = get_redis_client(tenant_id=tenant_id) + connector_id = run_info.connector_id specified_credential_ids = run_info.credential_ids @@ -777,6 +828,7 @@ def connector_run_once( detail="Connector has no valid credentials, cannot create index attempts.", ) + # Prevents index attempts for cc pairs that already have an index attempt currently running skipped_credentials = [ credential_id for credential_id in credential_ids @@ -786,39 +838,58 @@ def connector_run_once( credential_id=credential_id, ), only_current=True, - disinclude_finished=True, db_session=db_session, + disinclude_finished=True, ) ] search_settings = get_current_search_settings(db_session) connector_credential_pairs = [ - get_connector_credential_pair(run_info.connector_id, credential_id, db_session) + get_connector_credential_pair(connector_id, credential_id, db_session) for credential_id in credential_ids if credential_id not in skipped_credentials ] - index_attempt_ids = [ - create_index_attempt( - connector_credential_pair_id=connector_credential_pair.id, - search_settings_id=search_settings.id, - from_beginning=run_info.from_beginning, - db_session=db_session, - ) - for connector_credential_pair in connector_credential_pairs - if connector_credential_pair is not None - ] + index_attempt_ids = [] + for cc_pair in connector_credential_pairs: + if cc_pair is not None: + attempt_id = try_creating_indexing_task( + primary_app, + cc_pair, + search_settings, + run_info.from_beginning, + db_session, + r, + tenant_id, + ) + if attempt_id: + logger.info( + f"connector_run_once - try_creating_indexing_task succeeded: " + f"connector={run_info.connector_id} " + f"cc_pair={cc_pair.id} " + f"attempt={attempt_id} " + ) + index_attempt_ids.append(attempt_id) + else: + logger.info( + f"connector_run_once - try_creating_indexing_task failed: " + f"connector={run_info.connector_id} " + f"cc_pair={cc_pair.id}" + ) if not index_attempt_ids: + msg = "No new indexing attempts created, indexing
jobs are queued or running." + logger.info(msg) raise HTTPException( status_code=400, - detail="No new indexing attempts created, indexing jobs are queued or running.", + detail=msg, ) + msg = f"Successfully created {len(index_attempt_ids)} index attempts. {index_attempt_ids}" return StatusResponse( success=True, - message=f"Successfully created {len(index_attempt_ids)} index attempts", + message=msg, data=index_attempt_ids, ) @@ -837,7 +908,7 @@ def gmail_auth( httponly=True, max_age=600, ) - return AuthUrl(auth_url=get_gmail_auth_url(int(credential_id))) + return AuthUrl(auth_url=get_auth_url(int(credential_id), DocumentSource.GMAIL)) @router.get("/connector/google-drive/authorize/{credential_id}") @@ -851,7 +922,9 @@ def google_drive_auth( httponly=True, max_age=600, ) - return AuthUrl(auth_url=get_auth_url(int(credential_id))) + return AuthUrl( + auth_url=get_auth_url(int(credential_id), DocumentSource.GOOGLE_DRIVE) + ) @router.get("/connector/gmail/callback") @@ -868,12 +941,10 @@ def gmail_callback( ) credential_id = int(credential_id_cookie) verify_csrf(credential_id, callback.state) - if ( - update_gmail_credential_access_tokens( - callback.code, credential_id, user, db_session - ) - is None - ): + credentials: Credentials | None = update_credential_access_tokens( + callback.code, credential_id, user, db_session, DocumentSource.GMAIL + ) + if credentials is None: raise HTTPException( status_code=500, detail="Unable to fetch Gmail access tokens" ) @@ -895,10 +966,11 @@ def google_drive_callback( ) credential_id = int(credential_id_cookie) verify_csrf(credential_id, callback.state) - if ( - update_credential_access_tokens(callback.code, credential_id, user, db_session) - is None - ): + + credentials: Credentials | None = update_credential_access_tokens( + callback.code, credential_id, user, db_session, DocumentSource.GOOGLE_DRIVE + ) + if credentials is None: raise HTTPException( status_code=500, detail="Unable to fetch Google Drive access tokens" ) diff --git a/backend/danswer/server/documents/credential.py b/backend/danswer/server/documents/credential.py index 3d965481bf5..602ca27ee5c 100644 --- a/backend/danswer/server/documents/credential.py +++ b/backend/danswer/server/documents/credential.py @@ -8,6 +8,8 @@ from danswer.auth.users import current_curator_or_admin_user from danswer.auth.users import current_user from danswer.db.credentials import alter_credential +from danswer.db.credentials import cleanup_gmail_credentials +from danswer.db.credentials import cleanup_google_drive_credentials from danswer.db.credentials import create_credential from danswer.db.credentials import CREDENTIAL_PERMISSIONS_TO_IGNORE from danswer.db.credentials import delete_credential @@ -26,7 +28,7 @@ from danswer.server.documents.models import ObjectCreationIdResponse from danswer.server.models import StatusResponse from danswer.utils.logger import setup_logger -from ee.danswer.db.user_group import validate_user_creation_permissions +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop logger = setup_logger() @@ -79,18 +81,6 @@ def get_cc_source_full_info( ] -@router.get("/credential/{id}") -def list_credentials_by_id( - user: User | None = Depends(current_user), - db_session: Session = Depends(get_session), -) -> list[CredentialSnapshot]: - credentials = fetch_credentials(db_session=db_session, user=user) - return [ - CredentialSnapshot.from_credential_db_model(credential) - for credential in credentials - ] - - @router.delete("/admin/credential/{credential_id}") def 
delete_credential_by_id_admin( credential_id: int, @@ -131,13 +121,21 @@ def create_credential_from_model( db_session: Session = Depends(get_session), ) -> ObjectCreationIdResponse: if not _ignore_credential_permissions(credential_info.source): - validate_user_creation_permissions( + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=credential_info.groups, object_is_public=credential_info.curator_public, ) + # Temporary fix for empty Google App credentials + if credential_info.source == DocumentSource.GMAIL: + cleanup_gmail_credentials(db_session=db_session) + if credential_info.source == DocumentSource.GOOGLE_DRIVE: + cleanup_google_drive_credentials(db_session=db_session) + credential = create_credential(credential_info, user, db_session) return ObjectCreationIdResponse( id=credential.id, diff --git a/backend/danswer/server/documents/document.py b/backend/danswer/server/documents/document.py index bf8cdbcef44..0b8d8e744bd 100644 --- a/backend/danswer/server/documents/document.py +++ b/backend/danswer/server/documents/document.py @@ -5,6 +5,10 @@ from sqlalchemy.orm import Session from danswer.auth.users import current_user +from danswer.context.search.models import IndexFilters +from danswer.context.search.preprocessing.access_filters import ( + build_access_filters_for_user, +) from danswer.db.engine import get_session from danswer.db.models import User from danswer.db.search_settings import get_current_search_settings @@ -12,8 +16,6 @@ from danswer.document_index.interfaces import VespaChunkRequest from danswer.natural_language_processing.utils import get_tokenizer from danswer.prompts.prompt_utils import build_doc_context_str -from danswer.search.models import IndexFilters -from danswer.search.preprocessing.access_filters import build_access_filters_for_user from danswer.server.documents.models import ChunkInfo from danswer.server.documents.models import DocumentInfo diff --git a/backend/danswer/server/documents/models.py b/backend/danswer/server/documents/models.py index 517813892b8..7b523d929ec 100644 --- a/backend/danswer/server/documents/models.py +++ b/backend/danswer/server/documents/models.py @@ -4,16 +4,17 @@ from pydantic import BaseModel from pydantic import Field -from pydantic import model_validator from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX from danswer.configs.constants import DocumentSource from danswer.connectors.models import DocumentErrorSummary from danswer.connectors.models import InputType +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.models import Connector from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Credential +from danswer.db.models import Document as DbDocument from danswer.db.models import IndexAttempt from danswer.db.models import IndexAttemptError as DbIndexAttemptError from danswer.db.models import IndexingStatus @@ -21,6 +22,20 @@ from danswer.server.utils import mask_credential_dict +class DocumentSyncStatus(BaseModel): + doc_id: str + last_synced: datetime | None + last_modified: datetime | None + + @classmethod + def from_model(cls, doc: DbDocument) -> "DocumentSyncStatus": + return DocumentSyncStatus( + doc_id=doc.id, + last_synced=doc.last_synced, + last_modified=doc.last_modified, + ) + + class DocumentInfo(BaseModel): num_chunks: int num_tokens: int @@ -49,11 +64,11 @@ class ConnectorBase(BaseModel): class 
ConnectorUpdateRequest(ConnectorBase): - is_public: bool = True + access_type: AccessType groups: list[int] = Field(default_factory=list) def to_connector_base(self) -> ConnectorBase: - return ConnectorBase(**self.model_dump(exclude={"is_public", "groups"})) + return ConnectorBase(**self.model_dump(exclude={"access_type", "groups"})) class ConnectorSnapshot(ConnectorBase): @@ -218,9 +233,12 @@ class CCPairFullInfo(BaseModel): number_of_index_attempts: int last_index_attempt_status: IndexingStatus | None latest_deletion_attempt: DeletionAttemptSnapshot | None - is_public: bool + access_type: AccessType is_editable_for_current_user: bool deletion_failure_message: str | None + indexing: bool + creator: UUID | None + creator_email: str | None @classmethod def from_models( @@ -231,6 +249,7 @@ def from_models( last_index_attempt: IndexAttempt | None, num_docs_indexed: int, # not ideal, but this must be computed separately is_editable_for_current_user: bool, + indexing: bool, ) -> "CCPairFullInfo": # figure out if we need to artificially deflate the number of docs indexed. # This is required since the total number of docs indexed by a CC Pair is @@ -261,12 +280,25 @@ def from_models( number_of_index_attempts=number_of_index_attempts, last_index_attempt_status=last_indexing_status, latest_deletion_attempt=latest_deletion_attempt, - is_public=cc_pair_model.is_public, + access_type=cc_pair_model.access_type, is_editable_for_current_user=is_editable_for_current_user, deletion_failure_message=cc_pair_model.deletion_failure_message, + indexing=indexing, + creator=cc_pair_model.creator_id, + creator_email=cc_pair_model.creator.email if cc_pair_model.creator else None, ) +class CeleryTaskStatus(BaseModel): + id: str + name: str + status: TaskStatus + start_time: datetime | None + register_time: datetime | None + + class FailedConnectorIndexingStatus(BaseModel): """Simplified version of ConnectorIndexingStatus for failed indexing attempts""" @@ -288,7 +320,7 @@ class ConnectorIndexingStatus(BaseModel): credential: CredentialSnapshot owner: str groups: list[int] - public_doc: bool + access_type: AccessType last_finished_status: IndexingStatus | None last_status: IndexingStatus | None last_success: datetime | None @@ -298,6 +330,10 @@ class ConnectorIndexingStatus(BaseModel): deletion_attempt: DeletionAttemptSnapshot | None is_deletable: bool + # index attempt in db can be marked successful while celery/redis + # is still running/cleaning up + in_progress: bool + class ConnectorCredentialPairIdentifier(BaseModel): connector_id: int @@ -306,7 +342,8 @@ class ConnectorCredentialPairIdentifier(BaseModel): class ConnectorCredentialPairMetadata(BaseModel): name: str | None = None - is_public: bool | None = None + access_type: AccessType + auto_sync_options: dict[str, Any] | None = None groups: list[int] = Field(default_factory=list) @@ -360,18 +397,7 @@ class GoogleServiceAccountKey(BaseModel): class GoogleServiceAccountCredentialRequest(BaseModel): - google_drive_delegated_user: str | None = None # email of user to impersonate - gmail_delegated_user: str | None = None # email of user to impersonate - - @model_validator(mode="after") - def check_user_delegation(self) -> "GoogleServiceAccountCredentialRequest": - if (self.google_drive_delegated_user is None) == ( - self.gmail_delegated_user is None - ): - raise ValueError( - "Exactly one of google_drive_delegated_user or gmail_delegated_user must be set" - ) - return self + google_primary_admin: str | None = None # email of user to impersonate class
FileUploadResponse(BaseModel): diff --git a/backend/danswer/server/features/document_set/api.py b/backend/danswer/server/features/document_set/api.py index c9cea2cf2a2..26287d3f6e4 100644 --- a/backend/danswer/server/features/document_set/api.py +++ b/backend/danswer/server/features/document_set/api.py @@ -18,7 +18,7 @@ from danswer.server.features.document_set.models import DocumentSet from danswer.server.features.document_set.models import DocumentSetCreationRequest from danswer.server.features.document_set.models import DocumentSetUpdateRequest -from ee.danswer.db.user_group import validate_user_creation_permissions +from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop router = APIRouter(prefix="/manage") @@ -30,7 +30,9 @@ def create_document_set( user: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), ) -> int: - validate_user_creation_permissions( + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=document_set_creation_request.groups, @@ -53,7 +55,9 @@ def patch_document_set( user: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), ) -> None: - validate_user_creation_permissions( + fetch_ee_implementation_or_noop( + "danswer.db.user_group", "validate_user_creation_permissions", None + )( db_session=db_session, user=user, target_group_ids=document_set_update_request.groups, diff --git a/backend/danswer/server/features/document_set/models.py b/backend/danswer/server/features/document_set/models.py index 55f3376545f..740cb6906cf 100644 --- a/backend/danswer/server/features/document_set/models.py +++ b/backend/danswer/server/features/document_set/models.py @@ -47,7 +47,6 @@ class DocumentSet(BaseModel): description: str cc_pair_descriptors: list[ConnectorCredentialPairDescriptor] is_up_to_date: bool - contains_non_public: bool is_public: bool # For Private Document Sets, who should be able to access these users: list[UUID] @@ -59,12 +58,6 @@ def from_model(cls, document_set_model: DocumentSetDBModel) -> "DocumentSet": id=document_set_model.id, name=document_set_model.name, description=document_set_model.description, - contains_non_public=any( - [ - not cc_pair.is_public - for cc_pair in document_set_model.connector_credential_pairs - ] - ), cc_pair_descriptors=[ ConnectorCredentialPairDescriptor( id=cc_pair.id, diff --git a/backend/danswer/server/features/folder/models.py b/backend/danswer/server/features/folder/models.py index d7b161414a3..3f7e1304cbc 100644 --- a/backend/danswer/server/features/folder/models.py +++ b/backend/danswer/server/features/folder/models.py @@ -1,3 +1,5 @@ +from uuid import UUID + from pydantic import BaseModel from danswer.server.query_and_chat.models import ChatSessionDetails @@ -23,7 +25,7 @@ class FolderUpdateRequest(BaseModel): class FolderChatSessionRequest(BaseModel): - chat_session_id: int + chat_session_id: UUID class DeleteFolderOptions(BaseModel): diff --git a/backend/danswer/server/features/notifications/api.py b/backend/danswer/server/features/notifications/api.py new file mode 100644 index 00000000000..a4f5415a6a1 --- /dev/null +++ b/backend/danswer/server/features/notifications/api.py @@ -0,0 +1,47 @@ +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from sqlalchemy.orm import Session + +from danswer.auth.users import current_user +from danswer.db.engine import get_session +from 
danswer.db.models import User +from danswer.db.notification import dismiss_notification +from danswer.db.notification import get_notification_by_id +from danswer.db.notification import get_notifications +from danswer.server.settings.models import Notification as NotificationModel +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +router = APIRouter(prefix="/notifications") + + +@router.get("") +def get_notifications_api( + user: User = Depends(current_user), + db_session: Session = Depends(get_session), +) -> list[NotificationModel]: + notifications = [ + NotificationModel.from_model(notif) + for notif in get_notifications(user, db_session, include_dismissed=False) + ] + return notifications + + +@router.post("/{notification_id}/dismiss") +def dismiss_notification_endpoint( + notification_id: int, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> None: + try: + notification = get_notification_by_id(notification_id, user, db_session) + except PermissionError: + raise HTTPException( + status_code=403, detail="Not authorized to dismiss this notification" + ) + except ValueError: + raise HTTPException(status_code=404, detail="Notification not found") + + dismiss_notification(notification, db_session) diff --git a/backend/danswer/server/features/persona/api.py b/backend/danswer/server/features/persona/api.py index bcc4800b860..fd092fb90ef 100644 --- a/backend/danswer/server/features/persona/api.py +++ b/backend/danswer/server/features/persona/api.py @@ -11,16 +11,23 @@ from danswer.auth.users import current_admin_user from danswer.auth.users import current_curator_or_admin_user +from danswer.auth.users import current_limited_user from danswer.auth.users import current_user from danswer.configs.constants import FileOrigin +from danswer.configs.constants import NotificationType from danswer.db.engine import get_session from danswer.db.models import User +from danswer.db.notification import create_notification +from danswer.db.persona import create_assistant_category from danswer.db.persona import create_update_persona +from danswer.db.persona import delete_persona_category +from danswer.db.persona import get_assistant_categories from danswer.db.persona import get_persona_by_id from danswer.db.persona import get_personas from danswer.db.persona import mark_persona_as_deleted from danswer.db.persona import mark_persona_as_not_deleted from danswer.db.persona import update_all_personas_display_priority +from danswer.db.persona import update_persona_category from danswer.db.persona import update_persona_public_status from danswer.db.persona import update_persona_shared_users from danswer.db.persona import update_persona_visibility @@ -28,9 +35,14 @@ from danswer.file_store.models import ChatFileType from danswer.llm.answering.prompts.utils import build_dummy_prompt from danswer.server.features.persona.models import CreatePersonaRequest +from danswer.server.features.persona.models import ImageGenerationToolStatus +from danswer.server.features.persona.models import PersonaCategoryCreate +from danswer.server.features.persona.models import PersonaCategoryResponse +from danswer.server.features.persona.models import PersonaSharedNotificationData from danswer.server.features.persona.models import PersonaSnapshot from danswer.server.features.persona.models import PromptTemplateResponse from danswer.server.models import DisplayPriorityRequest +from danswer.tools.utils import is_image_generation_available from danswer.utils.logger import 
setup_logger @@ -164,6 +176,9 @@ def create_persona( ) +# NOTE: This endpoint cannot update persona configuration options that +# are core to the persona, such as its display priority and +# whether or not the assistant is a built-in / default assistant @basic_router.patch("/{persona_id}") def update_persona( persona_id: int, @@ -179,15 +194,69 @@ def update_persona( ) +class PersonaCategoryPatchRequest(BaseModel): + category_description: str + category_name: str + + +@basic_router.get("/categories") +def get_categories( + db: Session = Depends(get_session), + _: User | None = Depends(current_user), +) -> list[PersonaCategoryResponse]: + return [ + PersonaCategoryResponse.from_model(category) + for category in get_assistant_categories(db_session=db) + ] + + +@admin_router.post("/categories") +def create_category( + category: PersonaCategoryCreate, + db: Session = Depends(get_session), + _: User | None = Depends(current_admin_user), +) -> PersonaCategoryResponse: + """Create a new assistant category""" + category_model = create_assistant_category( + name=category.name, description=category.description, db_session=db + ) + return PersonaCategoryResponse.from_model(category_model) + + +@admin_router.patch("/category/{category_id}") +def patch_persona_category( + category_id: int, + persona_category_patch_request: PersonaCategoryPatchRequest, + _: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), +) -> None: + update_persona_category( + category_id=category_id, + category_description=persona_category_patch_request.category_description, + category_name=persona_category_patch_request.category_name, + db_session=db_session, + ) + + +@admin_router.delete("/category/{category_id}") +def delete_category( + category_id: int, + _: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), +) -> None: + delete_persona_category(category_id=category_id, db_session=db_session) + + class PersonaShareRequest(BaseModel): user_ids: list[UUID] +# We notify each user when a persona is shared with them @basic_router.patch("/{persona_id}/share") def share_persona( persona_id: int, persona_share_request: PersonaShareRequest, - user: User | None = Depends(current_user), + user: User = Depends(current_user), db_session: Session = Depends(get_session), ) -> None: update_persona_shared_users( @@ -197,6 +266,18 @@ def share_persona( db_session=db_session, ) + for user_id in persona_share_request.user_ids: + # Don't notify the user that they have access to their own persona + if user_id != user.id: + create_notification( + user_id=user_id, + notif_type=NotificationType.PERSONA_SHARED, + db_session=db_session, + additional_data=PersonaSharedNotificationData( + persona_id=persona_id, + ).model_dump(), + ) + @basic_router.delete("/{persona_id}") def delete_persona( @@ -211,28 +292,51 @@ def delete_persona( ) +@basic_router.get("/image-generation-tool") +def get_image_generation_tool( + _: User + | None = Depends(current_user), # User param not used but kept for consistency + db_session: Session = Depends(get_session), +) -> ImageGenerationToolStatus: + is_available = is_image_generation_available(db_session=db_session) + return ImageGenerationToolStatus(is_available=is_available) + + @basic_router.get("") def list_personas( user: User | None = Depends(current_user), db_session: Session = Depends(get_session), include_deleted: bool = False, + persona_ids: list[int] = Query(None), ) -> list[PersonaSnapshot]: -
return [ - PersonaSnapshot.from_model(persona) - for persona in get_personas( - user=user, - include_deleted=include_deleted, - db_session=db_session, - get_editable=False, - joinedload_all=True, + personas = get_personas( + user=user, + include_deleted=include_deleted, + db_session=db_session, + get_editable=False, + joinedload_all=True, + ) + + if persona_ids: + personas = [p for p in personas if p.id in persona_ids] + + # Filter out personas with unavailable tools + personas = [ + p + for p in personas + if not ( + any(tool.in_code_tool_id == "ImageGenerationTool" for tool in p.tools) + and not is_image_generation_available(db_session=db_session) ) ] + return [PersonaSnapshot.from_model(p) for p in personas] + @basic_router.get("/{persona_id}") def get_persona( persona_id: int, - user: User | None = Depends(current_user), + user: User | None = Depends(current_limited_user), db_session: Session = Depends(get_session), ) -> PersonaSnapshot: return PersonaSnapshot.from_model( diff --git a/backend/danswer/server/features/persona/models.py b/backend/danswer/server/features/persona/models.py index 777ef2037ee..f32a62090d2 100644 --- a/backend/danswer/server/features/persona/models.py +++ b/backend/danswer/server/features/persona/models.py @@ -1,18 +1,19 @@ +from datetime import datetime from uuid import UUID from pydantic import BaseModel from pydantic import Field +from danswer.context.search.enums import RecencyBiasSetting from danswer.db.models import Persona +from danswer.db.models import PersonaCategory from danswer.db.models import StarterMessage -from danswer.search.enums import RecencyBiasSetting from danswer.server.features.document_set.models import DocumentSet from danswer.server.features.prompt.models import PromptSnapshot -from danswer.server.features.tool.api import ToolSnapshot +from danswer.server.features.tool.models import ToolSnapshot from danswer.server.models import MinimalUserSnapshot from danswer.utils.logger import setup_logger - logger = setup_logger() @@ -38,6 +39,10 @@ class CreatePersonaRequest(BaseModel): icon_shape: int | None = None uploaded_image_id: str | None = None # New field for uploaded image remove_image: bool | None = None + is_default_persona: bool = False + display_priority: int | None = None + search_start_date: datetime | None = None + category_id: int | None = None class PersonaSnapshot(BaseModel): @@ -54,7 +59,7 @@ class PersonaSnapshot(BaseModel): llm_model_provider_override: str | None llm_model_version_override: str | None starter_messages: list[StarterMessage] | None - default_persona: bool + builtin_persona: bool prompts: list[PromptSnapshot] tools: list[ToolSnapshot] document_sets: list[DocumentSet] @@ -63,6 +68,9 @@ class PersonaSnapshot(BaseModel): icon_color: str | None icon_shape: int | None uploaded_image_id: str | None = None + is_default_persona: bool + search_start_date: datetime | None = None + category_id: int | None = None @classmethod def from_model( @@ -93,7 +101,8 @@ def from_model( llm_model_provider_override=persona.llm_model_provider_override, llm_model_version_override=persona.llm_model_version_override, starter_messages=persona.starter_messages, - default_persona=persona.default_persona, + builtin_persona=persona.builtin_persona, + is_default_persona=persona.is_default_persona, prompts=[PromptSnapshot.from_model(prompt) for prompt in persona.prompts], tools=[ToolSnapshot.from_model(tool) for tool in persona.tools], document_sets=[ @@ -108,8 +117,37 @@ def from_model( icon_color=persona.icon_color, 
icon_shape=persona.icon_shape, uploaded_image_id=persona.uploaded_image_id, + search_start_date=persona.search_start_date, + category_id=persona.category_id, ) class PromptTemplateResponse(BaseModel): final_prompt_template: str + + +class PersonaSharedNotificationData(BaseModel): + persona_id: int + + +class ImageGenerationToolStatus(BaseModel): + is_available: bool + + +class PersonaCategoryCreate(BaseModel): + name: str + description: str + + +class PersonaCategoryResponse(BaseModel): + id: int + name: str + description: str | None + + @classmethod + def from_model(cls, category: PersonaCategory) -> "PersonaCategoryResponse": + return PersonaCategoryResponse( + id=category.id, + name=category.name, + description=category.description, + ) diff --git a/backend/danswer/server/features/tool/api.py b/backend/danswer/server/features/tool/api.py index 9635a276507..48f857780ba 100644 --- a/backend/danswer/server/features/tool/api.py +++ b/backend/danswer/server/features/tool/api.py @@ -15,27 +15,25 @@ from danswer.db.tools import get_tool_by_id from danswer.db.tools import get_tools from danswer.db.tools import update_tool +from danswer.server.features.tool.models import CustomToolCreate +from danswer.server.features.tool.models import CustomToolUpdate from danswer.server.features.tool.models import ToolSnapshot -from danswer.tools.custom.openapi_parsing import MethodSpec -from danswer.tools.custom.openapi_parsing import openapi_to_method_specs -from danswer.tools.custom.openapi_parsing import validate_openapi_schema +from danswer.tools.tool_implementations.custom.openapi_parsing import MethodSpec +from danswer.tools.tool_implementations.custom.openapi_parsing import ( + openapi_to_method_specs, +) +from danswer.tools.tool_implementations.custom.openapi_parsing import ( + validate_openapi_schema, +) +from danswer.tools.tool_implementations.images.image_generation_tool import ( + ImageGenerationTool, +) +from danswer.tools.utils import is_image_generation_available router = APIRouter(prefix="/tool") admin_router = APIRouter(prefix="/admin/tool") -class CustomToolCreate(BaseModel): - name: str - description: str | None = None - definition: dict[str, Any] - - -class CustomToolUpdate(BaseModel): - name: str | None = None - description: str | None = None - definition: dict[str, Any] | None = None - - def _validate_tool_definition(definition: dict[str, Any]) -> None: try: validate_openapi_schema(definition) @@ -54,6 +52,7 @@ def create_custom_tool( name=tool_data.name, description=tool_data.description, openapi_schema=tool_data.definition, + custom_headers=tool_data.custom_headers, user_id=user.id if user else None, db_session=db_session, ) @@ -74,6 +73,7 @@ def update_custom_tool( name=tool_data.name, description=tool_data.description, openapi_schema=tool_data.definition, + custom_headers=tool_data.custom_headers, user_id=user.id if user else None, db_session=db_session, ) @@ -135,4 +135,9 @@ def list_tools( _: User | None = Depends(current_user), ) -> list[ToolSnapshot]: tools = get_tools(db_session) - return [ToolSnapshot.from_model(tool) for tool in tools] + return [ + ToolSnapshot.from_model(tool) + for tool in tools + if tool.in_code_tool_id != ImageGenerationTool.name + or is_image_generation_available(db_session=db_session) + ] diff --git a/backend/danswer/server/features/tool/models.py b/backend/danswer/server/features/tool/models.py index 0c1da965d4f..bf3e4d159b6 100644 --- a/backend/danswer/server/features/tool/models.py +++ b/backend/danswer/server/features/tool/models.py @@ -12,6 +12,7 @@ 
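# --- Editor's note: illustrative sketch, not part of the patch ---------------
# The tool models in the hunk just below add a `custom_headers` field (a list
# of key/value `Header` objects) to ToolSnapshot, CustomToolCreate and
# CustomToolUpdate, and the admin tool endpoints now pass it through to the DB
# layer. Assuming that, a request body for creating a custom tool would look
# roughly like the dict below; the tool name, schema and header values are
# hypothetical, and the exact route under the "/admin/tool" prefix is not
# shown in this diff.
example_custom_tool_payload = {
    "name": "ticketing-tool",                         # hypothetical tool name
    "description": "Creates tickets via an OpenAPI spec",
    "definition": {"openapi": "3.0.0", "paths": {}},  # stand-in OpenAPI schema
    "custom_headers": [                               # new field: list of {key, value}
        {"key": "Authorization", "value": "Bearer <token>"},
        {"key": "X-Team", "value": "support"},
    ],
}
# -----------------------------------------------------------------------------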
class ToolSnapshot(BaseModel): definition: dict[str, Any] | None display_name: str in_code_tool_id: str | None + custom_headers: list[Any] | None @classmethod def from_model(cls, tool: Tool) -> "ToolSnapshot": @@ -22,4 +23,24 @@ def from_model(cls, tool: Tool) -> "ToolSnapshot": definition=tool.openapi_schema, display_name=tool.display_name or tool.name, in_code_tool_id=tool.in_code_tool_id, + custom_headers=tool.custom_headers, ) + + +class Header(BaseModel): + key: str + value: str + + +class CustomToolCreate(BaseModel): + name: str + description: str | None = None + definition: dict[str, Any] + custom_headers: list[Header] | None = None + + +class CustomToolUpdate(BaseModel): + name: str | None = None + description: str | None = None + definition: dict[str, Any] | None = None + custom_headers: list[Header] | None = None diff --git a/backend/danswer/server/gpts/api.py b/backend/danswer/server/gpts/api.py index 1bebc3bfc1e..1d0684cf61a 100644 --- a/backend/danswer/server/gpts/api.py +++ b/backend/danswer/server/gpts/api.py @@ -6,11 +6,11 @@ from pydantic import BaseModel from sqlalchemy.orm import Session +from danswer.context.search.models import SearchRequest +from danswer.context.search.pipeline import SearchPipeline from danswer.db.engine import get_session from danswer.db.models import User from danswer.llm.factory import get_default_llms -from danswer.search.models import SearchRequest -from danswer.search.pipeline import SearchPipeline from danswer.server.danswer_api.ingestion import api_key_dep from danswer.utils.logger import setup_logger diff --git a/backend/danswer/server/long_term_logs/long_term_logs_api.py b/backend/danswer/server/long_term_logs/long_term_logs_api.py new file mode 100644 index 00000000000..ac7c42e5a0e --- /dev/null +++ b/backend/danswer/server/long_term_logs/long_term_logs_api.py @@ -0,0 +1,106 @@ +import json +import shutil +import tempfile +import zipfile +from datetime import datetime +from pathlib import Path + +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from fastapi.responses import FileResponse +from starlette.background import BackgroundTask + +from danswer.auth.users import current_admin_user +from danswer.db.models import User +from danswer.utils.long_term_log import LongTermLogger + +router = APIRouter(prefix="/admin/long-term-logs") + + +@router.get("/{category}") +def get_long_term_logs( + category: str, + start_time: datetime | None = None, + end_time: datetime | None = None, + _: User | None = Depends(current_admin_user), +) -> list[dict | list | str]: + """Fetch logs for a specific category within an optional time range. + Only accessible by admin users.""" + try: + logger = LongTermLogger() + return logger.fetch_category( # type: ignore + category=category, + start_time=start_time, + end_time=end_time, + ) + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Failed to fetch logs for category '{category}': {str(e)}", + ) + + +@router.get("/{category}/download") +def download_long_term_logs_zip( + category: str, + start_time: datetime | None = None, + end_time: datetime | None = None, + _: User | None = Depends(current_admin_user), +) -> FileResponse: + """Download logs for a specific category as a ZIP file. 
+ Only accessible by admin users.""" + try: + logger = LongTermLogger() + logs = logger.fetch_category( + category=category, + start_time=start_time, + end_time=end_time, + ) + + # Create temporary files without using context manager + temp_dir = tempfile.mkdtemp() + temp_dir_path = Path(temp_dir) + + # Create JSON file + json_path = temp_dir_path / f"{category}-logs.json" + with open(json_path, "w") as f: + json.dump(logs, f, indent=2, default=str) + + # Create ZIP file + zip_path = temp_dir_path / f"{category}-logs.zip" + with zipfile.ZipFile(zip_path, "w") as zip_file: + zip_file.write(json_path, json_path.name) + + # Let FastAPI handle cleanup by setting background tasks + return FileResponse( + path=zip_path, + filename=f"{category}-logs.zip", + media_type="application/zip", + background=BackgroundTask( + lambda: shutil.rmtree(temp_dir, ignore_errors=True) + ), + ) + + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Failed to create ZIP file for category '{category}': {str(e)}", + ) + + +@router.get("") +def get_available_categories( + _: User | None = Depends(current_admin_user), +) -> list[str]: + """Get a list of all available log categories. + Only accessible by admin users.""" + try: + logger = LongTermLogger() + # Get all subdirectories in the log directory + categories = [d.name for d in logger.log_file_path.iterdir() if d.is_dir()] + return categories + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch log categories: {str(e)}" + ) diff --git a/backend/danswer/server/manage/administrative.py b/backend/danswer/server/manage/administrative.py index 481d2fedb9e..1ceeb776abc 100644 --- a/backend/danswer/server/manage/administrative.py +++ b/backend/danswer/server/manage/administrative.py @@ -10,6 +10,7 @@ from danswer.auth.users import current_admin_user from danswer.auth.users import current_curator_or_admin_user +from danswer.background.celery.versioned_apps.primary import app as primary_app from danswer.configs.app_configs import GENERATIVE_MODEL_ACCESS_CHECK_FREQ from danswer.configs.constants import DanswerCeleryPriority from danswer.configs.constants import DocumentSource @@ -18,7 +19,7 @@ from danswer.db.connector_credential_pair import ( update_connector_credential_pair_from_id, ) -from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed +from danswer.db.engine import get_current_tenant_id from danswer.db.engine import get_session from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.feedback import fetch_docs_ranked_by_boost @@ -28,9 +29,9 @@ from danswer.db.models import User from danswer.document_index.document_index_utils import get_both_index_names from danswer.document_index.factory import get_default_document_index -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError from danswer.file_store.file_store import get_default_file_store +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.llm.factory import get_default_llms from danswer.llm.utils import test_llm from danswer.server.documents.models import ConnectorCredentialPairIdentifier @@ -113,7 +114,7 @@ def validate_existing_genai_api_key( _: User = Depends(current_admin_user), ) -> None: # Only validate every so often - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() curr_time = datetime.now(tz=timezone.utc) try: last_check = 
datetime.fromtimestamp( @@ -122,7 +123,7 @@ def validate_existing_genai_api_key( check_freq_sec = timedelta(seconds=GENERATIVE_MODEL_ACCESS_CHECK_FREQ) if curr_time - last_check < check_freq_sec: return - except ConfigNotFoundError: + except KvKeyNotFoundError: # First time checking the key, nothing unusual pass @@ -145,11 +146,8 @@ def create_deletion_attempt_for_connector_id( connector_credential_pair_identifier: ConnectorCredentialPairIdentifier, user: User = Depends(current_curator_or_admin_user), db_session: Session = Depends(get_session), + tenant_id: str = Depends(get_current_tenant_id), ) -> None: - from danswer.background.celery.celery_app import ( - check_for_connector_deletion_task, - ) - connector_id = connector_credential_pair_identifier.connector_id credential_id = connector_credential_pair_identifier.credential_id @@ -176,15 +174,19 @@ def create_deletion_attempt_for_connector_id( cc_pair_id=cc_pair.id, db_session=db_session, include_secondary_index=True ) + # TODO(rkuo): 2024-10-24 - check_deletion_attempt_is_allowed shouldn't be necessary + # any more due to background locking improvements. + # Remove the below permanently if everything is behaving for 30 days. + # Check if the deletion attempt should be allowed - deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed( - connector_credential_pair=cc_pair, db_session=db_session - ) - if deletion_attempt_disallowed_reason: - raise HTTPException( - status_code=400, - detail=deletion_attempt_disallowed_reason, - ) + # deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed( + # connector_credential_pair=cc_pair, db_session=db_session + # ) + # if deletion_attempt_disallowed_reason: + # raise HTTPException( + # status_code=400, + # detail=deletion_attempt_disallowed_reason, + # ) # mark as deleting update_connector_credential_pair_from_id( @@ -193,9 +195,13 @@ def create_deletion_attempt_for_connector_id( status=ConnectorCredentialPairStatus.DELETING, ) - # run the beat task to pick up this deletion early - check_for_connector_deletion_task.apply_async( + db_session.commit() + + # run the beat task to pick up this deletion from the db immediately + primary_app.send_task( + "check_for_connector_deletion_task", priority=DanswerCeleryPriority.HIGH, + kwargs={"tenant_id": tenant_id}, ) if cc_pair.connector.source == DocumentSource.FILE: diff --git a/backend/danswer/server/manage/embedding/api.py b/backend/danswer/server/manage/embedding/api.py index eac872810ef..5d6e55e7a6d 100644 --- a/backend/danswer/server/manage/embedding/api.py +++ b/backend/danswer/server/manage/embedding/api.py @@ -43,6 +43,8 @@ def test_embedding_configuration( api_url=test_llm_request.api_url, provider_type=test_llm_request.provider_type, model_name=test_llm_request.model_name, + api_version=test_llm_request.api_version, + deployment_name=test_llm_request.deployment_name, normalize=False, query_prefix=None, passage_prefix=None, diff --git a/backend/danswer/server/manage/embedding/models.py b/backend/danswer/server/manage/embedding/models.py index b4ca7862b55..a7e7cc8e1ac 100644 --- a/backend/danswer/server/manage/embedding/models.py +++ b/backend/danswer/server/manage/embedding/models.py @@ -17,12 +17,19 @@ class TestEmbeddingRequest(BaseModel): api_key: str | None = None api_url: str | None = None model_name: str | None = None + api_version: str | None = None + deployment_name: str | None = None + + # This disables the "model_" protected namespace for pydantic + model_config = {"protected_namespaces": ()} class 
CloudEmbeddingProvider(BaseModel): provider_type: EmbeddingProvider api_key: str | None = None api_url: str | None = None + api_version: str | None = None + deployment_name: str | None = None @classmethod def from_request( @@ -32,6 +39,8 @@ def from_request( provider_type=cloud_provider_model.provider_type, api_key=cloud_provider_model.api_key, api_url=cloud_provider_model.api_url, + api_version=cloud_provider_model.api_version, + deployment_name=cloud_provider_model.deployment_name, ) @@ -39,3 +48,5 @@ class CloudEmbeddingProviderCreationRequest(BaseModel): provider_type: EmbeddingProvider api_key: str | None = None api_url: str | None = None + api_version: str | None = None + deployment_name: str | None = None diff --git a/backend/danswer/server/manage/llm/api.py b/backend/danswer/server/manage/llm/api.py index 4e57ec7bc35..f52877d919a 100644 --- a/backend/danswer/server/manage/llm/api.py +++ b/backend/danswer/server/manage/llm/api.py @@ -3,12 +3,14 @@ from fastapi import APIRouter from fastapi import Depends from fastapi import HTTPException +from fastapi import Query from sqlalchemy.orm import Session from danswer.auth.users import current_admin_user from danswer.auth.users import current_user from danswer.db.engine import get_session from danswer.db.llm import fetch_existing_llm_providers +from danswer.db.llm import fetch_provider from danswer.db.llm import remove_llm_provider from danswer.db.llm import update_default_provider from danswer.db.llm import upsert_llm_provider @@ -17,6 +19,7 @@ from danswer.llm.factory import get_llm from danswer.llm.llm_provider_options import fetch_available_well_known_llms from danswer.llm.llm_provider_options import WellKnownLLMProviderDescriptor +from danswer.llm.utils import litellm_exception_to_error_msg from danswer.llm.utils import test_llm from danswer.server.manage.llm.models import FullLLMProvider from danswer.server.manage.llm.models import LLMProviderDescriptor @@ -27,7 +30,6 @@ logger = setup_logger() - admin_router = APIRouter(prefix="/admin/llm") basic_router = APIRouter(prefix="/llm") @@ -51,7 +53,9 @@ def test_llm_configuration( api_base=test_llm_request.api_base, api_version=test_llm_request.api_version, custom_config=test_llm_request.custom_config, + deployment_name=test_llm_request.deployment_name, ) + functions_with_args: list[tuple[Callable, tuple]] = [(test_llm, (llm,))] if ( @@ -66,6 +70,7 @@ def test_llm_configuration( api_base=test_llm_request.api_base, api_version=test_llm_request.api_version, custom_config=test_llm_request.custom_config, + deployment_name=test_llm_request.deployment_name, ) functions_with_args.append((test_llm, (fast_llm,))) @@ -77,7 +82,10 @@ def test_llm_configuration( ) if error: - raise HTTPException(status_code=400, detail=error) + client_error_msg = litellm_exception_to_error_msg( + error, llm, fallback_to_error_msg=True + ) + raise HTTPException(status_code=400, detail=client_error_msg) @admin_router.post("/test/default") @@ -118,10 +126,45 @@ def list_llm_providers( @admin_router.put("/provider") def put_llm_provider( llm_provider: LLMProviderUpsertRequest, + is_creation: bool = Query( + False, + description="True if updating an existing provider, False if creating a new one", + ), _: User | None = Depends(current_admin_user), db_session: Session = Depends(get_session), ) -> FullLLMProvider: - return upsert_llm_provider(llm_provider=llm_provider, db_session=db_session) + # validate request (e.g. 
if we're intending to create but the name already exists we should throw an error) + # NOTE: may involve duplicate fetching to Postgres, but we're assuming SQLAlchemy is smart enough to cache + # the result + existing_provider = fetch_provider(db_session, llm_provider.name) + if existing_provider and is_creation: + raise HTTPException( + status_code=400, + detail=f"LLM Provider with name {llm_provider.name} already exists", + ) + + # Ensure default_model_name and fast_default_model_name are in display_model_names + # This is necessary for custom models and Bedrock/Azure models + if llm_provider.display_model_names is None: + llm_provider.display_model_names = [] + + if llm_provider.default_model_name not in llm_provider.display_model_names: + llm_provider.display_model_names.append(llm_provider.default_model_name) + + if ( + llm_provider.fast_default_model_name + and llm_provider.fast_default_model_name not in llm_provider.display_model_names + ): + llm_provider.display_model_names.append(llm_provider.fast_default_model_name) + + try: + return upsert_llm_provider( + llm_provider=llm_provider, + db_session=db_session, + ) + except ValueError as e: + logger.exception("Failed to upsert LLM Provider") + raise HTTPException(status_code=400, detail=str(e)) @admin_router.delete("/provider/{provider_id}") diff --git a/backend/danswer/server/manage/llm/models.py b/backend/danswer/server/manage/llm/models.py index 3ef66971003..9b371099c57 100644 --- a/backend/danswer/server/manage/llm/models.py +++ b/backend/danswer/server/manage/llm/models.py @@ -21,6 +21,7 @@ class TestLLMRequest(BaseModel): # model level default_model_name: str fast_default_model_name: str | None = None + deployment_name: str | None = None class LLMProviderDescriptor(BaseModel): @@ -66,6 +67,7 @@ class LLMProvider(BaseModel): is_public: bool = True groups: list[int] = Field(default_factory=list) display_model_names: list[str] | None = None + deployment_name: str | None = None class LLMProviderUpsertRequest(LLMProvider): @@ -100,4 +102,5 @@ def from_model(cls, llm_provider_model: "LLMProviderModel") -> "FullLLMProvider" ), is_public=llm_provider_model.is_public, groups=[group.id for group in llm_provider_model.groups], + deployment_name=llm_provider_model.deployment_name, ) diff --git a/backend/danswer/server/manage/models.py b/backend/danswer/server/manage/models.py index 7b0a3813a82..9c2960741f3 100644 --- a/backend/danswer/server/manage/models.py +++ b/backend/danswer/server/manage/models.py @@ -10,13 +10,14 @@ from danswer.auth.schemas import UserRole from danswer.configs.app_configs import TRACK_EXTERNAL_IDP_EXPIRY from danswer.configs.constants import AuthType +from danswer.context.search.models import SavedSearchSettings from danswer.danswerbot.slack.config import VALID_SLACK_FILTERS from danswer.db.models import AllowedAnswerFilters from danswer.db.models import ChannelConfig -from danswer.db.models import SlackBotConfig as SlackBotConfigModel +from danswer.db.models import SlackBot as SlackAppModel from danswer.db.models import SlackBotResponseType +from danswer.db.models import SlackChannelConfig as SlackChannelConfigModel from danswer.db.models import User -from danswer.search.models import SavedSearchSettings from danswer.server.features.persona.models import PersonaSnapshot from danswer.server.models import FullUserSnapshot from danswer.server.models import InvitedUserSnapshot @@ -40,6 +41,9 @@ class AuthTypeResponse(BaseModel): class UserPreferences(BaseModel): chosen_assistants: list[int] | None = None + 
hidden_assistants: list[int] = [] + visible_assistants: list[int] = [] + recent_assistants: list[int] | None = None default_model: str | None = None @@ -54,6 +58,8 @@ class UserInfo(BaseModel): oidc_expiry: datetime | None = None current_token_created_at: datetime | None = None current_token_expiry_length: int | None = None + is_cloud_superuser: bool = False + organization_name: str | None = None @classmethod def from_model( @@ -61,6 +67,8 @@ def from_model( user: User, current_token_created_at: datetime | None = None, expiry_length: int | None = None, + is_cloud_superuser: bool = False, + organization_name: str | None = None, ) -> "UserInfo": return cls( id=str(user.id), @@ -73,8 +81,11 @@ def from_model( UserPreferences( chosen_assistants=user.chosen_assistants, default_model=user.default_model, + hidden_assistants=user.hidden_assistants, + visible_assistants=user.visible_assistants, ) ), + organization_name=organization_name, # set to None if TRACK_EXTERNAL_IDP_EXPIRY is False so that we avoid cases # where they previously had this set + used OIDC, and now they switched to # basic auth are now constantly getting redirected back to the login page @@ -82,6 +93,7 @@ def from_model( oidc_expiry=user.oidc_expiry if TRACK_EXTERNAL_IDP_EXPIRY else None, current_token_created_at=current_token_created_at, current_token_expiry_length=expiry_length, + is_cloud_superuser=is_cloud_superuser, ) @@ -116,24 +128,35 @@ class HiddenUpdateRequest(BaseModel): hidden: bool +class SlackBotCreationRequest(BaseModel): + name: str + enabled: bool + + bot_token: str + app_token: str + + class SlackBotTokens(BaseModel): bot_token: str app_token: str model_config = ConfigDict(frozen=True) -class SlackBotConfigCreationRequest(BaseModel): - # currently, a persona is created for each slack bot config +class SlackChannelConfigCreationRequest(BaseModel): + slack_bot_id: int + # currently, a persona is created for each Slack channel config # in the future, `document_sets` will probably be replaced # by an optional `PersonaSnapshot` object. 
Keeping it like this # for now for simplicity / speed of development document_sets: list[int] | None = None - persona_id: ( - int | None - ) = None # NOTE: only one of `document_sets` / `persona_id` should be set - channel_names: list[str] + + # NOTE: only one of `document_sets` / `persona_id` should be set + persona_id: int | None = None + + channel_name: str respond_tag_only: bool = False respond_to_bots: bool = False + show_continue_in_web_ui: bool = False enable_auto_filters: bool = False # If no team members, assume respond in the channel to everyone respond_member_group_list: list[str] = Field(default_factory=list) @@ -154,14 +177,17 @@ def validate_filters(cls, value: list[str]) -> list[str]: return value @model_validator(mode="after") - def validate_document_sets_and_persona_id(self) -> "SlackBotConfigCreationRequest": + def validate_document_sets_and_persona_id( + self, + ) -> "SlackChannelConfigCreationRequest": if self.document_sets and self.persona_id: raise ValueError("Only one of `document_sets` / `persona_id` should be set") return self -class SlackBotConfig(BaseModel): +class SlackChannelConfig(BaseModel): + slack_bot_id: int id: int persona: PersonaSnapshot | None channel_config: ChannelConfig @@ -172,25 +198,53 @@ class SlackBotConfig(BaseModel): @classmethod def from_model( - cls, slack_bot_config_model: SlackBotConfigModel - ) -> "SlackBotConfig": + cls, slack_channel_config_model: SlackChannelConfigModel + ) -> "SlackChannelConfig": return cls( - id=slack_bot_config_model.id, + id=slack_channel_config_model.id, + slack_bot_id=slack_channel_config_model.slack_bot_id, persona=( PersonaSnapshot.from_model( - slack_bot_config_model.persona, allow_deleted=True + slack_channel_config_model.persona, allow_deleted=True ) - if slack_bot_config_model.persona + if slack_channel_config_model.persona else None ), - channel_config=slack_bot_config_model.channel_config, - response_type=slack_bot_config_model.response_type, + channel_config=slack_channel_config_model.channel_config, + response_type=slack_channel_config_model.response_type, # XXX this is going away soon standard_answer_categories=[ StandardAnswerCategory.from_model(standard_answer_category_model) - for standard_answer_category_model in slack_bot_config_model.standard_answer_categories + for standard_answer_category_model in slack_channel_config_model.standard_answer_categories ], - enable_auto_filters=slack_bot_config_model.enable_auto_filters, + enable_auto_filters=slack_channel_config_model.enable_auto_filters, + ) + + +class SlackBot(BaseModel): + """ + This model is identical to the SlackAppModel, but it contains + a `configs_count` field to make it easier to fetch the number + of SlackChannelConfigs associated with a SlackBot. 
+ """ + + id: int + name: str + enabled: bool + configs_count: int + + bot_token: str + app_token: str + + @classmethod + def from_model(cls, slack_bot_model: SlackAppModel) -> "SlackBot": + return cls( + id=slack_bot_model.id, + name=slack_bot_model.name, + enabled=slack_bot_model.enabled, + bot_token=slack_bot_model.bot_token, + app_token=slack_bot_model.app_token, + configs_count=len(slack_bot_model.slack_channel_configs), ) diff --git a/backend/danswer/server/manage/search_settings.py b/backend/danswer/server/manage/search_settings.py index c8433467f6c..5b8d7d8e1bb 100644 --- a/backend/danswer/server/manage/search_settings.py +++ b/backend/danswer/server/manage/search_settings.py @@ -7,6 +7,8 @@ from danswer.auth.users import current_admin_user from danswer.auth.users import current_user from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP +from danswer.context.search.models import SavedSearchSettings +from danswer.context.search.models import SearchSettingsCreationRequest from danswer.db.connector_credential_pair import get_connector_credential_pairs from danswer.db.connector_credential_pair import resync_cc_pair from danswer.db.engine import get_session @@ -21,16 +23,16 @@ from danswer.db.search_settings import update_current_search_settings from danswer.db.search_settings import update_search_settings_status from danswer.document_index.factory import get_default_document_index +from danswer.file_processing.unstructured import delete_unstructured_api_key +from danswer.file_processing.unstructured import get_unstructured_api_key +from danswer.file_processing.unstructured import update_unstructured_api_key from danswer.natural_language_processing.search_nlp_models import clean_model_name -from danswer.search.models import SavedSearchSettings -from danswer.search.models import SearchSettingsCreationRequest from danswer.server.manage.embedding.models import SearchSettingsDeleteRequest from danswer.server.manage.models import FullModelVersionResponse from danswer.server.models import IdReturn from danswer.utils.logger import setup_logger from shared_configs.configs import ALT_INDEX_SUFFIX - router = APIRouter(prefix="/search-settings") logger = setup_logger() @@ -113,6 +115,7 @@ def set_new_search_settings( for cc_pair in get_connector_credential_pairs(db_session): resync_cc_pair(cc_pair, db_session=db_session) + db_session.commit() return IdReturn(id=new_search_settings.id) @@ -196,3 +199,27 @@ def update_saved_search_settings( update_current_search_settings( search_settings=search_settings, db_session=db_session ) + + +@router.get("/unstructured-api-key-set") +def unstructured_api_key_set( + _: User | None = Depends(current_admin_user), +) -> bool: + api_key = get_unstructured_api_key() + print(api_key) + return api_key is not None + + +@router.put("/upsert-unstructured-api-key") +def upsert_unstructured_api_key( + unstructured_api_key: str, + _: User | None = Depends(current_admin_user), +) -> None: + update_unstructured_api_key(unstructured_api_key) + + +@router.delete("/delete-unstructured-api-key") +def delete_unstructured_api_key_endpoint( + _: User | None = Depends(current_admin_user), +) -> None: + delete_unstructured_api_key() diff --git a/backend/danswer/server/manage/slack_bot.py b/backend/danswer/server/manage/slack_bot.py index 9a06b225cce..60a7edaaed0 100644 --- a/backend/danswer/server/manage/slack_bot.py +++ b/backend/danswer/server/manage/slack_bot.py @@ -4,53 +4,57 @@ from sqlalchemy.orm import Session from danswer.auth.users import current_admin_user 
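# --- Editor's note: illustrative sketch, not part of the patch ---------------
# The handlers below replace the old "slack-bot config" routes with per-bot
# routes under /manage/admin/slack-app/..., and a channel config is now created
# for one channel of one bot (slack_bot_id + channel_name rather than
# channel_names). A minimal client-side call might look like the sketch below;
# the base URL and admin session are assumptions, only one of document_sets /
# persona_id may be set, and fields unchanged by this diff (e.g. response_type)
# are omitted here.
import requests

BASE_URL = "http://localhost:8080"  # assumed deployment URL
session = requests.Session()        # assumed to carry an admin session cookie

channel_config_payload = {
    "slack_bot_id": 1,          # id returned when the bot was created via /manage/admin/slack-app/bots
    "channel_name": "support",  # single channel per config in the new model
    "document_sets": [3],       # or set "persona_id" instead, never both
    "respond_tag_only": False,
    "respond_to_bots": False,
    "show_continue_in_web_ui": True,
    "enable_auto_filters": False,
}
resp = session.post(
    f"{BASE_URL}/manage/admin/slack-app/channel", json=channel_config_payload
)
resp.raise_for_status()
# -----------------------------------------------------------------------------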
-from danswer.danswerbot.slack.config import validate_channel_names -from danswer.danswerbot.slack.tokens import fetch_tokens -from danswer.danswerbot.slack.tokens import save_tokens +from danswer.danswerbot.slack.config import validate_channel_name from danswer.db.constants import SLACK_BOT_PERSONA_PREFIX from danswer.db.engine import get_session from danswer.db.models import ChannelConfig from danswer.db.models import User from danswer.db.persona import get_persona_by_id -from danswer.db.slack_bot_config import create_slack_bot_persona -from danswer.db.slack_bot_config import fetch_slack_bot_config -from danswer.db.slack_bot_config import fetch_slack_bot_configs -from danswer.db.slack_bot_config import insert_slack_bot_config -from danswer.db.slack_bot_config import remove_slack_bot_config -from danswer.db.slack_bot_config import update_slack_bot_config -from danswer.dynamic_configs.interface import ConfigNotFoundError -from danswer.server.manage.models import SlackBotConfig -from danswer.server.manage.models import SlackBotConfigCreationRequest -from danswer.server.manage.models import SlackBotTokens +from danswer.db.slack_bot import fetch_slack_bot +from danswer.db.slack_bot import fetch_slack_bots +from danswer.db.slack_bot import insert_slack_bot +from danswer.db.slack_bot import remove_slack_bot +from danswer.db.slack_bot import update_slack_bot +from danswer.db.slack_channel_config import create_slack_channel_persona +from danswer.db.slack_channel_config import fetch_slack_channel_config +from danswer.db.slack_channel_config import fetch_slack_channel_configs +from danswer.db.slack_channel_config import insert_slack_channel_config +from danswer.db.slack_channel_config import remove_slack_channel_config +from danswer.db.slack_channel_config import update_slack_channel_config +from danswer.server.manage.models import SlackBot +from danswer.server.manage.models import SlackBotCreationRequest +from danswer.server.manage.models import SlackChannelConfig +from danswer.server.manage.models import SlackChannelConfigCreationRequest router = APIRouter(prefix="/manage") def _form_channel_config( - slack_bot_config_creation_request: SlackBotConfigCreationRequest, - current_slack_bot_config_id: int | None, db_session: Session, + slack_channel_config_creation_request: SlackChannelConfigCreationRequest, + current_slack_channel_config_id: int | None, ) -> ChannelConfig: - raw_channel_names = slack_bot_config_creation_request.channel_names - respond_tag_only = slack_bot_config_creation_request.respond_tag_only + raw_channel_name = slack_channel_config_creation_request.channel_name + respond_tag_only = slack_channel_config_creation_request.respond_tag_only respond_member_group_list = ( - slack_bot_config_creation_request.respond_member_group_list + slack_channel_config_creation_request.respond_member_group_list ) - answer_filters = slack_bot_config_creation_request.answer_filters - follow_up_tags = slack_bot_config_creation_request.follow_up_tags + answer_filters = slack_channel_config_creation_request.answer_filters + follow_up_tags = slack_channel_config_creation_request.follow_up_tags - if not raw_channel_names: + if not raw_channel_name: raise HTTPException( status_code=400, detail="Must provide at least one channel name", ) try: - cleaned_channel_names = validate_channel_names( - channel_names=raw_channel_names, - current_slack_bot_config_id=current_slack_bot_config_id, + cleaned_channel_name = validate_channel_name( db_session=db_session, + channel_name=raw_channel_name, + 
current_slack_channel_config_id=current_slack_channel_config_id, + current_slack_bot_id=slack_channel_config_creation_request.slack_bot_id, ) except ValueError as e: raise HTTPException( @@ -65,7 +69,7 @@ def _form_channel_config( ) channel_config: ChannelConfig = { - "channel_names": cleaned_channel_names, + "channel_name": cleaned_channel_name, } if respond_tag_only is not None: channel_config["respond_tag_only"] = respond_tag_only @@ -76,71 +80,79 @@ def _form_channel_config( if follow_up_tags is not None: channel_config["follow_up_tags"] = follow_up_tags + channel_config[ + "show_continue_in_web_ui" + ] = slack_channel_config_creation_request.show_continue_in_web_ui + channel_config[ "respond_to_bots" - ] = slack_bot_config_creation_request.respond_to_bots + ] = slack_channel_config_creation_request.respond_to_bots return channel_config -@router.post("/admin/slack-bot/config") -def create_slack_bot_config( - slack_bot_config_creation_request: SlackBotConfigCreationRequest, +@router.post("/admin/slack-app/channel") +def create_slack_channel_config( + slack_channel_config_creation_request: SlackChannelConfigCreationRequest, db_session: Session = Depends(get_session), _: User | None = Depends(current_admin_user), -) -> SlackBotConfig: +) -> SlackChannelConfig: channel_config = _form_channel_config( - slack_bot_config_creation_request, None, db_session + db_session=db_session, + slack_channel_config_creation_request=slack_channel_config_creation_request, + current_slack_channel_config_id=None, ) persona_id = None - if slack_bot_config_creation_request.persona_id is not None: - persona_id = slack_bot_config_creation_request.persona_id - elif slack_bot_config_creation_request.document_sets: - persona_id = create_slack_bot_persona( + if slack_channel_config_creation_request.persona_id is not None: + persona_id = slack_channel_config_creation_request.persona_id + elif slack_channel_config_creation_request.document_sets: + persona_id = create_slack_channel_persona( db_session=db_session, - channel_names=channel_config["channel_names"], - document_set_ids=slack_bot_config_creation_request.document_sets, + channel_name=channel_config["channel_name"], + document_set_ids=slack_channel_config_creation_request.document_sets, existing_persona_id=None, ).id - slack_bot_config_model = insert_slack_bot_config( + slack_channel_config_model = insert_slack_channel_config( + slack_bot_id=slack_channel_config_creation_request.slack_bot_id, persona_id=persona_id, channel_config=channel_config, - response_type=slack_bot_config_creation_request.response_type, - # XXX this is going away soon - standard_answer_category_ids=slack_bot_config_creation_request.standard_answer_categories, + response_type=slack_channel_config_creation_request.response_type, + standard_answer_category_ids=slack_channel_config_creation_request.standard_answer_categories, db_session=db_session, - enable_auto_filters=slack_bot_config_creation_request.enable_auto_filters, + enable_auto_filters=slack_channel_config_creation_request.enable_auto_filters, ) - return SlackBotConfig.from_model(slack_bot_config_model) + return SlackChannelConfig.from_model(slack_channel_config_model) -@router.patch("/admin/slack-bot/config/{slack_bot_config_id}") -def patch_slack_bot_config( - slack_bot_config_id: int, - slack_bot_config_creation_request: SlackBotConfigCreationRequest, +@router.patch("/admin/slack-app/channel/{slack_channel_config_id}") +def patch_slack_channel_config( + slack_channel_config_id: int, + slack_channel_config_creation_request: 
SlackChannelConfigCreationRequest, db_session: Session = Depends(get_session), _: User | None = Depends(current_admin_user), -) -> SlackBotConfig: +) -> SlackChannelConfig: channel_config = _form_channel_config( - slack_bot_config_creation_request, slack_bot_config_id, db_session + db_session=db_session, + slack_channel_config_creation_request=slack_channel_config_creation_request, + current_slack_channel_config_id=slack_channel_config_id, ) persona_id = None - if slack_bot_config_creation_request.persona_id is not None: - persona_id = slack_bot_config_creation_request.persona_id - elif slack_bot_config_creation_request.document_sets: - existing_slack_bot_config = fetch_slack_bot_config( - db_session=db_session, slack_bot_config_id=slack_bot_config_id + if slack_channel_config_creation_request.persona_id is not None: + persona_id = slack_channel_config_creation_request.persona_id + elif slack_channel_config_creation_request.document_sets: + existing_slack_channel_config = fetch_slack_channel_config( + db_session=db_session, slack_channel_config_id=slack_channel_config_id ) - if existing_slack_bot_config is None: + if existing_slack_channel_config is None: raise HTTPException( status_code=404, - detail="Slack bot config not found", + detail="Slack channel config not found", ) - existing_persona_id = existing_slack_bot_config.persona_id + existing_persona_id = existing_slack_channel_config.persona_id if existing_persona_id is not None: persona = get_persona_by_id( persona_id=existing_persona_id, @@ -155,62 +167,133 @@ def patch_slack_bot_config( # for this DanswerBot config existing_persona_id = None else: - existing_persona_id = existing_slack_bot_config.persona_id + existing_persona_id = existing_slack_channel_config.persona_id - persona_id = create_slack_bot_persona( + persona_id = create_slack_channel_persona( db_session=db_session, - channel_names=channel_config["channel_names"], - document_set_ids=slack_bot_config_creation_request.document_sets, + channel_name=channel_config["channel_name"], + document_set_ids=slack_channel_config_creation_request.document_sets, existing_persona_id=existing_persona_id, - enable_auto_filters=slack_bot_config_creation_request.enable_auto_filters, + enable_auto_filters=slack_channel_config_creation_request.enable_auto_filters, ).id - slack_bot_config_model = update_slack_bot_config( - slack_bot_config_id=slack_bot_config_id, + slack_channel_config_model = update_slack_channel_config( + db_session=db_session, + slack_channel_config_id=slack_channel_config_id, persona_id=persona_id, channel_config=channel_config, - response_type=slack_bot_config_creation_request.response_type, - standard_answer_category_ids=slack_bot_config_creation_request.standard_answer_categories, - db_session=db_session, - enable_auto_filters=slack_bot_config_creation_request.enable_auto_filters, + response_type=slack_channel_config_creation_request.response_type, + standard_answer_category_ids=slack_channel_config_creation_request.standard_answer_categories, + enable_auto_filters=slack_channel_config_creation_request.enable_auto_filters, ) - return SlackBotConfig.from_model(slack_bot_config_model) + return SlackChannelConfig.from_model(slack_channel_config_model) -@router.delete("/admin/slack-bot/config/{slack_bot_config_id}") -def delete_slack_bot_config( - slack_bot_config_id: int, +@router.delete("/admin/slack-app/channel/{slack_channel_config_id}") +def delete_slack_channel_config( + slack_channel_config_id: int, db_session: Session = Depends(get_session), user: User | None = 
Depends(current_admin_user), ) -> None: - remove_slack_bot_config( - slack_bot_config_id=slack_bot_config_id, user=user, db_session=db_session + remove_slack_channel_config( + db_session=db_session, + slack_channel_config_id=slack_channel_config_id, + user=user, ) -@router.get("/admin/slack-bot/config") -def list_slack_bot_configs( +@router.get("/admin/slack-app/channel") +def list_slack_channel_configs( db_session: Session = Depends(get_session), _: User | None = Depends(current_admin_user), -) -> list[SlackBotConfig]: - slack_bot_config_models = fetch_slack_bot_configs(db_session=db_session) +) -> list[SlackChannelConfig]: + slack_channel_config_models = fetch_slack_channel_configs(db_session=db_session) return [ - SlackBotConfig.from_model(slack_bot_config_model) - for slack_bot_config_model in slack_bot_config_models + SlackChannelConfig.from_model(slack_channel_config_model) + for slack_channel_config_model in slack_channel_config_models ] -@router.put("/admin/slack-bot/tokens") -def put_tokens( - tokens: SlackBotTokens, +@router.post("/admin/slack-app/bots") +def create_bot( + slack_bot_creation_request: SlackBotCreationRequest, + db_session: Session = Depends(get_session), + _: User | None = Depends(current_admin_user), +) -> SlackBot: + slack_bot_model = insert_slack_bot( + db_session=db_session, + name=slack_bot_creation_request.name, + enabled=slack_bot_creation_request.enabled, + bot_token=slack_bot_creation_request.bot_token, + app_token=slack_bot_creation_request.app_token, + ) + return SlackBot.from_model(slack_bot_model) + + +@router.patch("/admin/slack-app/bots/{slack_bot_id}") +def patch_bot( + slack_bot_id: int, + slack_bot_creation_request: SlackBotCreationRequest, + db_session: Session = Depends(get_session), + _: User | None = Depends(current_admin_user), +) -> SlackBot: + slack_bot_model = update_slack_bot( + db_session=db_session, + slack_bot_id=slack_bot_id, + name=slack_bot_creation_request.name, + enabled=slack_bot_creation_request.enabled, + bot_token=slack_bot_creation_request.bot_token, + app_token=slack_bot_creation_request.app_token, + ) + return SlackBot.from_model(slack_bot_model) + + +@router.delete("/admin/slack-app/bots/{slack_bot_id}") +def delete_bot( + slack_bot_id: int, + db_session: Session = Depends(get_session), _: User | None = Depends(current_admin_user), ) -> None: - save_tokens(tokens=tokens) + remove_slack_bot( + db_session=db_session, + slack_bot_id=slack_bot_id, + ) -@router.get("/admin/slack-bot/tokens") -def get_tokens(_: User | None = Depends(current_admin_user)) -> SlackBotTokens: - try: - return fetch_tokens() - except ConfigNotFoundError: - raise HTTPException(status_code=404, detail="No tokens found") +@router.get("/admin/slack-app/bots/{slack_bot_id}") +def get_bot_by_id( + slack_bot_id: int, + db_session: Session = Depends(get_session), + _: User | None = Depends(current_admin_user), +) -> SlackBot: + slack_bot_model = fetch_slack_bot( + db_session=db_session, + slack_bot_id=slack_bot_id, + ) + return SlackBot.from_model(slack_bot_model) + + +@router.get("/admin/slack-app/bots") +def list_bots( + db_session: Session = Depends(get_session), + _: User | None = Depends(current_admin_user), +) -> list[SlackBot]: + slack_bot_models = fetch_slack_bots(db_session=db_session) + return [ + SlackBot.from_model(slack_bot_model) for slack_bot_model in slack_bot_models + ] + + +@router.get("/admin/slack-app/bots/{bot_id}/config") +def list_bot_configs( + bot_id: int, + db_session: Session = Depends(get_session), + _: User | None = 
Depends(current_admin_user), +) -> list[SlackChannelConfig]: + slack_bot_config_models = fetch_slack_channel_configs( + db_session=db_session, slack_bot_id=bot_id + ) + return [ + SlackChannelConfig.from_model(slack_bot_config_model) + for slack_bot_config_model in slack_bot_config_models + ] diff --git a/backend/danswer/server/manage/users.py b/backend/danswer/server/manage/users.py index 96c79b4cbe7..5e4197aaf5c 100644 --- a/backend/danswer/server/manage/users.py +++ b/backend/danswer/server/manage/users.py @@ -2,17 +2,22 @@ from datetime import datetime from datetime import timezone +import jwt +from email_validator import EmailNotValidError +from email_validator import EmailUndeliverableError from email_validator import validate_email from fastapi import APIRouter from fastapi import Body from fastapi import Depends from fastapi import HTTPException -from fastapi import status +from fastapi import Request +from psycopg2.errors import UniqueViolation from pydantic import BaseModel from sqlalchemy import Column from sqlalchemy import desc from sqlalchemy import select from sqlalchemy import update +from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session from danswer.auth.invited_users import get_invited_users @@ -21,31 +26,44 @@ from danswer.auth.noauth_user import set_no_auth_user_preferences from danswer.auth.schemas import UserRole from danswer.auth.schemas import UserStatus +from danswer.auth.users import BasicAuthenticationError from danswer.auth.users import current_admin_user from danswer.auth.users import current_curator_or_admin_user from danswer.auth.users import current_user from danswer.auth.users import optional_user from danswer.configs.app_configs import AUTH_TYPE +from danswer.configs.app_configs import ENABLE_EMAIL_INVITES from danswer.configs.app_configs import SESSION_EXPIRE_TIME_SECONDS +from danswer.configs.app_configs import SUPER_USERS from danswer.configs.app_configs import VALID_EMAIL_DOMAINS from danswer.configs.constants import AuthType +from danswer.db.api_key import is_api_key_email_address +from danswer.db.auth import get_total_users_count +from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR from danswer.db.engine import get_session from danswer.db.models import AccessToken +from danswer.db.models import DocumentSet__User +from danswer.db.models import Persona__User +from danswer.db.models import SamlAccount from danswer.db.models import User +from danswer.db.models import User__UserGroup from danswer.db.users import get_user_by_email from danswer.db.users import list_users -from danswer.dynamic_configs.factory import get_dynamic_config_store +from danswer.db.users import validate_user_role_update +from danswer.key_value_store.factory import get_kv_store from danswer.server.manage.models import AllUsersResponse from danswer.server.manage.models import UserByEmail from danswer.server.manage.models import UserInfo +from danswer.server.manage.models import UserPreferences from danswer.server.manage.models import UserRoleResponse from danswer.server.manage.models import UserRoleUpdateRequest from danswer.server.models import FullUserSnapshot from danswer.server.models import InvitedUserSnapshot from danswer.server.models import MinimalUserSnapshot +from danswer.server.utils import send_user_email_invite from danswer.utils.logger import setup_logger -from ee.danswer.db.api_key import is_api_key_email_address -from ee.danswer.db.user_group import remove_curator_status__no_commit +from danswer.utils.variable_functionality import 
fetch_ee_implementation_or_noop +from shared_configs.configs import MULTI_TENANT logger = setup_logger() @@ -67,25 +85,31 @@ def set_user_role( if not user_to_update: raise HTTPException(status_code=404, detail="User not found") - if user_role_update_request.new_role == UserRole.CURATOR: - raise HTTPException( - status_code=400, - detail="Curator role must be set via the User Group Menu", - ) - - if user_to_update.role == user_role_update_request.new_role: + current_role = user_to_update.role + requested_role = user_role_update_request.new_role + if requested_role == current_role: return - if current_user.id == user_to_update.id: + # This will raise an exception if the role update is invalid + validate_user_role_update( + requested_role=requested_role, + current_role=current_role, + ) + + if user_to_update.id == current_user.id: raise HTTPException( status_code=400, detail="An admin cannot demote themselves from admin role!", ) - if user_to_update.role == UserRole.CURATOR: - remove_curator_status__no_commit(db_session, user_to_update) + if requested_role == UserRole.CURATOR: + # Remove all curator db relationships before changing role + fetch_ee_implementation_or_noop( + "danswer.db.user_group", + "remove_curator_status__no_commit", + )(db_session, user_to_update) - user_to_update.role = user_role_update_request.new_role.value + user_to_update.role = user_role_update_request.new_role db_session.commit() @@ -103,7 +127,7 @@ def list_all_users( users = [ user - for user in list_users(db_session, email_filter_string=q, user=user) + for user in list_users(db_session, email_filter_string=q) if not is_api_key_email_address(user.email) ] accepted_emails = {user.email for user in users} @@ -158,30 +182,105 @@ def list_all_users( def bulk_invite_users( emails: list[str] = Body(..., embed=True), current_user: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> int: """emails are string validated. 
If any email fails validation, no emails are invited and an exception is raised.""" + if current_user is None: raise HTTPException( status_code=400, detail="Auth is disabled, cannot invite users" ) + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() normalized_emails = [] - for email in emails: - email_info = validate_email(email) # can raise EmailNotValidError - normalized_emails.append(email_info.normalized) # type: ignore - all_emails = list(set(normalized_emails) | set(get_invited_users())) - return write_invited_users(all_emails) + try: + for email in emails: + email_info = validate_email(email) + normalized_emails.append(email_info.normalized) # type: ignore + + except (EmailUndeliverableError, EmailNotValidError) as e: + raise HTTPException( + status_code=400, + detail=f"Invalid email address: {email} - {str(e)}", + ) + + if MULTI_TENANT: + try: + fetch_ee_implementation_or_noop( + "danswer.server.tenants.provisioning", "add_users_to_tenant", None + )(normalized_emails, tenant_id) + + except IntegrityError as e: + if isinstance(e.orig, UniqueViolation): + raise HTTPException( + status_code=400, + detail="User has already been invited to a Danswer organization", + ) + raise + except Exception as e: + logger.error(f"Failed to add users to tenant {tenant_id}: {str(e)}") + + initial_invited_users = get_invited_users() + + all_emails = list(set(normalized_emails) | set(initial_invited_users)) + number_of_invited_users = write_invited_users(all_emails) + + if not MULTI_TENANT: + return number_of_invited_users + try: + logger.info("Registering tenant users") + fetch_ee_implementation_or_noop( + "danswer.server.tenants.billing", "register_tenant_users", None + )(CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session)) + if ENABLE_EMAIL_INVITES: + try: + for email in all_emails: + send_user_email_invite(email, current_user) + except Exception as e: + logger.error(f"Error sending email invite to invited users: {e}") + + return number_of_invited_users + except Exception as e: + logger.error(f"Failed to register tenant users: {str(e)}") + logger.info( + "Reverting changes: removing users from tenant and resetting invited users" + ) + write_invited_users(initial_invited_users) # Reset to original state + fetch_ee_implementation_or_noop( + "danswer.server.tenants.user_mapping", "remove_users_from_tenant", None + )(normalized_emails, tenant_id) + raise e @router.patch("/manage/admin/remove-invited-user") def remove_invited_user( user_email: UserByEmail, _: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), ) -> int: user_emails = get_invited_users() remaining_users = [user for user in user_emails if user != user_email.user_email] - return write_invited_users(remaining_users) + + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + fetch_ee_implementation_or_noop( + "danswer.server.tenants.user_mapping", "remove_users_from_tenant", None + )([user_email.user_email], tenant_id) + number_of_invited_users = write_invited_users(remaining_users) + + try: + if MULTI_TENANT: + fetch_ee_implementation_or_noop( + "danswer.server.tenants.billing", "register_tenant_users", None + )(CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session)) + except Exception: + logger.error( + "Request to update number of seats taken in control plane failed. " + "This may cause synchronization issues/out of date enforcement of seat limits." 
+ ) + raise + + return number_of_invited_users @router.patch("/manage/admin/deactivate-user") @@ -237,10 +336,28 @@ async def delete_user( db_session.expunge(user_to_delete) try: - # Delete related OAuthAccounts first for oauth_account in user_to_delete.oauth_accounts: db_session.delete(oauth_account) + fetch_ee_implementation_or_noop( + "danswer.db.external_perm", + "delete_user__ext_group_for_user__no_commit", + )( + db_session=db_session, + user_id=user_to_delete.id, + ) + db_session.query(SamlAccount).filter( + SamlAccount.user_id == user_to_delete.id + ).delete() + db_session.query(DocumentSet__User).filter( + DocumentSet__User.user_id == user_to_delete.id + ).delete() + db_session.query(Persona__User).filter( + Persona__User.user_id == user_to_delete.id + ).delete() + db_session.query(User__UserGroup).filter( + User__UserGroup.user_id == user_to_delete.id + ).delete() db_session.delete(user_to_delete) db_session.commit() @@ -254,6 +371,10 @@ async def delete_user( logger.info(f"Deleted user {user_to_delete.email}") except Exception as e: + import traceback + + full_traceback = traceback.format_exc() + logger.error(f"Full stack trace:\n{full_traceback}") db_session.rollback() logger.error(f"Error deleting user {user_to_delete.email}: {str(e)}") raise HTTPException(status_code=500, detail="Error deleting user") @@ -305,6 +426,35 @@ async def get_user_role(user: User = Depends(current_user)) -> UserRoleResponse: return UserRoleResponse(role=user.role) +def get_current_token_expiration_jwt( + user: User | None, request: Request +) -> datetime | None: + if user is None: + return None + + try: + # Get the JWT from the cookie + jwt_token = request.cookies.get("fastapiusersauth") + if not jwt_token: + logger.error("No JWT token found in cookies") + return None + + # Decode the JWT + decoded_token = jwt.decode(jwt_token, options={"verify_signature": False}) + + # Get the 'exp' (expiration) claim from the token + exp = decoded_token.get("exp") + if exp: + return datetime.fromtimestamp(exp) + else: + logger.error("No 'exp' claim found in JWT") + return None + + except Exception as e: + logger.error(f"Error decoding JWT: {e}") + return None + + def get_current_token_creation( user: User | None, db_session: Session ) -> datetime | None: @@ -338,28 +488,34 @@ def verify_user_logged_in( # NOTE: this does not use `current_user` / `current_admin_user` because we don't want # to enforce user verification here - the frontend always wants to get the info about # the current user regardless of if they are currently verified + if user is None: # if auth type is disabled, return a dummy user with preferences from # the key-value store if AUTH_TYPE == AuthType.DISABLED: - store = get_dynamic_config_store() + store = get_kv_store() return fetch_no_auth_user(store) - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, detail="User Not Authenticated" - ) + raise BasicAuthenticationError(detail="User Not Authenticated") if user.oidc_expiry and user.oidc_expiry < datetime.now(timezone.utc): - raise HTTPException( - status_code=status.HTTP_403_FORBIDDEN, + raise BasicAuthenticationError( detail="Access denied. 
User's OIDC token has expired.", ) - token_created_at = get_current_token_creation(user, db_session) + token_created_at = ( + None if MULTI_TENANT else get_current_token_creation(user, db_session) + ) + organization_name = fetch_ee_implementation_or_noop( + "danswer.server.tenants.user_mapping", "get_tenant_id_for_email", None + )(user.email) + user_info = UserInfo.from_model( user, current_token_created_at=token_created_at, expiry_length=SESSION_EXPIRE_TIME_SECONDS, + is_cloud_superuser=user.email in SUPER_USERS, + organization_name=organization_name, ) return user_info @@ -372,6 +528,59 @@ class ChosenDefaultModelRequest(BaseModel): default_model: str | None = None +class RecentAssistantsRequest(BaseModel): + current_assistant: int + + +def update_recent_assistants( + recent_assistants: list[int] | None, current_assistant: int +) -> list[int]: + if recent_assistants is None: + recent_assistants = [] + else: + recent_assistants = [x for x in recent_assistants if x != current_assistant] + + # Add current assistant to start of list + recent_assistants.insert(0, current_assistant) + + # Keep only the 5 most recent assistants + recent_assistants = recent_assistants[:5] + return recent_assistants + + +@router.patch("/user/recent-assistants") +def update_user_recent_assistants( + request: RecentAssistantsRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> None: + if user is None: + if AUTH_TYPE == AuthType.DISABLED: + store = get_kv_store() + no_auth_user = fetch_no_auth_user(store) + preferences = no_auth_user.preferences + recent_assistants = preferences.recent_assistants + updated_preferences = update_recent_assistants( + recent_assistants, request.current_assistant + ) + preferences.recent_assistants = updated_preferences + set_no_auth_user_preferences(store, preferences) + return + else: + raise RuntimeError("This should never happen") + + recent_assistants = UserInfo.from_model(user).preferences.recent_assistants + updated_recent_assistants = update_recent_assistants( + recent_assistants, request.current_assistant + ) + db_session.execute( + update(User) + .where(User.id == user.id) # type: ignore + .values(recent_assistants=updated_recent_assistants) + ) + db_session.commit() + + @router.patch("/user/default-model") def update_user_default_model( request: ChosenDefaultModelRequest, @@ -380,7 +589,7 @@ def update_user_default_model( ) -> None: if user is None: if AUTH_TYPE == AuthType.DISABLED: - store = get_dynamic_config_store() + store = get_kv_store() no_auth_user = fetch_no_auth_user(store) no_auth_user.preferences.default_model = request.default_model set_no_auth_user_preferences(store, no_auth_user.preferences) @@ -408,8 +617,7 @@ def update_user_assistant_list( ) -> None: if user is None: if AUTH_TYPE == AuthType.DISABLED: - store = get_dynamic_config_store() - + store = get_kv_store() no_auth_user = fetch_no_auth_user(store) no_auth_user.preferences.chosen_assistants = request.chosen_assistants set_no_auth_user_preferences(store, no_auth_user.preferences) @@ -423,3 +631,66 @@ def update_user_assistant_list( .values(chosen_assistants=request.chosen_assistants) ) db_session.commit() + + +def update_assistant_visibility( + preferences: UserPreferences, assistant_id: int, show: bool +) -> UserPreferences: + visible_assistants = preferences.visible_assistants or [] + hidden_assistants = preferences.hidden_assistants or [] + + if show: + if assistant_id not in visible_assistants: + visible_assistants.append(assistant_id) + if 
assistant_id in hidden_assistants: + hidden_assistants.remove(assistant_id) + else: + if assistant_id in visible_assistants: + visible_assistants.remove(assistant_id) + if assistant_id not in hidden_assistants: + hidden_assistants.append(assistant_id) + + preferences.visible_assistants = visible_assistants + preferences.hidden_assistants = hidden_assistants + return preferences + + +@router.patch("/user/assistant-list/update/{assistant_id}") +def update_user_assistant_visibility( + assistant_id: int, + show: bool, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> None: + if user is None: + if AUTH_TYPE == AuthType.DISABLED: + store = get_kv_store() + no_auth_user = fetch_no_auth_user(store) + preferences = no_auth_user.preferences + updated_preferences = update_assistant_visibility( + preferences, assistant_id, show + ) + if updated_preferences.chosen_assistants is not None: + updated_preferences.chosen_assistants.append(assistant_id) + + set_no_auth_user_preferences(store, updated_preferences) + return + else: + raise RuntimeError("This should never happen") + + user_preferences = UserInfo.from_model(user).preferences + updated_preferences = update_assistant_visibility( + user_preferences, assistant_id, show + ) + if updated_preferences.chosen_assistants is not None: + updated_preferences.chosen_assistants.append(assistant_id) + db_session.execute( + update(User) + .where(User.id == user.id) # type: ignore + .values( + hidden_assistants=updated_preferences.hidden_assistants, + visible_assistants=updated_preferences.visible_assistants, + chosen_assistants=updated_preferences.chosen_assistants, + ) + ) + db_session.commit() diff --git a/backend/danswer/server/openai_assistants_api/asssistants_api.py b/backend/danswer/server/openai_assistants_api/asssistants_api.py new file mode 100644 index 00000000000..000944213da --- /dev/null +++ b/backend/danswer/server/openai_assistants_api/asssistants_api.py @@ -0,0 +1,273 @@ +from typing import Any +from typing import Optional +from uuid import uuid4 + +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from fastapi import Query +from pydantic import BaseModel +from sqlalchemy.orm import Session + +from danswer.auth.users import current_user +from danswer.context.search.enums import RecencyBiasSetting +from danswer.db.engine import get_session +from danswer.db.models import Persona +from danswer.db.models import User +from danswer.db.persona import get_persona_by_id +from danswer.db.persona import get_personas +from danswer.db.persona import mark_persona_as_deleted +from danswer.db.persona import upsert_persona +from danswer.db.persona import upsert_prompt +from danswer.db.tools import get_tool_by_name +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +router = APIRouter(prefix="/assistants") + + +# Base models +class AssistantObject(BaseModel): + id: int + object: str = "assistant" + created_at: int + name: Optional[str] = None + description: Optional[str] = None + model: str + instructions: Optional[str] = None + tools: list[dict[str, Any]] + file_ids: list[str] + metadata: Optional[dict[str, Any]] = None + + +class CreateAssistantRequest(BaseModel): + model: str + name: Optional[str] = None + description: Optional[str] = None + instructions: Optional[str] = None + tools: Optional[list[dict[str, Any]]] = None + file_ids: Optional[list[str]] = None + metadata: Optional[dict[str, Any]] = None + + +class 
ModifyAssistantRequest(BaseModel): + model: Optional[str] = None + name: Optional[str] = None + description: Optional[str] = None + instructions: Optional[str] = None + tools: Optional[list[dict[str, Any]]] = None + file_ids: Optional[list[str]] = None + metadata: Optional[dict[str, Any]] = None + + +class DeleteAssistantResponse(BaseModel): + id: int + object: str = "assistant.deleted" + deleted: bool + + +class ListAssistantsResponse(BaseModel): + object: str = "list" + data: list[AssistantObject] + first_id: Optional[int] = None + last_id: Optional[int] = None + has_more: bool + + +def persona_to_assistant(persona: Persona) -> AssistantObject: + return AssistantObject( + id=persona.id, + created_at=0, + name=persona.name, + description=persona.description, + model=persona.llm_model_version_override or "gpt-3.5-turbo", + instructions=persona.prompts[0].system_prompt if persona.prompts else None, + tools=[ + { + "type": tool.display_name, + "function": { + "name": tool.name, + "description": tool.description, + "schema": tool.openapi_schema, + }, + } + for tool in persona.tools + ], + file_ids=[], # Assuming no file support for now + metadata={}, # Assuming no metadata for now + ) + + +# API endpoints +@router.post("") +def create_assistant( + request: CreateAssistantRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> AssistantObject: + prompt = None + if request.instructions: + prompt = upsert_prompt( + user=user, + name=f"Prompt for {request.name or 'New Assistant'}", + description="Auto-generated prompt", + system_prompt=request.instructions, + task_prompt="", + include_citations=True, + datetime_aware=True, + personas=[], + db_session=db_session, + ) + + tool_ids = [] + for tool in request.tools or []: + tool_type = tool.get("type") + if not tool_type: + continue + + try: + tool_db = get_tool_by_name(tool_type, db_session) + tool_ids.append(tool_db.id) + except ValueError: + # Skip tools that don't exist in the database + logger.error(f"Tool {tool_type} not found in database") + raise HTTPException( + status_code=404, detail=f"Tool {tool_type} not found in database" + ) + + persona = upsert_persona( + user=user, + name=request.name or f"Assistant-{uuid4()}", + description=request.description or "", + num_chunks=25, + llm_relevance_filter=True, + llm_filter_extraction=True, + recency_bias=RecencyBiasSetting.AUTO, + llm_model_provider_override=None, + llm_model_version_override=request.model, + starter_messages=None, + is_public=False, + db_session=db_session, + prompt_ids=[prompt.id] if prompt else [0], + document_set_ids=[], + tool_ids=tool_ids, + icon_color=None, + icon_shape=None, + is_visible=True, + ) + + if prompt: + prompt.personas = [persona] + db_session.commit() + + return persona_to_assistant(persona) + + +"" + + +@router.get("/{assistant_id}") +def retrieve_assistant( + assistant_id: int, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> AssistantObject: + try: + persona = get_persona_by_id( + persona_id=assistant_id, + user=user, + db_session=db_session, + is_for_edit=False, + ) + except ValueError: + persona = None + + if not persona: + raise HTTPException(status_code=404, detail="Assistant not found") + return persona_to_assistant(persona) + + +@router.post("/{assistant_id}") +def modify_assistant( + assistant_id: int, + request: ModifyAssistantRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> AssistantObject: + 
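Aside: modify_assistant applies a partial update: Pydantic v2's model_dump(exclude_unset=True) yields only the fields the caller actually sent, and those are copied onto the persona with setattr. A self-contained sketch of that pattern; PersonaRow is a plain stand-in for the real ORM model, not the project's class.

# Sketch of the partial-update pattern: only fields explicitly provided in the
# request overwrite the target object.
from typing import Optional

from pydantic import BaseModel


class ModifyRequest(BaseModel):
    name: Optional[str] = None
    description: Optional[str] = None
    instructions: Optional[str] = None


class PersonaRow:  # stand-in for the ORM model, illustration only
    def __init__(self) -> None:
        self.name = "Original"
        self.description = "Original description"
        self.instructions = "Original instructions"


persona = PersonaRow()
request = ModifyRequest.model_validate({"description": "Updated description"})

# exclude_unset=True returns only the keys the caller actually sent,
# so `name` and `instructions` are left untouched.
for key, value in request.model_dump(exclude_unset=True).items():
    setattr(persona, key, value)

print(persona.name, "|", persona.description)  # Original | Updated description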
persona = get_persona_by_id( + persona_id=assistant_id, + user=user, + db_session=db_session, + is_for_edit=True, + ) + if not persona: + raise HTTPException(status_code=404, detail="Assistant not found") + + update_data = request.model_dump(exclude_unset=True) + for key, value in update_data.items(): + setattr(persona, key, value) + + if "instructions" in update_data and persona.prompts: + persona.prompts[0].system_prompt = update_data["instructions"] + + db_session.commit() + return persona_to_assistant(persona) + + +@router.delete("/{assistant_id}") +def delete_assistant( + assistant_id: int, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> DeleteAssistantResponse: + try: + mark_persona_as_deleted( + persona_id=int(assistant_id), + user=user, + db_session=db_session, + ) + return DeleteAssistantResponse(id=assistant_id, deleted=True) + except ValueError: + raise HTTPException(status_code=404, detail="Assistant not found") + + +@router.get("") +def list_assistants( + limit: int = Query(20, le=100), + order: str = Query("desc", regex="^(asc|desc)$"), + after: Optional[int] = None, + before: Optional[int] = None, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> ListAssistantsResponse: + personas = list( + get_personas( + user=user, + db_session=db_session, + get_editable=False, + joinedload_all=True, + ) + ) + + # Apply filtering based on after and before + if after: + personas = [p for p in personas if p.id > int(after)] + if before: + personas = [p for p in personas if p.id < int(before)] + + # Apply ordering + personas.sort(key=lambda p: p.id, reverse=(order == "desc")) + + # Apply limit + personas = personas[:limit] + + assistants = [persona_to_assistant(p) for p in personas] + + return ListAssistantsResponse( + data=assistants, + first_id=assistants[0].id if assistants else None, + last_id=assistants[-1].id if assistants else None, + has_more=len(personas) == limit, + ) diff --git a/backend/danswer/server/openai_assistants_api/full_openai_assistants_api.py b/backend/danswer/server/openai_assistants_api/full_openai_assistants_api.py new file mode 100644 index 00000000000..2b2fe93e96e --- /dev/null +++ b/backend/danswer/server/openai_assistants_api/full_openai_assistants_api.py @@ -0,0 +1,19 @@ +from fastapi import APIRouter + +from danswer.server.openai_assistants_api.asssistants_api import ( + router as assistants_router, +) +from danswer.server.openai_assistants_api.messages_api import router as messages_router +from danswer.server.openai_assistants_api.runs_api import router as runs_router +from danswer.server.openai_assistants_api.threads_api import router as threads_router + + +def get_full_openai_assistants_api_router() -> APIRouter: + router = APIRouter(prefix="/openai-assistants") + + router.include_router(assistants_router) + router.include_router(runs_router) + router.include_router(threads_router) + router.include_router(messages_router) + + return router diff --git a/backend/danswer/server/openai_assistants_api/messages_api.py b/backend/danswer/server/openai_assistants_api/messages_api.py new file mode 100644 index 00000000000..c28c349f277 --- /dev/null +++ b/backend/danswer/server/openai_assistants_api/messages_api.py @@ -0,0 +1,235 @@ +import uuid +from datetime import datetime +from typing import Any +from typing import Literal +from typing import Optional + +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from pydantic import 
BaseModel +from pydantic import Field +from sqlalchemy.orm import Session + +from danswer.auth.users import current_user +from danswer.configs.constants import MessageType +from danswer.db.chat import create_new_chat_message +from danswer.db.chat import get_chat_message +from danswer.db.chat import get_chat_messages_by_session +from danswer.db.chat import get_chat_session_by_id +from danswer.db.chat import get_or_create_root_message +from danswer.db.engine import get_session +from danswer.db.models import User +from danswer.llm.utils import check_number_of_tokens + +router = APIRouter(prefix="") + + +Role = Literal["user", "assistant"] + + +class MessageContent(BaseModel): + type: Literal["text"] + text: str + + +class Message(BaseModel): + id: str = Field(default_factory=lambda: f"msg_{uuid.uuid4()}") + object: Literal["thread.message"] = "thread.message" + created_at: int = Field(default_factory=lambda: int(datetime.now().timestamp())) + thread_id: str + role: Role + content: list[MessageContent] + file_ids: list[str] = [] + assistant_id: Optional[str] = None + run_id: Optional[str] = None + metadata: Optional[dict[str, Any]] = None # Change this line to use dict[str, Any] + + +class CreateMessageRequest(BaseModel): + role: Role + content: str + file_ids: list[str] = [] + metadata: Optional[dict] = None + + +class ListMessagesResponse(BaseModel): + object: Literal["list"] = "list" + data: list[Message] + first_id: str + last_id: str + has_more: bool + + +@router.post("/threads/{thread_id}/messages") +def create_message( + thread_id: str, + message: CreateMessageRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> Message: + user_id = user.id if user else None + + try: + chat_session = get_chat_session_by_id( + chat_session_id=uuid.UUID(thread_id), + user_id=user_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Chat session not found") + + chat_messages = get_chat_messages_by_session( + chat_session_id=chat_session.id, + user_id=user.id if user else None, + db_session=db_session, + ) + latest_message = ( + chat_messages[-1] + if chat_messages + else get_or_create_root_message(chat_session.id, db_session) + ) + + new_message = create_new_chat_message( + chat_session_id=chat_session.id, + parent_message=latest_message, + message=message.content, + prompt_id=chat_session.persona.prompts[0].id, + token_count=check_number_of_tokens(message.content), + message_type=( + MessageType.USER if message.role == "user" else MessageType.ASSISTANT + ), + db_session=db_session, + ) + + return Message( + id=str(new_message.id), + thread_id=thread_id, + role="user", + content=[MessageContent(type="text", text=message.content)], + file_ids=message.file_ids, + metadata=message.metadata, + ) + + +@router.get("/threads/{thread_id}/messages") +def list_messages( + thread_id: str, + limit: int = 20, + order: Literal["asc", "desc"] = "desc", + after: Optional[str] = None, + before: Optional[str] = None, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> ListMessagesResponse: + user_id = user.id if user else None + + try: + chat_session = get_chat_session_by_id( + chat_session_id=uuid.UUID(thread_id), + user_id=user_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Chat session not found") + + messages = get_chat_messages_by_session( + chat_session_id=chat_session.id, + user_id=user_id, + db_session=db_session, + ) + 
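Aside: both list_assistants and list_messages page the same way: filter against the after/before cursors, sort by id, slice to limit, and report has_more. The endpoints approximate has_more by checking whether the page came back full; the sketch below counts the leftovers instead. A pure-Python sketch with illustrative ids:

# Pure-Python sketch of the after/before/limit paging used by the list endpoints.
def paginate(ids: list[int], limit: int = 20, order: str = "desc",
             after: int | None = None, before: int | None = None) -> tuple[list[int], bool]:
    if after is not None:
        ids = [i for i in ids if i > after]
    if before is not None:
        ids = [i for i in ids if i < before]
    ids = sorted(ids, reverse=(order == "desc"))
    page = ids[:limit]
    has_more = len(ids) > limit  # more rows existed beyond this page
    return page, has_more


print(paginate([1, 2, 3, 4, 5, 6, 7], limit=3, order="asc", after=2))
# ([3, 4, 5], True)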
+ # Apply filtering based on after and before + if after: + messages = [m for m in messages if str(m.id) >= after] + if before: + messages = [m for m in messages if str(m.id) <= before] + + # Apply ordering + messages = sorted(messages, key=lambda m: m.id, reverse=(order == "desc")) + + # Apply limit + messages = messages[:limit] + + data = [ + Message( + id=str(m.id), + thread_id=thread_id, + role="user" if m.message_type == "user" else "assistant", + content=[MessageContent(type="text", text=m.message)], + created_at=int(m.time_sent.timestamp()), + ) + for m in messages + ] + + return ListMessagesResponse( + data=data, + first_id=str(data[0].id) if data else "", + last_id=str(data[-1].id) if data else "", + has_more=len(messages) == limit, + ) + + +@router.get("/threads/{thread_id}/messages/{message_id}") +def retrieve_message( + thread_id: str, + message_id: int, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> Message: + user_id = user.id if user else None + + try: + chat_message = get_chat_message( + chat_message_id=message_id, + user_id=user_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Message not found") + + return Message( + id=str(chat_message.id), + thread_id=thread_id, + role="user" if chat_message.message_type == "user" else "assistant", + content=[MessageContent(type="text", text=chat_message.message)], + created_at=int(chat_message.time_sent.timestamp()), + ) + + +class ModifyMessageRequest(BaseModel): + metadata: dict + + +@router.post("/threads/{thread_id}/messages/{message_id}") +def modify_message( + thread_id: str, + message_id: int, + request: ModifyMessageRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> Message: + user_id = user.id if user else None + + try: + chat_message = get_chat_message( + chat_message_id=message_id, + user_id=user_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Message not found") + + # Update metadata + # TODO: Uncomment this once we have metadata in the chat message + # chat_message.metadata = request.metadata + # db_session.commit() + + return Message( + id=str(chat_message.id), + thread_id=thread_id, + role="user" if chat_message.message_type == "user" else "assistant", + content=[MessageContent(type="text", text=chat_message.message)], + created_at=int(chat_message.time_sent.timestamp()), + metadata=request.metadata, + ) diff --git a/backend/danswer/server/openai_assistants_api/runs_api.py b/backend/danswer/server/openai_assistants_api/runs_api.py new file mode 100644 index 00000000000..44bfaa3aca4 --- /dev/null +++ b/backend/danswer/server/openai_assistants_api/runs_api.py @@ -0,0 +1,344 @@ +from typing import Literal +from typing import Optional +from uuid import UUID + +from fastapi import APIRouter +from fastapi import BackgroundTasks +from fastapi import Depends +from fastapi import HTTPException +from pydantic import BaseModel +from sqlalchemy.orm import Session + +from danswer.auth.users import current_user +from danswer.chat.process_message import stream_chat_message_objects +from danswer.configs.constants import MessageType +from danswer.context.search.models import RetrievalDetails +from danswer.db.chat import create_new_chat_message +from danswer.db.chat import get_chat_message +from danswer.db.chat import get_chat_messages_by_session +from danswer.db.chat import get_chat_session_by_id +from danswer.db.chat import 
get_or_create_root_message +from danswer.db.engine import get_session +from danswer.db.models import ChatMessage +from danswer.db.models import User +from danswer.server.query_and_chat.models import ChatMessageDetail +from danswer.server.query_and_chat.models import CreateChatMessageRequest +from danswer.tools.tool_implementations.search.search_tool import SearchTool +from danswer.utils.logger import setup_logger + + +logger = setup_logger() + + +router = APIRouter() + + +class RunRequest(BaseModel): + assistant_id: int + model: Optional[str] = None + instructions: Optional[str] = None + additional_instructions: Optional[str] = None + tools: Optional[list[dict]] = None + metadata: Optional[dict] = None + + +RunStatus = Literal[ + "queued", + "in_progress", + "requires_action", + "cancelling", + "cancelled", + "failed", + "completed", + "expired", +] + + +class RunResponse(BaseModel): + id: str + object: Literal["thread.run"] + created_at: int + assistant_id: int + thread_id: UUID + status: RunStatus + started_at: Optional[int] = None + expires_at: Optional[int] = None + cancelled_at: Optional[int] = None + failed_at: Optional[int] = None + completed_at: Optional[int] = None + last_error: Optional[dict] = None + model: str + instructions: str + tools: list[dict] + file_ids: list[str] + metadata: Optional[dict] = None + + +def process_run_in_background( + message_id: int, + parent_message_id: int, + chat_session_id: UUID, + assistant_id: int, + instructions: str, + tools: list[dict], + user: User | None, + db_session: Session, +) -> None: + # Get the latest message in the chat session + chat_session = get_chat_session_by_id( + chat_session_id=chat_session_id, + user_id=user.id if user else None, + db_session=db_session, + ) + + search_tool_retrieval_details = RetrievalDetails() + for tool in tools: + if tool["type"] == SearchTool.__name__ and ( + retrieval_details := tool.get("retrieval_details") + ): + search_tool_retrieval_details = RetrievalDetails.model_validate( + retrieval_details + ) + break + + new_msg_req = CreateChatMessageRequest( + chat_session_id=chat_session_id, + parent_message_id=int(parent_message_id) if parent_message_id else None, + message=instructions, + file_descriptors=[], + prompt_id=chat_session.persona.prompts[0].id, + search_doc_ids=None, + retrieval_options=search_tool_retrieval_details, # Adjust as needed + query_override=None, + regenerate=None, + llm_override=None, + prompt_override=None, + alternate_assistant_id=assistant_id, + use_existing_user_message=True, + existing_assistant_message_id=message_id, + ) + + run_message = get_chat_message(message_id, user.id if user else None, db_session) + try: + for packet in stream_chat_message_objects( + new_msg_req=new_msg_req, + user=user, + db_session=db_session, + ): + if isinstance(packet, ChatMessageDetail): + # Update the run status and message content + run_message = get_chat_message( + message_id, user.id if user else None, db_session + ) + if run_message: + # this handles cancelling + if run_message.error: + return + + run_message.message = packet.message + run_message.message_type = MessageType.ASSISTANT + db_session.commit() + except Exception as e: + logger.exception("Error processing run in background") + run_message.error = str(e) + db_session.commit() + return + + db_session.refresh(run_message) + if run_message.token_count == 0: + run_message.error = "No tokens generated" + db_session.commit() + + +@router.post("/threads/{thread_id}/runs") +def create_run( + thread_id: UUID, + run_request: RunRequest, + 
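Aside: create_run responds with a "queued" run immediately and hands the heavy streaming work to process_run_in_background through FastAPI's BackgroundTasks, which executes after the response has been sent. A minimal sketch of that dispatch pattern; the in-memory RUNS dict and demo routes are illustrative stand-ins, not project endpoints.

# Minimal sketch of the queue-then-process pattern: respond immediately,
# let FastAPI run the heavy work after the response.
from uuid import uuid4

from fastapi import BackgroundTasks, FastAPI

app = FastAPI()
RUNS: dict[str, str] = {}  # in-memory stand-in for the chat-message table


def process_run(run_id: str) -> None:
    # Placeholder for the real work (streaming the LLM answer, saving it, etc.)
    RUNS[run_id] = "completed"


@app.post("/demo/runs")
def create_demo_run(background_tasks: BackgroundTasks) -> dict[str, str]:
    run_id = str(uuid4())
    RUNS[run_id] = "queued"
    background_tasks.add_task(process_run, run_id)  # executed after the response
    return {"id": run_id, "status": "queued"}


@app.get("/demo/runs/{run_id}")
def get_demo_run(run_id: str) -> dict[str, str]:
    return {"id": run_id, "status": RUNS.get(run_id, "unknown")}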
background_tasks: BackgroundTasks, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> RunResponse: + try: + chat_session = get_chat_session_by_id( + chat_session_id=thread_id, + user_id=user.id if user else None, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Thread not found") + + chat_messages = get_chat_messages_by_session( + chat_session_id=chat_session.id, + user_id=user.id if user else None, + db_session=db_session, + ) + latest_message = ( + chat_messages[-1] + if chat_messages + else get_or_create_root_message(chat_session.id, db_session) + ) + + # Create a new "run" (chat message) in the session + new_message = create_new_chat_message( + chat_session_id=chat_session.id, + parent_message=latest_message, + message="", + prompt_id=chat_session.persona.prompts[0].id, + token_count=0, + message_type=MessageType.ASSISTANT, + db_session=db_session, + commit=False, + ) + db_session.flush() + latest_message.latest_child_message = new_message.id + db_session.commit() + + # Schedule the background task + background_tasks.add_task( + process_run_in_background, + new_message.id, + latest_message.id, + chat_session.id, + run_request.assistant_id, + run_request.instructions or "", + run_request.tools or [], + user, + db_session, + ) + + return RunResponse( + id=str(new_message.id), + object="thread.run", + created_at=int(new_message.time_sent.timestamp()), + assistant_id=run_request.assistant_id, + thread_id=chat_session.id, + status="queued", + model=run_request.model or "default_model", + instructions=run_request.instructions or "", + tools=run_request.tools or [], + file_ids=[], + metadata=run_request.metadata, + ) + + +@router.get("/threads/{thread_id}/runs/{run_id}") +def retrieve_run( + thread_id: UUID, + run_id: str, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> RunResponse: + # Retrieve the chat message (which represents a "run" in DAnswer) + chat_message = get_chat_message( + chat_message_id=int(run_id), # Convert string run_id to int + user_id=user.id if user else None, + db_session=db_session, + ) + if not chat_message: + raise HTTPException(status_code=404, detail="Run not found") + + chat_session = chat_message.chat_session + + # Map DAnswer status to OpenAI status + run_status: RunStatus = "queued" + if chat_message.message: + run_status = "in_progress" + if chat_message.token_count != 0: + run_status = "completed" + if chat_message.error: + run_status = "cancelled" + + return RunResponse( + id=run_id, + object="thread.run", + created_at=int(chat_message.time_sent.timestamp()), + assistant_id=chat_session.persona_id or 0, + thread_id=chat_session.id, + status=run_status, + started_at=int(chat_message.time_sent.timestamp()), + completed_at=( + int(chat_message.time_sent.timestamp()) if chat_message.message else None + ), + model=chat_session.current_alternate_model or "default_model", + instructions="", # DAnswer doesn't store per-message instructions + tools=[], # DAnswer doesn't have a direct equivalent for tools + file_ids=( + [file["id"] for file in chat_message.files] if chat_message.files else [] + ), + metadata=None, # DAnswer doesn't store metadata for individual messages + ) + + +@router.post("/threads/{thread_id}/runs/{run_id}/cancel") +def cancel_run( + thread_id: UUID, + run_id: str, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> RunResponse: + # In DAnswer, we don't have a 
direct equivalent to cancelling a run + # We'll simulate it by marking the message as "cancelled" + chat_message = ( + db_session.query(ChatMessage).filter(ChatMessage.id == run_id).first() + ) + if not chat_message: + raise HTTPException(status_code=404, detail="Run not found") + + chat_message.error = "Cancelled" + db_session.commit() + + return retrieve_run(thread_id, run_id, user, db_session) + + +@router.get("/threads/{thread_id}/runs") +def list_runs( + thread_id: UUID, + limit: int = 20, + order: Literal["asc", "desc"] = "desc", + after: Optional[str] = None, + before: Optional[str] = None, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> list[RunResponse]: + # In DAnswer, we'll treat each message in a chat session as a "run" + chat_messages = get_chat_messages_by_session( + chat_session_id=thread_id, + user_id=user.id if user else None, + db_session=db_session, + ) + + # Apply pagination + if after: + chat_messages = [msg for msg in chat_messages if str(msg.id) > after] + if before: + chat_messages = [msg for msg in chat_messages if str(msg.id) < before] + + # Apply ordering + chat_messages = sorted( + chat_messages, key=lambda msg: msg.time_sent, reverse=(order == "desc") + ) + + # Apply limit + chat_messages = chat_messages[:limit] + + return [ + retrieve_run(thread_id, str(msg.id), user, db_session) for msg in chat_messages + ] + + +@router.get("/threads/{thread_id}/runs/{run_id}/steps") +def list_run_steps( + run_id: str, + limit: int = 20, + order: Literal["asc", "desc"] = "desc", + after: Optional[str] = None, + before: Optional[str] = None, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> list[dict]: # You may want to create a specific model for run steps + # DAnswer doesn't have an equivalent to run steps + # We'll return an empty list to maintain API compatibility + return [] + + +# Additional helper functions can be added here if needed diff --git a/backend/danswer/server/openai_assistants_api/threads_api.py b/backend/danswer/server/openai_assistants_api/threads_api.py new file mode 100644 index 00000000000..ffc3a3016dc --- /dev/null +++ b/backend/danswer/server/openai_assistants_api/threads_api.py @@ -0,0 +1,156 @@ +from typing import Optional +from uuid import UUID + +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from pydantic import BaseModel +from sqlalchemy.orm import Session + +from danswer.auth.users import current_user +from danswer.db.chat import create_chat_session +from danswer.db.chat import delete_chat_session +from danswer.db.chat import get_chat_session_by_id +from danswer.db.chat import get_chat_sessions_by_user +from danswer.db.chat import update_chat_session +from danswer.db.engine import get_session +from danswer.db.models import User +from danswer.server.query_and_chat.models import ChatSessionDetails +from danswer.server.query_and_chat.models import ChatSessionsResponse + +router = APIRouter(prefix="/threads") + + +# Models +class Thread(BaseModel): + id: UUID + object: str = "thread" + created_at: int + metadata: Optional[dict[str, str]] = None + + +class CreateThreadRequest(BaseModel): + messages: Optional[list[dict]] = None + metadata: Optional[dict[str, str]] = None + + +class ModifyThreadRequest(BaseModel): + metadata: Optional[dict[str, str]] = None + + +# API Endpoints +@router.post("") +def create_thread( + request: CreateThreadRequest, + user: User | None = Depends(current_user), + db_session: 
Session = Depends(get_session), +) -> Thread: + user_id = user.id if user else None + new_chat_session = create_chat_session( + db_session=db_session, + description="", # Leave the naming till later to prevent delay + user_id=user_id, + persona_id=0, + ) + + return Thread( + id=new_chat_session.id, + created_at=int(new_chat_session.time_created.timestamp()), + metadata=request.metadata, + ) + + +@router.get("/{thread_id}") +def retrieve_thread( + thread_id: UUID, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> Thread: + user_id = user.id if user else None + try: + chat_session = get_chat_session_by_id( + chat_session_id=thread_id, + user_id=user_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Thread not found") + + return Thread( + id=chat_session.id, + created_at=int(chat_session.time_created.timestamp()), + metadata=None, # Assuming we don't store metadata in our current implementation + ) + + +@router.post("/{thread_id}") +def modify_thread( + thread_id: UUID, + request: ModifyThreadRequest, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> Thread: + user_id = user.id if user else None + try: + chat_session = update_chat_session( + db_session=db_session, + user_id=user_id, + chat_session_id=thread_id, + description=None, # Not updating description + sharing_status=None, # Not updating sharing status + ) + except ValueError: + raise HTTPException(status_code=404, detail="Thread not found") + + return Thread( + id=chat_session.id, + created_at=int(chat_session.time_created.timestamp()), + metadata=request.metadata, + ) + + +@router.delete("/{thread_id}") +def delete_thread( + thread_id: UUID, + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> dict: + user_id = user.id if user else None + try: + delete_chat_session( + user_id=user_id, + chat_session_id=thread_id, + db_session=db_session, + ) + except ValueError: + raise HTTPException(status_code=404, detail="Thread not found") + + return {"id": str(thread_id), "object": "thread.deleted", "deleted": True} + + +@router.get("") +def list_threads( + user: User | None = Depends(current_user), + db_session: Session = Depends(get_session), +) -> ChatSessionsResponse: + user_id = user.id if user else None + chat_sessions = get_chat_sessions_by_user( + user_id=user_id, + deleted=False, + db_session=db_session, + ) + + return ChatSessionsResponse( + sessions=[ + ChatSessionDetails( + id=chat.id, + name=chat.description, + persona_id=chat.persona_id, + time_created=chat.time_created.isoformat(), + shared_status=chat.shared_status, + folder_id=chat.folder_id, + current_alternate_model=chat.current_alternate_model, + ) + for chat in chat_sessions + ] + ) diff --git a/backend/danswer/server/query_and_chat/chat_backend.py b/backend/danswer/server/query_and_chat/chat_backend.py index e6c5fce29d1..e74c7e0769a 100644 --- a/backend/danswer/server/query_and_chat/chat_backend.py +++ b/backend/danswer/server/query_and_chat/chat_backend.py @@ -1,18 +1,26 @@ import asyncio import io +import json import uuid from collections.abc import Callable from collections.abc import Generator +from typing import Tuple +from uuid import UUID +from danswer.auth.users import current_limited_user from danswer.auth.users import current_user from danswer.chat.chat_utils import create_chat_chain +from danswer.chat.chat_utils import extract_headers from danswer.chat.process_message import 
stream_chat_message from danswer.configs.app_configs import WEB_DOMAIN from danswer.configs.constants import FileOrigin from danswer.configs.constants import MessageType +from danswer.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS +from danswer.db.chat import add_chats_to_session_from_slack_thread from danswer.db.chat import create_chat_session from danswer.db.chat import create_new_chat_message from danswer.db.chat import delete_chat_session +from danswer.db.chat import duplicate_chat_session_for_user_from_slack from danswer.db.chat import get_chat_message from danswer.db.chat import get_chat_messages_by_session from danswer.db.chat import get_chat_session_by_id @@ -38,7 +46,6 @@ from danswer.llm.exceptions import GenAIDisabledException from danswer.llm.factory import get_default_llms from danswer.llm.factory import get_llms_for_persona -from danswer.llm.headers import get_litellm_additional_request_headers from danswer.natural_language_processing.utils import get_tokenizer from danswer.secondary_llm_flows.chat_session_naming import ( get_renamed_conversation_name, @@ -59,6 +66,7 @@ from danswer.server.query_and_chat.models import SearchFeedbackRequest from danswer.server.query_and_chat.models import UpdateChatSessionThreadRequest from danswer.server.query_and_chat.token_limit import check_token_rate_limits +from danswer.utils.headers import get_custom_tool_additional_request_headers from danswer.utils.logger import setup_logger from fastapi import APIRouter from fastapi import Depends @@ -67,9 +75,11 @@ from fastapi import Response from fastapi import UploadFile from fastapi.responses import StreamingResponse +from PIL import Image from pydantic import BaseModel from sqlalchemy.orm import Session + logger = setup_logger() router = APIRouter(prefix="/chat") @@ -125,7 +135,7 @@ def update_chat_session_model( @router.get("/get-chat-session/{session_id}") def get_chat_session( - session_id: int, + session_id: UUID, is_shared: bool = False, user: User | None = Depends(current_user), db_session: Session = Depends(get_session), @@ -225,7 +235,9 @@ def rename_chat_session( try: llm, _ = get_default_llms( - additional_headers=get_litellm_additional_request_headers(request.headers) + additional_headers=extract_headers( + request.headers, LITELLM_PASS_THROUGH_HEADERS + ) ) except GenAIDisabledException: # This may be longer than what the LLM tends to produce but is the most @@ -246,7 +258,7 @@ def rename_chat_session( @router.patch("/chat-session/{session_id}") def patch_chat_session( - session_id: int, + session_id: UUID, chat_session_update_req: ChatSessionUpdateRequest, user: User | None = Depends(current_user), db_session: Session = Depends(get_session), @@ -263,7 +275,7 @@ def patch_chat_session( @router.delete("/delete-chat-session/{session_id}") def delete_chat_session_by_id( - session_id: int, + session_id: UUID, user: User | None = Depends(current_user), db_session: Session = Depends(get_session), ) -> None: @@ -274,13 +286,14 @@ def delete_chat_session_by_id( raise HTTPException(status_code=400, detail=str(e)) -async def is_disconnected(request: Request) -> Callable[[], bool]: +async def is_connected(request: Request) -> Callable[[], bool]: main_loop = asyncio.get_event_loop() - def is_disconnected_sync() -> bool: + def is_connected_sync() -> bool: future = asyncio.run_coroutine_threadsafe(request.is_disconnected(), main_loop) try: - return not future.result(timeout=0.01) + is_connected = not future.result(timeout=0.01) + return is_connected except asyncio.TimeoutError: 
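Aside: the renamed is_connected dependency checks request.is_disconnected() (a coroutine) from synchronous streaming code by scheduling it on the main event loop with asyncio.run_coroutine_threadsafe and waiting a few milliseconds for the result. A stripped-down sketch of that bridge outside FastAPI; the is_disconnected coroutine here is a stand-in that always reports the client as connected.

# Stripped-down sketch of calling a coroutine from synchronous code via
# asyncio.run_coroutine_threadsafe, as the is_connected dependency does.
import asyncio
import concurrent.futures
import threading

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()


async def is_disconnected() -> bool:
    return False  # stand-in for request.is_disconnected()


def is_connected_sync(timeout: float = 0.05) -> bool:
    future = asyncio.run_coroutine_threadsafe(is_disconnected(), loop)
    try:
        return not future.result(timeout=timeout)
    except concurrent.futures.TimeoutError:
        # No answer in time; err on the side of "still connected".
        return True


print(is_connected_sync())  # True
loop.call_soon_threadsafe(loop.stop)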
logger.error("Asyncio timed out") return True @@ -291,24 +304,37 @@ def is_disconnected_sync() -> bool: ) return True - return is_disconnected_sync + return is_connected_sync @router.post("/send-message") def handle_new_chat_message( chat_message_req: CreateChatMessageRequest, request: Request, - user: User | None = Depends(current_user), + user: User | None = Depends(current_limited_user), _: None = Depends(check_token_rate_limits), - is_disconnected_func: Callable[[], bool] = Depends(is_disconnected), + is_connected_func: Callable[[], bool] = Depends(is_connected), ) -> StreamingResponse: - """This endpoint is both used for all the following purposes: + """ + This endpoint is both used for all the following purposes: - Sending a new message in the session - Regenerating a message in the session (just send the same one again) - Editing a message (similar to regenerating but sending a different message) - Kicking off a seeded chat session (set `use_existing_user_message`) - To avoid extra overhead/latency, this assumes (and checks) that previous messages on the path - have already been set as latest""" + + Assumes that previous messages have been set as the latest to minimize overhead. + + Args: + chat_message_req (CreateChatMessageRequest): Details about the new chat message. + request (Request): The current HTTP request context. + user (User | None): The current user, obtained via dependency injection. + _ (None): Rate limit check is run if user/group/global rate limits are enabled. + is_connected_func (Callable[[], bool]): Function to check client disconnection, + used to stop the streaming response if the client disconnects. + + Returns: + StreamingResponse: Streams the response to the new chat message. + """ logger.debug(f"Received new chat message: {chat_message_req.message}") if ( @@ -318,25 +344,28 @@ def handle_new_chat_message( ): raise HTTPException(status_code=400, detail="Empty chat message is invalid") - import json - def stream_generator() -> Generator[str, None, None]: try: for packet in stream_chat_message( new_msg_req=chat_message_req, user=user, - use_existing_user_message=chat_message_req.use_existing_user_message, - litellm_additional_headers=get_litellm_additional_request_headers( + litellm_additional_headers=extract_headers( + request.headers, LITELLM_PASS_THROUGH_HEADERS + ), + custom_tool_additional_headers=get_custom_tool_additional_request_headers( request.headers ), - is_connected=is_disconnected_func, + is_connected=is_connected_func, ): yield json.dumps(packet) if isinstance(packet, dict) else packet except Exception as e: - logger.exception(f"Error in chat message streaming: {e}") + logger.exception("Error in chat message streaming") yield json.dumps({"error": str(e)}) + finally: + logger.debug("Stream generator finished") + return StreamingResponse(stream_generator(), media_type="text/event-stream") @@ -364,7 +393,7 @@ def set_message_as_latest( @router.post("/create-chat-message-feedback") def create_chat_feedback( feedback: ChatFeedbackRequest, - user: User | None = Depends(current_user), + user: User | None = Depends(current_limited_user), db_session: Session = Depends(get_session), ) -> None: user_id = user.id if user else None @@ -504,9 +533,56 @@ def seed_chat( ) +class SeedChatFromSlackRequest(BaseModel): + chat_session_id: UUID + + +class SeedChatFromSlackResponse(BaseModel): + redirect_url: str + + +@router.post("/seed-chat-session-from-slack") +def seed_chat_from_slack( + chat_seed_request: SeedChatFromSlackRequest, + user: User | None = 
Depends(current_user), + db_session: Session = Depends(get_session), +) -> SeedChatFromSlackResponse: + slack_chat_session_id = chat_seed_request.chat_session_id + new_chat_session = duplicate_chat_session_for_user_from_slack( + db_session=db_session, + user=user, + chat_session_id=slack_chat_session_id, + ) + + add_chats_to_session_from_slack_thread( + db_session=db_session, + slack_chat_session_id=slack_chat_session_id, + new_chat_session_id=new_chat_session.id, + ) + + return SeedChatFromSlackResponse( + redirect_url=f"{WEB_DOMAIN}/chat?chatId={new_chat_session.id}" + ) + + """File upload""" +def convert_to_jpeg(file: UploadFile) -> Tuple[io.BytesIO, str]: + try: + with Image.open(file.file) as img: + if img.mode != "RGB": + img = img.convert("RGB") + jpeg_io = io.BytesIO() + img.save(jpeg_io, format="JPEG", quality=85) + jpeg_io.seek(0) + return jpeg_io, "image/jpeg" + except Exception as e: + raise HTTPException( + status_code=400, detail=f"Failed to convert image: {str(e)}" + ) + + @router.post("/file") def upload_files_for_chat( files: list[UploadFile], @@ -514,9 +590,9 @@ def upload_files_for_chat( _: User | None = Depends(current_user), ) -> dict[str, list[FileDescriptor]]: image_content_types = {"image/jpeg", "image/png", "image/webp"} + csv_content_types = {"text/csv"} text_content_types = { "text/plain", - "text/csv", "text/markdown", "text/x-markdown", "text/x-config", @@ -535,8 +611,10 @@ def upload_files_for_chat( "application/epub+zip", } - allowed_content_types = image_content_types.union(text_content_types).union( - document_content_types + allowed_content_types = ( + image_content_types.union(text_content_types) + .union(document_content_types) + .union(csv_content_types) ) for file in files: @@ -546,6 +624,10 @@ def upload_files_for_chat( elif file.content_type in text_content_types: error_detail = "Unsupported text file type. Supported text types include .txt, .csv, .md, .mdx, .conf, " ".log, .tsv." + elif file.content_type in csv_content_types: + error_detail = ( + "Unsupported CSV file type. Supported CSV types include .csv." + ) else: error_detail = ( "Unsupported document file type. 
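Aside: the new convert_to_jpeg helper normalizes every uploaded image to an RGB JPEG before it is stored. A standalone sketch of the same Pillow flow operating on raw bytes (the quality setting matches the diff); the tiny generated PNG at the bottom exists only to exercise the conversion.

# Standalone sketch of the image normalization: open with Pillow, force RGB,
# re-encode as JPEG into an in-memory buffer.
import io

from PIL import Image


def to_jpeg_bytes(image_bytes: bytes, quality: int = 85) -> tuple[io.BytesIO, str]:
    with Image.open(io.BytesIO(image_bytes)) as img:
        if img.mode != "RGB":        # e.g. RGBA PNGs or palette GIFs
            img = img.convert("RGB")
        out = io.BytesIO()
        img.save(out, format="JPEG", quality=quality)
    out.seek(0)
    return out, "image/jpeg"


if __name__ == "__main__":
    src = io.BytesIO()
    Image.new("RGBA", (4, 4), (255, 0, 0, 128)).save(src, format="PNG")
    converted, content_type = to_jpeg_bytes(src.getvalue())
    print(content_type, len(converted.getvalue()), "bytes")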
Supported document types include .pdf, .docx, .pptx, .xlsx, " @@ -569,25 +651,38 @@ def upload_files_for_chat( for file in files: if file.content_type in image_content_types: file_type = ChatFileType.IMAGE + # Convert image to JPEG + file_content, new_content_type = convert_to_jpeg(file) + elif file.content_type in csv_content_types: + file_type = ChatFileType.CSV + file_content = io.BytesIO(file.file.read()) + new_content_type = file.content_type or "" elif file.content_type in document_content_types: file_type = ChatFileType.DOC + file_content = io.BytesIO(file.file.read()) + new_content_type = file.content_type or "" else: file_type = ChatFileType.PLAIN_TEXT + file_content = io.BytesIO(file.file.read()) + new_content_type = file.content_type or "" - # store the raw file + # store the file (now JPEG for images) file_id = str(uuid.uuid4()) file_store.save_file( file_name=file_id, - content=file.file, + content=file_content, display_name=file.filename, file_origin=FileOrigin.CHAT_UPLOAD, - file_type=file.content_type or file_type.value, + file_type=new_content_type or file_type.value, ) # if the file is a doc, extract text and store that so we don't need # to re-extract it every time we send a message if file_type == ChatFileType.DOC: - extracted_text = extract_file_text(file_name=file.filename, file=file.file) + extracted_text = extract_file_text( + file=file.file, + file_name=file.filename or "", + ) text_file_id = str(uuid.uuid4()) file_store.save_file( file_name=text_file_id, diff --git a/backend/danswer/server/query_and_chat/models.py b/backend/danswer/server/query_and_chat/models.py index c9109b141c3..ae6e651fff1 100644 --- a/backend/danswer/server/query_and_chat/models.py +++ b/backend/danswer/server/query_and_chat/models.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import Any +from uuid import UUID from pydantic import BaseModel from pydantic import model_validator @@ -8,15 +9,15 @@ from danswer.configs.constants import DocumentSource from danswer.configs.constants import MessageType from danswer.configs.constants import SearchFeedbackType +from danswer.context.search.models import BaseFilters +from danswer.context.search.models import ChunkContext +from danswer.context.search.models import RetrievalDetails +from danswer.context.search.models import SearchDoc +from danswer.context.search.models import Tag from danswer.db.enums import ChatSessionSharedStatus from danswer.file_store.models import FileDescriptor from danswer.llm.override_models import LLMOverride from danswer.llm.override_models import PromptOverride -from danswer.search.models import BaseFilters -from danswer.search.models import ChunkContext -from danswer.search.models import RetrievalDetails -from danswer.search.models import SearchDoc -from danswer.search.models import Tag from danswer.tools.models import ToolCallFinalResult @@ -28,13 +29,9 @@ class TagResponse(BaseModel): tags: list[SourceTag] -class SimpleQueryRequest(BaseModel): - query: str - - class UpdateChatSessionThreadRequest(BaseModel): # If not specified, use Danswer default persona - chat_session_id: int + chat_session_id: UUID new_alternate_model: str @@ -45,7 +42,7 @@ class ChatSessionCreationRequest(BaseModel): class CreateChatSessionID(BaseModel): - chat_session_id: int + chat_session_id: UUID class ChatFeedbackRequest(BaseModel): @@ -75,7 +72,7 @@ def check_is_positive_or_feedback_text(self) -> "ChatFeedbackRequest": class CreateChatMessageRequest(ChunkContext): """Before creating messages, be sure to create a chat_session and get 
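Aside: CreateChatMessageRequest now carries a UUID chat_session_id and overrides model_dump so the id leaves the model as a plain string for JSON consumers. A tiny sketch of that override in isolation; ChatRequestSketch is an illustrative stand-in, not the real model.

# Sketch of the model_dump override pattern: keep UUID typing on the model,
# but hand plain strings to anything that serializes the dump.
from typing import Any
from uuid import UUID, uuid4

from pydantic import BaseModel


class ChatRequestSketch(BaseModel):
    chat_session_id: UUID
    message: str

    def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]:
        data = super().model_dump(*args, **kwargs)
        data["chat_session_id"] = str(data["chat_session_id"])
        return data


req = ChatRequestSketch(chat_session_id=uuid4(), message="hi")
print(type(req.chat_session_id).__name__)          # UUID on the model itself
print(type(req.model_dump()["chat_session_id"]))   # <class 'str'> in the dump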
an id""" - chat_session_id: int + chat_session_id: UUID # This is the primary-key (unique identifier) for the previous message of the tree parent_message_id: int | None # New message contents @@ -107,6 +104,13 @@ class CreateChatMessageRequest(ChunkContext): # used for seeded chats to kick off the generation of an AI answer use_existing_user_message: bool = False + # used for "OpenAI Assistants API" + existing_assistant_message_id: int | None = None + + # forces the LLM to return a structured response, see + # https://platform.openai.com/docs/guides/structured-outputs/introduction + structured_response_format: dict | None = None + @model_validator(mode="after") def check_search_doc_ids_or_retrieval_options(self) -> "CreateChatMessageRequest": if self.search_doc_ids is None and self.retrieval_options is None: @@ -115,13 +119,18 @@ def check_search_doc_ids_or_retrieval_options(self) -> "CreateChatMessageRequest ) return self + def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]: + data = super().model_dump(*args, **kwargs) + data["chat_session_id"] = str(data["chat_session_id"]) + return data + class ChatMessageIdentifier(BaseModel): message_id: int class ChatRenameRequest(BaseModel): - chat_session_id: int + chat_session_id: UUID name: str | None = None @@ -134,7 +143,7 @@ class RenameChatSessionResponse(BaseModel): class ChatSessionDetails(BaseModel): - id: int + id: UUID name: str persona_id: int | None = None time_created: str @@ -175,10 +184,10 @@ class ChatMessageDetail(BaseModel): overridden_model: str | None alternate_assistant_id: int | None = None # Dict mapping citation number to db_doc_id - chat_session_id: int | None = None + chat_session_id: UUID | None = None citations: dict[int, int] | None = None files: list[FileDescriptor] - tool_calls: list[ToolCallFinalResult] + tool_call: ToolCallFinalResult | None def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore initial_dict = super().model_dump(mode="json", *args, **kwargs) # type: ignore @@ -187,14 +196,14 @@ def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: class SearchSessionDetailResponse(BaseModel): - search_session_id: int + search_session_id: UUID description: str documents: list[SearchDoc] messages: list[ChatMessageDetail] class ChatSessionDetailResponse(BaseModel): - chat_session_id: int + chat_session_id: UUID description: str persona_id: int | None = None persona_name: str | None @@ -204,6 +213,7 @@ class ChatSessionDetailResponse(BaseModel): current_alternate_model: str | None +# This one is not used anymore class QueryValidationResponse(BaseModel): reasoning: str answerable: bool diff --git a/backend/danswer/server/query_and_chat/query_backend.py b/backend/danswer/server/query_and_chat/query_backend.py index 13a4cc7415c..c89772026a7 100644 --- a/backend/danswer/server/query_and_chat/query_backend.py +++ b/backend/danswer/server/query_and_chat/query_backend.py @@ -1,13 +1,21 @@ -from fastapi import APIRouter -from fastapi import Depends -from fastapi import HTTPException -from fastapi.responses import StreamingResponse -from sqlalchemy.orm import Session +import json +from collections.abc import Generator +from uuid import UUID from danswer.auth.users import current_curator_or_admin_user +from danswer.auth.users import current_limited_user from danswer.auth.users import current_user from danswer.configs.constants import DocumentSource from danswer.configs.constants import MessageType +from danswer.context.search.models import 
IndexFilters +from danswer.context.search.models import SearchDoc +from danswer.context.search.preprocessing.access_filters import ( + build_access_filters_for_user, +) +from danswer.context.search.utils import chunks_or_sections_to_search_docs +from danswer.danswerbot.slack.handlers.handle_standard_answers import ( + oneoff_standard_answers, +) from danswer.db.chat import get_chat_messages_by_session from danswer.db.chat import get_chat_session_by_id from danswer.db.chat import get_chat_sessions_by_user @@ -18,33 +26,27 @@ from danswer.db.engine import get_session from danswer.db.models import User from danswer.db.search_settings import get_current_search_settings -from danswer.db.tag import get_tags_by_value_prefix_for_source_types +from danswer.db.tag import find_tags from danswer.document_index.factory import get_default_document_index from danswer.document_index.vespa.index import VespaIndex from danswer.one_shot_answer.answer_question import stream_search_answer from danswer.one_shot_answer.models import DirectQARequest -from danswer.search.models import IndexFilters -from danswer.search.models import SearchDoc -from danswer.search.preprocessing.access_filters import build_access_filters_for_user -from danswer.search.utils import chunks_or_sections_to_search_docs -from danswer.secondary_llm_flows.query_validation import get_query_answerability -from danswer.secondary_llm_flows.query_validation import stream_query_answerability from danswer.server.query_and_chat.models import AdminSearchRequest from danswer.server.query_and_chat.models import AdminSearchResponse from danswer.server.query_and_chat.models import ChatSessionDetails from danswer.server.query_and_chat.models import ChatSessionsResponse -from danswer.server.query_and_chat.models import QueryValidationResponse from danswer.server.query_and_chat.models import SearchSessionDetailResponse -from danswer.server.query_and_chat.models import SimpleQueryRequest from danswer.server.query_and_chat.models import SourceTag from danswer.server.query_and_chat.models import TagResponse from danswer.server.query_and_chat.token_limit import check_token_rate_limits from danswer.utils.logger import setup_logger -from danswer.danswerbot.slack.handlers.handle_standard_answers import ( - oneoff_standard_answers, -) from ee.danswer.server.query_and_chat.models import StandardAnswerRequest from ee.danswer.server.query_and_chat.models import StandardAnswerResponse +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from fastapi.responses import StreamingResponse +from sqlalchemy.orm import Session logger = setup_logger() @@ -104,12 +106,25 @@ def get_tags( if not allow_prefix: raise NotImplementedError("Cannot disable prefix match for now") - db_tags = get_tags_by_value_prefix_for_source_types( - tag_key_prefix=match_pattern, - tag_value_prefix=match_pattern, + key_prefix = match_pattern + value_prefix = match_pattern + require_both_to_match = False + + # split on = to allow the user to type in "author=bob" + EQUAL_PAT = "=" + if match_pattern and EQUAL_PAT in match_pattern: + split_pattern = match_pattern.split(EQUAL_PAT) + key_prefix = split_pattern[0] + value_prefix = EQUAL_PAT.join(split_pattern[1:]) + require_both_to_match = True + + db_tags = find_tags( + tag_key_prefix=key_prefix, + tag_value_prefix=value_prefix, sources=sources, limit=limit, db_session=db_session, + require_both_to_match=require_both_to_match, ) server_tags = [ SourceTag( @@ -120,18 +135,6 @@ def get_tags( return 
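Aside: the reworked /valid-tags lookup lets users type `author=bob`: the pattern is split on the first `=`, the left side becomes the key prefix, the remainder the value prefix, and both must then match. A pure-Python sketch of that parsing step; str.partition is used here instead of the diff's split/join, with the same behavior.

# Pure-Python sketch of the "author=bob" match-pattern parsing used by get_tags.
def parse_tag_pattern(match_pattern: str | None) -> tuple[str | None, str | None, bool]:
    """Return (key_prefix, value_prefix, require_both_to_match)."""
    key_prefix = value_prefix = match_pattern
    require_both = False
    if match_pattern and "=" in match_pattern:
        key_prefix, _, value_prefix = match_pattern.partition("=")
        require_both = True
    return key_prefix, value_prefix, require_both


print(parse_tag_pattern("author=bob"))  # ('author', 'bob', True)
print(parse_tag_pattern("auth"))        # ('auth', 'auth', False)
print(parse_tag_pattern("a=b=c"))       # ('a', 'b=c', True) -- later '=' stays in the value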
TagResponse(tags=server_tags) -@basic_router.post("/query-validation") -def query_validation( - simple_query: SimpleQueryRequest, _: User = Depends(current_user) -) -> QueryValidationResponse: - # Note if weak model prompt is chosen, this check does not occur and will simply return that - # the query is valid, this is because weaker models cannot really handle this task well. - # Additionally, some weak model servers cannot handle concurrent inferences. - logger.notice(f"Validating query: {simple_query.query}") - reasoning, answerable = get_query_answerability(simple_query.query) - return QueryValidationResponse(reasoning=reasoning, answerable=answerable) - - @basic_router.get("/user-searches") def get_user_search_sessions( user: User | None = Depends(current_user), @@ -178,7 +181,7 @@ def get_user_search_sessions( @basic_router.get("/search-session/{session_id}") def get_search_session( - session_id: int, + session_id: UUID, is_shared: bool = False, user: User | None = Depends(current_user), db_session: Session = Depends(get_session), @@ -232,52 +235,27 @@ def get_search_session( return response -# NOTE No longer used, after search/chat redesign. -# No search responses are answered with a conversational generative AI response -@basic_router.post("/stream-query-validation") -def stream_query_validation( - simple_query: SimpleQueryRequest, _: User = Depends(current_user) -) -> StreamingResponse: - # Note if weak model prompt is chosen, this check does not occur and will simply return that - # the query is valid, this is because weaker models cannot really handle this task well. - # Additionally, some weak model servers cannot handle concurrent inferences. - logger.notice(f"Validating query: {simple_query.query}") - return StreamingResponse( - stream_query_answerability(simple_query.query), media_type="application/json" - ) - - @basic_router.post("/stream-answer-with-quote") def get_answer_with_quote( query_request: DirectQARequest, - user: User = Depends(current_user), + user: User = Depends(current_limited_user), _: None = Depends(check_token_rate_limits), ) -> StreamingResponse: query = query_request.messages[0].message logger.notice(f"Received query for one shot answer with quotes: {query}") - packets = stream_search_answer( - query_req=query_request, - user=user, - max_document_tokens=None, - max_history_tokens=0, - ) - return StreamingResponse(packets, media_type="application/json") + def stream_generator() -> Generator[str, None, None]: + try: + for packet in stream_search_answer( + query_req=query_request, + user=user, + max_document_tokens=None, + max_history_tokens=0, + ): + yield json.dumps(packet) if isinstance(packet, dict) else packet + except Exception as e: + logger.exception("Error in search answer streaming") + yield json.dumps({"error": str(e)}) -@basic_router.get("/standard-answer") -def get_standard_answer( - request: StandardAnswerRequest, - db_session: Session = Depends(get_session), - _: User | None = Depends(current_user), -) -> StandardAnswerResponse: - try: - standard_answers = oneoff_standard_answers( - message=request.message, - slack_bot_categories=request.slack_bot_categories, - db_session=db_session, - ) - return StandardAnswerResponse(standard_answers=standard_answers) - except Exception as e: - logger.error(f"Error in get_standard_answer: {str(e)}", exc_info=True) - raise HTTPException(status_code=500, detail="An internal server error occurred") + return StreamingResponse(stream_generator(), media_type="application/json") diff --git 
a/backend/danswer/server/query_and_chat/token_limit.py b/backend/danswer/server/query_and_chat/token_limit.py index 3f5d76bac7f..0f47ef7266f 100644 --- a/backend/danswer/server/query_and_chat/token_limit.py +++ b/backend/danswer/server/query_and_chat/token_limit.py @@ -13,13 +13,15 @@ from danswer.auth.users import current_user from danswer.db.engine import get_session_context_manager +from danswer.db.engine import get_session_with_tenant from danswer.db.models import ChatMessage from danswer.db.models import ChatSession from danswer.db.models import TokenRateLimit from danswer.db.models import User +from danswer.db.token_limit import fetch_all_global_token_rate_limits from danswer.utils.logger import setup_logger from danswer.utils.variable_functionality import fetch_versioned_implementation -from ee.danswer.db.token_limit import fetch_all_global_token_rate_limits +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR logger = setup_logger() @@ -39,11 +41,11 @@ def check_token_rate_limits( versioned_rate_limit_strategy = fetch_versioned_implementation( "danswer.server.query_and_chat.token_limit", "_check_token_rate_limits" ) - return versioned_rate_limit_strategy(user) + return versioned_rate_limit_strategy(user, CURRENT_TENANT_ID_CONTEXTVAR.get()) -def _check_token_rate_limits(_: User | None) -> None: - _user_is_rate_limited_by_global() +def _check_token_rate_limits(_: User | None, tenant_id: str | None) -> None: + _user_is_rate_limited_by_global(tenant_id) """ @@ -51,8 +53,8 @@ def _check_token_rate_limits(_: User | None) -> None: """ -def _user_is_rate_limited_by_global() -> None: - with get_session_context_manager() as db_session: +def _user_is_rate_limited_by_global(tenant_id: str | None) -> None: + with get_session_with_tenant(tenant_id) as db_session: global_rate_limits = fetch_all_global_token_rate_limits( db_session=db_session, enabled_only=True, ordered=False ) diff --git a/backend/danswer/server/settings/api.py b/backend/danswer/server/settings/api.py index 5b8564c3d3a..4f598a18353 100644 --- a/backend/danswer/server/settings/api.py +++ b/backend/danswer/server/settings/api.py @@ -15,12 +15,10 @@ from danswer.db.models import User from danswer.db.notification import create_notification from danswer.db.notification import dismiss_all_notifications -from danswer.db.notification import dismiss_notification -from danswer.db.notification import get_notification_by_id from danswer.db.notification import get_notifications from danswer.db.notification import update_notification_last_shown -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.server.settings.models import Notification from danswer.server.settings.models import Settings from danswer.server.settings.models import UserSettings @@ -55,79 +53,70 @@ def fetch_settings( """Settings and notifications are stuffed into this single endpoint to reduce number of Postgres calls""" general_settings = load_settings() - user_notifications = get_user_notifications(user, db_session) + settings_notifications = get_settings_notifications(user, db_session) try: - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() needs_reindexing = cast(bool, kv_store.load(KV_REINDEX_KEY)) - except ConfigNotFoundError: + except KvKeyNotFoundError: needs_reindexing = False return UserSettings( 
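Aside: fetch_settings and the notification logic read a simple boolean flag out of the key-value store and treat a missing key (KvKeyNotFoundError) as "no reindex needed". A toy sketch of that flag-read pattern; InMemoryKvStore is an illustrative stand-in for whatever get_kv_store() returns in the project.

# Toy sketch of the "boolean flag in a key-value store" pattern used for the
# reindex notification. InMemoryKvStore is an illustrative stand-in.
from typing import Any, cast

KV_REINDEX_KEY = "needs_reindexing"  # mirrors the constant name used in the diff


class KvKeyNotFoundError(Exception):
    """Raised when a key has never been written."""


class InMemoryKvStore:
    def __init__(self) -> None:
        self._data: dict[str, Any] = {}

    def store(self, key: str, value: Any) -> None:
        self._data[key] = value

    def load(self, key: str) -> Any:
        if key not in self._data:
            raise KvKeyNotFoundError(key)
        return self._data[key]


def needs_reindexing(kv_store: InMemoryKvStore) -> bool:
    try:
        return cast(bool, kv_store.load(KV_REINDEX_KEY))
    except KvKeyNotFoundError:
        # Missing flag: safer to assume no reindex than to start a heavy job.
        return False


kv = InMemoryKvStore()
print(needs_reindexing(kv))        # False -- flag never set
kv.store(KV_REINDEX_KEY, True)
print(needs_reindexing(kv))        # True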
**general_settings.model_dump(), - notifications=user_notifications, + notifications=settings_notifications, needs_reindexing=needs_reindexing, ) -@basic_router.post("/notifications/{notification_id}/dismiss") -def dismiss_notification_endpoint( - notification_id: int, - user: User | None = Depends(current_user), - db_session: Session = Depends(get_session), -) -> None: - try: - notification = get_notification_by_id(notification_id, user, db_session) - except PermissionError: - raise HTTPException( - status_code=403, detail="Not authorized to dismiss this notification" - ) - except ValueError: - raise HTTPException(status_code=404, detail="Notification not found") - - dismiss_notification(notification, db_session) - - -def get_user_notifications( +def get_settings_notifications( user: User | None, db_session: Session ) -> list[Notification]: - """Get notifications for the user, currently the logic is very specific to the reindexing flag""" + """Get notifications for settings page, including product gating and reindex notifications""" + # Check for product gating notification + product_notif = get_notifications( + user=None, + notif_type=NotificationType.TRIAL_ENDS_TWO_DAYS, + db_session=db_session, + ) + notifications = [Notification.from_model(product_notif[0])] if product_notif else [] + + # Only show reindex notifications to admins is_admin = is_user_admin(user) if not is_admin: - # Reindexing flag should only be shown to admins, basic users can't trigger it anyway - return [] + return notifications - kv_store = get_dynamic_config_store() + # Check if reindexing is needed + kv_store = get_kv_store() try: needs_index = cast(bool, kv_store.load(KV_REINDEX_KEY)) if not needs_index: dismiss_all_notifications( notif_type=NotificationType.REINDEX, db_session=db_session ) - return [] - except ConfigNotFoundError: + return notifications + except KvKeyNotFoundError: # If something goes wrong and the flag is gone, better to not start a reindexing # it's a heavyweight long running job and maybe this flag is cleaned up later logger.warning("Could not find reindex flag") - return [] + return notifications try: # Need a transaction in order to prevent under-counting current notifications - db_session.begin() - reindex_notifs = get_notifications( user=user, notif_type=NotificationType.REINDEX, db_session=db_session ) if not reindex_notifs: notif = create_notification( - user=user, + user_id=user.id if user else None, notif_type=NotificationType.REINDEX, db_session=db_session, ) db_session.flush() db_session.commit() - return [Notification.from_model(notif)] + + notifications.append(Notification.from_model(notif)) + return notifications if len(reindex_notifs) > 1: logger.error("User has multiple reindex notifications") @@ -138,8 +127,9 @@ def get_user_notifications( ) db_session.commit() - return [Notification.from_model(reindex_notif)] + notifications.append(Notification.from_model(reindex_notif)) + return notifications except SQLAlchemyError: logger.exception("Error while processing notifications") db_session.rollback() - return [] + return notifications diff --git a/backend/danswer/server/settings/models.py b/backend/danswer/server/settings/models.py index ae7e7236c8d..af93595501d 100644 --- a/backend/danswer/server/settings/models.py +++ b/backend/danswer/server/settings/models.py @@ -12,12 +12,19 @@ class PageType(str, Enum): SEARCH = "search" +class GatingType(str, Enum): + FULL = "full" # Complete restriction of access to the product or service + PARTIAL = "partial" # Full access but warning (no 
credit card on file) + NONE = "none" # No restrictions, full access to all features + + class Notification(BaseModel): id: int notif_type: NotificationType dismissed: bool last_shown: datetime first_shown: datetime + additional_data: dict | None = None @classmethod def from_model(cls, notif: NotificationDBModel) -> "Notification": @@ -27,6 +34,7 @@ def from_model(cls, notif: NotificationDBModel) -> "Notification": dismissed=notif.dismissed, last_shown=notif.last_shown, first_shown=notif.first_shown, + additional_data=notif.additional_data, ) @@ -38,6 +46,7 @@ class Settings(BaseModel): default_page: PageType = PageType.SEARCH maximum_chat_retention_days: int | None = None gpu_enabled: bool | None = None + product_gating: GatingType = GatingType.NONE def check_validity(self) -> None: chat_page_enabled = self.chat_page_enabled diff --git a/backend/danswer/server/settings/store.py b/backend/danswer/server/settings/store.py index 6f2872f40f9..c3875c6aecb 100644 --- a/backend/danswer/server/settings/store.py +++ b/backend/danswer/server/settings/store.py @@ -1,16 +1,16 @@ from typing import cast from danswer.configs.constants import KV_SETTINGS_KEY -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.server.settings.models import Settings def load_settings() -> Settings: - dynamic_config_store = get_dynamic_config_store() + dynamic_config_store = get_kv_store() try: settings = Settings(**cast(dict, dynamic_config_store.load(KV_SETTINGS_KEY))) - except ConfigNotFoundError: + except KvKeyNotFoundError: settings = Settings() dynamic_config_store.store(KV_SETTINGS_KEY, settings.model_dump()) @@ -18,4 +18,4 @@ def load_settings() -> Settings: def store_settings(settings: Settings) -> None: - get_dynamic_config_store().store(KV_SETTINGS_KEY, settings.model_dump()) + get_kv_store().store(KV_SETTINGS_KEY, settings.model_dump()) diff --git a/backend/danswer/server/token_rate_limits/api.py b/backend/danswer/server/token_rate_limits/api.py index 245e3391410..16755e06e7d 100644 --- a/backend/danswer/server/token_rate_limits/api.py +++ b/backend/danswer/server/token_rate_limits/api.py @@ -5,13 +5,13 @@ from danswer.auth.users import current_admin_user from danswer.db.engine import get_session from danswer.db.models import User +from danswer.db.token_limit import delete_token_rate_limit +from danswer.db.token_limit import fetch_all_global_token_rate_limits +from danswer.db.token_limit import insert_global_token_rate_limit +from danswer.db.token_limit import update_token_rate_limit from danswer.server.query_and_chat.token_limit import any_rate_limit_exists from danswer.server.token_rate_limits.models import TokenRateLimitArgs from danswer.server.token_rate_limits.models import TokenRateLimitDisplay -from ee.danswer.db.token_limit import delete_token_rate_limit -from ee.danswer.db.token_limit import fetch_all_global_token_rate_limits -from ee.danswer.db.token_limit import insert_global_token_rate_limit -from ee.danswer.db.token_limit import update_token_rate_limit router = APIRouter(prefix="/admin/token-rate-limits") diff --git a/backend/danswer/server/utils.py b/backend/danswer/server/utils.py index 53ed5b426ba..68e6dc8d0b8 100644 --- a/backend/danswer/server/utils.py +++ b/backend/danswer/server/utils.py @@ -1,7 +1,18 @@ import json +import smtplib from datetime import datetime +from 
email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText +from textwrap import dedent from typing import Any +from danswer.configs.app_configs import SMTP_PASS +from danswer.configs.app_configs import SMTP_PORT +from danswer.configs.app_configs import SMTP_SERVER +from danswer.configs.app_configs import SMTP_USER +from danswer.configs.app_configs import WEB_DOMAIN +from danswer.db.models import User + class DateTimeEncoder(json.JSONEncoder): """Custom JSON encoder that converts datetime objects to ISO format strings.""" @@ -43,3 +54,31 @@ def mask_credential_dict(credential_dict: dict[str, Any]) -> dict[str, str]: masked_creds[key] = mask_string(val) return masked_creds + + +def send_user_email_invite(user_email: str, current_user: User) -> None: + msg = MIMEMultipart() + msg["Subject"] = "Invitation to Join Danswer Workspace" + msg["From"] = current_user.email + msg["To"] = user_email + + email_body = dedent( + f"""\ + Hello, + + You have been invited to join a workspace on Danswer. + + To join the workspace, please visit the following link: + + {WEB_DOMAIN}/auth/login + + Best regards, + The Danswer Team + """ + ) + + msg.attach(MIMEText(email_body, "plain")) + with smtplib.SMTP(SMTP_SERVER, SMTP_PORT) as smtp_server: + smtp_server.starttls() + smtp_server.login(SMTP_USER, SMTP_PASS) + smtp_server.send_message(msg) diff --git a/backend/danswer/setup.py b/backend/danswer/setup.py new file mode 100644 index 00000000000..99173821a45 --- /dev/null +++ b/backend/danswer/setup.py @@ -0,0 +1,360 @@ +import time + +from sqlalchemy.orm import Session + +from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP +from danswer.configs.app_configs import MANAGED_VESPA +from danswer.configs.constants import KV_REINDEX_KEY +from danswer.configs.constants import KV_SEARCH_SETTINGS +from danswer.configs.model_configs import FAST_GEN_AI_MODEL_VERSION +from danswer.configs.model_configs import GEN_AI_API_KEY +from danswer.configs.model_configs import GEN_AI_MODEL_VERSION +from danswer.context.search.models import SavedSearchSettings +from danswer.context.search.retrieval.search_runner import download_nltk_data +from danswer.db.connector import check_connectors_exist +from danswer.db.connector import create_initial_default_connector +from danswer.db.connector_credential_pair import associate_default_cc_pair +from danswer.db.connector_credential_pair import get_connector_credential_pairs +from danswer.db.connector_credential_pair import resync_cc_pair +from danswer.db.credentials import create_initial_public_credential +from danswer.db.document import check_docs_exist +from danswer.db.index_attempt import cancel_indexing_attempts_past_model +from danswer.db.index_attempt import expire_index_attempts +from danswer.db.llm import fetch_default_provider +from danswer.db.llm import update_default_provider +from danswer.db.llm import upsert_llm_provider +from danswer.db.persona import delete_old_default_personas +from danswer.db.search_settings import get_current_search_settings +from danswer.db.search_settings import get_secondary_search_settings +from danswer.db.search_settings import update_current_search_settings +from danswer.db.search_settings import update_secondary_search_settings +from danswer.db.swap_index import check_index_swap +from danswer.document_index.factory import get_default_document_index +from danswer.document_index.interfaces import DocumentIndex +from danswer.document_index.vespa.index import VespaIndex +from danswer.indexing.models import IndexingSetting 
+from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError +from danswer.natural_language_processing.search_nlp_models import EmbeddingModel +from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder +from danswer.natural_language_processing.search_nlp_models import warm_up_cross_encoder +from danswer.seeding.load_docs import seed_initial_documents +from danswer.seeding.load_yamls import load_chat_yamls +from danswer.server.manage.llm.models import LLMProviderUpsertRequest +from danswer.server.settings.store import load_settings +from danswer.server.settings.store import store_settings +from danswer.tools.built_in_tools import auto_add_search_tool_to_personas +from danswer.tools.built_in_tools import load_builtin_tools +from danswer.tools.built_in_tools import refresh_built_in_tools_cache +from danswer.utils.gpu_utils import gpu_status_request +from danswer.utils.logger import setup_logger +from shared_configs.configs import ALT_INDEX_SUFFIX +from shared_configs.configs import MODEL_SERVER_HOST +from shared_configs.configs import MODEL_SERVER_PORT +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS +from shared_configs.model_server_models import SupportedEmbeddingModel + + +logger = setup_logger() + + +def setup_danswer( + db_session: Session, tenant_id: str | None, cohere_enabled: bool = False +) -> None: + """ + Set up Danswer for a particular tenant. In the single-tenant case, it will be set up for the default schema + on server startup. In the multi-tenant case, it will be called when the tenant is created. + + The Tenant Service calls the tenants/create endpoint, which runs this. + """ + check_index_swap(db_session=db_session) + search_settings = get_current_search_settings(db_session) + secondary_search_settings = get_secondary_search_settings(db_session) + + # Break out of a bad state caused by thrashing indexes + if secondary_search_settings and DISABLE_INDEX_UPDATE_ON_SWAP: + expire_index_attempts( + search_settings_id=search_settings.id, db_session=db_session + ) + + for cc_pair in get_connector_credential_pairs(db_session): + resync_cc_pair(cc_pair, db_session=db_session) + + # Expire all indexing attempts for old embedding models (technically redundant) + cancel_indexing_attempts_past_model(db_session) + + logger.notice(f'Using Embedding model: "{search_settings.model_name}"') + if search_settings.query_prefix or search_settings.passage_prefix: + logger.notice(f'Query embedding prefix: "{search_settings.query_prefix}"') + logger.notice(f'Passage embedding prefix: "{search_settings.passage_prefix}"') + + if search_settings: + if not search_settings.disable_rerank_for_streaming: + logger.notice("Reranking is enabled.") + + if search_settings.multilingual_expansion: + logger.notice( + f"Multilingual query expansion is enabled with {search_settings.multilingual_expansion}." + ) + if ( + search_settings.rerank_model_name + and not search_settings.provider_type + and not search_settings.rerank_provider_type + ): + warm_up_cross_encoder(search_settings.rerank_model_name) + + logger.notice("Verifying query preprocessing (NLTK) data is downloaded") + download_nltk_data() + + # Set up Postgres with the default credential, LLM providers, etc.
+ setup_postgres(db_session) + + translate_saved_search_settings(db_session) + + # Mark in the KV store whether the user needs to trigger a reindexing to bring + # the document index into a good state + if not MULTI_TENANT: + mark_reindex_flag(db_session) + + # Ensure Vespa is set up correctly; this step is relatively near the end because Vespa + # takes a bit of time to start up + logger.notice("Verifying Document Index(es) are available.") + document_index = get_default_document_index( + primary_index_name=search_settings.index_name, + secondary_index_name=secondary_search_settings.index_name + if secondary_search_settings + else None, + ) + + success = setup_vespa( + document_index, + IndexingSetting.from_db_model(search_settings), + IndexingSetting.from_db_model(secondary_search_settings) + if secondary_search_settings + else None, + ) + if not success: + raise RuntimeError("Could not connect to Vespa within the specified timeout.") + + logger.notice(f"Model Server: http://{MODEL_SERVER_HOST}:{MODEL_SERVER_PORT}") + if search_settings.provider_type is None: + warm_up_bi_encoder( + embedding_model=EmbeddingModel.from_db_model( + search_settings=search_settings, + server_host=MODEL_SERVER_HOST, + server_port=MODEL_SERVER_PORT, + ), + ) + + # update multipass indexing setting based on GPU availability + update_default_multipass_indexing(db_session) + + seed_initial_documents(db_session, tenant_id, cohere_enabled) + + +def translate_saved_search_settings(db_session: Session) -> None: + kv_store = get_kv_store() + + try: + search_settings_dict = kv_store.load(KV_SEARCH_SETTINGS) + if isinstance(search_settings_dict, dict): + # Update current search settings + current_settings = get_current_search_settings(db_session) + + # Update non-preserved fields + if current_settings: + current_settings_dict = SavedSearchSettings.from_db_model( + current_settings + ).dict() + + new_current_settings = SavedSearchSettings( + **{**current_settings_dict, **search_settings_dict} + ) + update_current_search_settings(db_session, new_current_settings) + + # Update secondary search settings + secondary_settings = get_secondary_search_settings(db_session) + if secondary_settings: + secondary_settings_dict = SavedSearchSettings.from_db_model( + secondary_settings + ).dict() + + new_secondary_settings = SavedSearchSettings( + **{**secondary_settings_dict, **search_settings_dict} + ) + update_secondary_search_settings( + db_session, + new_secondary_settings, + ) + # Delete the KV store entry after successful update + kv_store.delete(KV_SEARCH_SETTINGS) + logger.notice("Search settings updated and KV store entry deleted.") + else: + logger.notice("KV store search settings entry is empty.") + except KvKeyNotFoundError: + logger.notice("No search config found in KV store.") + + +def mark_reindex_flag(db_session: Session) -> None: + kv_store = get_kv_store() + try: + value = kv_store.load(KV_REINDEX_KEY) + logger.debug(f"Re-indexing flag has value {value}") + return + except KvKeyNotFoundError: + # Only need to update the flag if it hasn't been set + pass + + # If their first deployment is on a version after these changes, the flag will be + # enabled when the other changes go in. We need to avoid setting this to False + # and then having the user index things on the old version + docs_exist = check_docs_exist(db_session) + connectors_exist = check_connectors_exist(db_session) + if docs_exist or connectors_exist: + kv_store.store(KV_REINDEX_KEY, True) + else: + kv_store.store(KV_REINDEX_KEY, False) + + +def setup_vespa( + document_index: DocumentIndex,
+ index_setting: IndexingSetting, + secondary_index_setting: IndexingSetting | None, +) -> bool: + # Vespa startup is a bit slow, so give it a few seconds + WAIT_SECONDS = 5 + VESPA_ATTEMPTS = 5 + for x in range(VESPA_ATTEMPTS): + try: + logger.notice(f"Setting up Vespa (attempt {x+1}/{VESPA_ATTEMPTS})...") + document_index.ensure_indices_exist( + index_embedding_dim=index_setting.model_dim, + secondary_index_embedding_dim=secondary_index_setting.model_dim + if secondary_index_setting + else None, + ) + + logger.notice("Vespa setup complete.") + return True + except Exception: + logger.notice( + f"Vespa setup did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds." + ) + time.sleep(WAIT_SECONDS) + + logger.error( + f"Vespa setup did not succeed. Attempt limit reached. ({VESPA_ATTEMPTS})" + ) + return False + + +def setup_postgres(db_session: Session) -> None: + logger.notice("Verifying default connector/credential exist.") + create_initial_public_credential(db_session) + create_initial_default_connector(db_session) + associate_default_cc_pair(db_session) + + logger.notice("Loading default Prompts and Personas") + delete_old_default_personas(db_session) + load_chat_yamls(db_session) + + logger.notice("Loading built-in tools") + load_builtin_tools(db_session) + refresh_built_in_tools_cache(db_session) + auto_add_search_tool_to_personas(db_session) + + if GEN_AI_API_KEY and fetch_default_provider(db_session) is None: + # Only for dev flows + logger.notice("Setting up default OpenAI LLM for dev.") + llm_model = GEN_AI_MODEL_VERSION or "gpt-4o-mini" + fast_model = FAST_GEN_AI_MODEL_VERSION or "gpt-4o-mini" + model_req = LLMProviderUpsertRequest( + name="DevEnvPresetOpenAI", + provider="openai", + api_key=GEN_AI_API_KEY, + api_base=None, + api_version=None, + custom_config=None, + default_model_name=llm_model, + fast_default_model_name=fast_model, + is_public=True, + groups=[], + display_model_names=[llm_model, fast_model], + model_names=[llm_model, fast_model], + ) + new_llm_provider = upsert_llm_provider( + llm_provider=model_req, db_session=db_session + ) + update_default_provider(provider_id=new_llm_provider.id, db_session=db_session) + + +def update_default_multipass_indexing(db_session: Session) -> None: + docs_exist = check_docs_exist(db_session) + connectors_exist = check_connectors_exist(db_session) + logger.debug(f"Docs exist: {docs_exist}, Connectors exist: {connectors_exist}") + + if not docs_exist and not connectors_exist: + logger.info( + "No existing docs or connectors found. Checking GPU availability for multipass indexing." + ) + gpu_available = gpu_status_request() + logger.info(f"GPU available: {gpu_available}") + + current_settings = get_current_search_settings(db_session) + + logger.notice(f"Updating multipass indexing setting to: {gpu_available}") + updated_settings = SavedSearchSettings.from_db_model(current_settings) + # Enable multipass indexing if GPU is available or if using a cloud provider + updated_settings.multipass_indexing = ( + gpu_available or current_settings.cloud_provider is not None + ) + update_current_search_settings(db_session, updated_settings) + + # Update settings with GPU availability + settings = load_settings() + settings.gpu_enabled = gpu_available + store_settings(settings) + logger.notice(f"Updated settings with GPU availability: {gpu_available}") + + else: + logger.debug( + "Existing docs or connectors found. Skipping multipass indexing update." 
+ ) + + +def setup_multitenant_danswer() -> None: + # For Managed Vespa, the schema is sent over via the Vespa Console manually. + if not MANAGED_VESPA: + setup_vespa_multitenant(SUPPORTED_EMBEDDING_MODELS) + + +def setup_vespa_multitenant(supported_indices: list[SupportedEmbeddingModel]) -> bool: + # This is for local testing + WAIT_SECONDS = 5 + VESPA_ATTEMPTS = 5 + for x in range(VESPA_ATTEMPTS): + try: + logger.notice(f"Setting up Vespa (attempt {x+1}/{VESPA_ATTEMPTS})...") + VespaIndex.register_multitenant_indices( + indices=[index.index_name for index in supported_indices] + + [ + f"{index.index_name}{ALT_INDEX_SUFFIX}" + for index in supported_indices + ], + embedding_dims=[index.dim for index in supported_indices] + + [index.dim for index in supported_indices], + ) + + logger.notice("Vespa setup complete.") + return True + except Exception: + logger.notice( + f"Vespa setup did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds." + ) + time.sleep(WAIT_SECONDS) + + logger.error( + f"Vespa setup did not succeed. Attempt limit reached. ({VESPA_ATTEMPTS})" + ) + return False diff --git a/backend/danswer/tools/base_tool.py b/backend/danswer/tools/base_tool.py new file mode 100644 index 00000000000..73902504462 --- /dev/null +++ b/backend/danswer/tools/base_tool.py @@ -0,0 +1,59 @@ +from typing import cast +from typing import TYPE_CHECKING + +from langchain_core.messages import HumanMessage + +from danswer.llm.utils import message_to_prompt_and_imgs +from danswer.tools.tool import Tool + +if TYPE_CHECKING: + from danswer.llm.answering.prompts.build import AnswerPromptBuilder + from danswer.tools.tool_implementations.custom.custom_tool import ( + CustomToolCallSummary, + ) + from danswer.tools.message import ToolCallSummary + from danswer.tools.models import ToolResponse + + +def build_user_message_for_non_tool_calling_llm( + message: HumanMessage, + tool_name: str, + *args: "ToolResponse", +) -> str: + query, _ = message_to_prompt_and_imgs(message) + + tool_run_summary = cast("CustomToolCallSummary", args[0].response).tool_result + return f""" +Here's the result from the {tool_name} tool: + +{tool_run_summary} + +Now respond to the following: + +{query} +""".strip() + + +class BaseTool(Tool): + def build_next_prompt( + self, + prompt_builder: "AnswerPromptBuilder", + tool_call_summary: "ToolCallSummary", + tool_responses: list["ToolResponse"], + using_tool_calling_llm: bool, + ) -> "AnswerPromptBuilder": + if using_tool_calling_llm: + prompt_builder.append_message(tool_call_summary.tool_call_request) + prompt_builder.append_message(tool_call_summary.tool_call_result) + else: + prompt_builder.update_user_prompt( + HumanMessage( + content=build_user_message_for_non_tool_calling_llm( + prompt_builder.user_message_and_token_cnt[0], + self.name, + *tool_responses, + ) + ) + ) + + return prompt_builder diff --git a/backend/danswer/tools/built_in_tools.py b/backend/danswer/tools/built_in_tools.py index 99b2ae3bbb6..fb64381f1d0 100644 --- a/backend/danswer/tools/built_in_tools.py +++ b/backend/danswer/tools/built_in_tools.py @@ -9,9 +9,13 @@ from danswer.db.models import Persona from danswer.db.models import Tool as ToolDBModel -from danswer.tools.images.image_generation_tool import ImageGenerationTool -from danswer.tools.internet_search.internet_search_tool import InternetSearchTool -from danswer.tools.search.search_tool import SearchTool +from danswer.tools.tool_implementations.images.image_generation_tool import ( + ImageGenerationTool, +) +from 
danswer.tools.tool_implementations.internet_search.internet_search_tool import ( + InternetSearchTool, +) +from danswer.tools.tool_implementations.search.search_tool import SearchTool from danswer.tools.tool import Tool from danswer.utils.logger import setup_logger diff --git a/backend/danswer/tools/custom/custom_tool_prompt_builder.py b/backend/danswer/tools/custom/custom_tool_prompt_builder.py deleted file mode 100644 index 8016363acc9..00000000000 --- a/backend/danswer/tools/custom/custom_tool_prompt_builder.py +++ /dev/null @@ -1,21 +0,0 @@ -from typing import cast - -from danswer.tools.custom.custom_tool import CustomToolCallSummary -from danswer.tools.models import ToolResponse - - -def build_user_message_for_custom_tool_for_non_tool_calling_llm( - query: str, - tool_name: str, - *args: ToolResponse, -) -> str: - tool_run_summary = cast(CustomToolCallSummary, args[0].response).tool_result - return f""" -Here's the result from the {tool_name} tool: - -{tool_run_summary} - -Now respond to the following: - -{query} -""".strip() diff --git a/backend/danswer/tools/models.py b/backend/danswer/tools/models.py index 6317a95e2d3..4f56aecd372 100644 --- a/backend/danswer/tools/models.py +++ b/backend/danswer/tools/models.py @@ -1,4 +1,5 @@ from typing import Any +from uuid import UUID from pydantic import BaseModel from pydantic import model_validator @@ -40,7 +41,7 @@ class ToolCallFinalResult(ToolCallKickoff): class DynamicSchemaInfo(BaseModel): - chat_session_id: int | None + chat_session_id: UUID | None message_id: int | None diff --git a/backend/danswer/tools/tool.py b/backend/danswer/tools/tool.py index 81b9b457178..6fc9251a18a 100644 --- a/backend/danswer/tools/tool.py +++ b/backend/danswer/tools/tool.py @@ -1,11 +1,17 @@ import abc from collections.abc import Generator from typing import Any +from typing import TYPE_CHECKING -from danswer.dynamic_configs.interface import JSON_ro from danswer.llm.answering.models import PreviousMessage from danswer.llm.interfaces import LLM -from danswer.tools.models import ToolResponse +from danswer.utils.special_types import JSON_ro + + +if TYPE_CHECKING: + from danswer.llm.answering.prompts.build import AnswerPromptBuilder + from danswer.tools.message import ToolCallSummary + from danswer.tools.models import ToolResponse class Tool(abc.ABC): @@ -32,7 +38,7 @@ def tool_definition(self) -> dict: @abc.abstractmethod def build_tool_message_content( - self, *args: ToolResponse + self, *args: "ToolResponse" ) -> str | list[str | dict[str, Any]]: raise NotImplementedError @@ -51,13 +57,26 @@ def get_args_for_non_tool_calling_llm( """Actual execution of the tool""" @abc.abstractmethod - def run(self, **kwargs: Any) -> Generator[ToolResponse, None, None]: + def run(self, **kwargs: Any) -> Generator["ToolResponse", None, None]: raise NotImplementedError @abc.abstractmethod - def final_result(self, *args: ToolResponse) -> JSON_ro: + def final_result(self, *args: "ToolResponse") -> JSON_ro: """ This is the "final summary" result of the tool. It is the result that will be stored in the database. """ raise NotImplementedError + + """Some tools may want to modify the prompt based on the tool call summary and tool responses. 
+ Default behavior is to continue with just the raw tool call request/result passed to the LLM.""" + + @abc.abstractmethod + def build_next_prompt( + self, + prompt_builder: "AnswerPromptBuilder", + tool_call_summary: "ToolCallSummary", + tool_responses: list["ToolResponse"], + using_tool_calling_llm: bool, + ) -> "AnswerPromptBuilder": + raise NotImplementedError diff --git a/backend/danswer/tools/tool_constructor.py b/backend/danswer/tools/tool_constructor.py new file mode 100644 index 00000000000..a8fb5706dc2 --- /dev/null +++ b/backend/danswer/tools/tool_constructor.py @@ -0,0 +1,255 @@ +from typing import cast +from uuid import UUID + +from pydantic import BaseModel +from pydantic import Field +from sqlalchemy.orm import Session + +from danswer.configs.app_configs import AZURE_DALLE_API_BASE +from danswer.configs.app_configs import AZURE_DALLE_API_KEY +from danswer.configs.app_configs import AZURE_DALLE_API_VERSION +from danswer.configs.app_configs import AZURE_DALLE_DEPLOYMENT_NAME +from danswer.configs.chat_configs import BING_API_KEY +from danswer.configs.model_configs import GEN_AI_TEMPERATURE +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.models import InferenceSection +from danswer.context.search.models import RetrievalDetails +from danswer.db.llm import fetch_existing_llm_providers +from danswer.db.models import Persona +from danswer.db.models import User +from danswer.file_store.models import InMemoryChatFile +from danswer.llm.answering.models import AnswerStyleConfig +from danswer.llm.answering.models import CitationConfig +from danswer.llm.answering.models import DocumentPruningConfig +from danswer.llm.answering.models import PromptConfig +from danswer.llm.interfaces import LLM +from danswer.llm.interfaces import LLMConfig +from danswer.natural_language_processing.utils import get_tokenizer +from danswer.tools.built_in_tools import get_built_in_tool_by_id +from danswer.tools.models import DynamicSchemaInfo +from danswer.tools.tool import Tool +from danswer.tools.tool_implementations.custom.custom_tool import ( + build_custom_tools_from_openapi_schema_and_headers, +) +from danswer.tools.tool_implementations.images.image_generation_tool import ( + ImageGenerationTool, +) +from danswer.tools.tool_implementations.internet_search.internet_search_tool import ( + InternetSearchTool, +) +from danswer.tools.tool_implementations.search.search_tool import SearchTool +from danswer.tools.utils import compute_all_tool_tokens +from danswer.tools.utils import explicit_tool_calling_supported +from danswer.utils.headers import header_dict_to_header_list +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def _get_image_generation_config(llm: LLM, db_session: Session) -> LLMConfig: + """Helper function to get image generation LLM config based on available providers""" + if llm and llm.config.api_key and llm.config.model_provider == "openai": + return LLMConfig( + model_provider=llm.config.model_provider, + model_name="dall-e-3", + temperature=GEN_AI_TEMPERATURE, + api_key=llm.config.api_key, + api_base=llm.config.api_base, + api_version=llm.config.api_version, + ) + + if llm.config.model_provider == "azure" and AZURE_DALLE_API_KEY is not None: + return LLMConfig( + model_provider="azure", + model_name=f"azure/{AZURE_DALLE_DEPLOYMENT_NAME}", + temperature=GEN_AI_TEMPERATURE, + api_key=AZURE_DALLE_API_KEY, + api_base=AZURE_DALLE_API_BASE, + api_version=AZURE_DALLE_API_VERSION, + ) + + # Fallback to checking for OpenAI provider 
in database + llm_providers = fetch_existing_llm_providers(db_session) + openai_provider = next( + iter( + [ + llm_provider + for llm_provider in llm_providers + if llm_provider.provider == "openai" + ] + ), + None, + ) + + if not openai_provider or not openai_provider.api_key: + raise ValueError("Image generation tool requires an OpenAI API key") + + return LLMConfig( + model_provider=openai_provider.provider, + model_name="dall-e-3", + temperature=GEN_AI_TEMPERATURE, + api_key=openai_provider.api_key, + api_base=openai_provider.api_base, + api_version=openai_provider.api_version, + ) + + +class SearchToolConfig(BaseModel): + answer_style_config: AnswerStyleConfig = Field( + default_factory=lambda: AnswerStyleConfig(citation_config=CitationConfig()) + ) + document_pruning_config: DocumentPruningConfig = Field( + default_factory=DocumentPruningConfig + ) + retrieval_options: RetrievalDetails = Field(default_factory=RetrievalDetails) + selected_sections: list[InferenceSection] | None = None + chunks_above: int = 0 + chunks_below: int = 0 + full_doc: bool = False + latest_query_files: list[InMemoryChatFile] | None = None + + +class InternetSearchToolConfig(BaseModel): + answer_style_config: AnswerStyleConfig = Field( + default_factory=lambda: AnswerStyleConfig( + citation_config=CitationConfig(all_docs_useful=True) + ) + ) + + +class ImageGenerationToolConfig(BaseModel): + additional_headers: dict[str, str] | None = None + + +class CustomToolConfig(BaseModel): + chat_session_id: UUID | None = None + message_id: int | None = None + additional_headers: dict[str, str] | None = None + + +def construct_tools( + persona: Persona, + prompt_config: PromptConfig, + db_session: Session, + user: User | None, + llm: LLM, + fast_llm: LLM, + search_tool_config: SearchToolConfig | None = None, + internet_search_tool_config: InternetSearchToolConfig | None = None, + image_generation_tool_config: ImageGenerationToolConfig | None = None, + custom_tool_config: CustomToolConfig | None = None, +) -> dict[int, list[Tool]]: + """Constructs tools based on persona configuration and available APIs""" + tool_dict: dict[int, list[Tool]] = {} + + for db_tool_model in persona.tools: + if db_tool_model.in_code_tool_id: + tool_cls = get_built_in_tool_by_id(db_tool_model.id, db_session) + + # Handle Search Tool + if tool_cls.__name__ == SearchTool.__name__: + if not search_tool_config: + search_tool_config = SearchToolConfig() + + search_tool = SearchTool( + db_session=db_session, + user=user, + persona=persona, + retrieval_options=search_tool_config.retrieval_options, + prompt_config=prompt_config, + llm=llm, + fast_llm=fast_llm, + pruning_config=search_tool_config.document_pruning_config, + answer_style_config=search_tool_config.answer_style_config, + selected_sections=search_tool_config.selected_sections, + chunks_above=search_tool_config.chunks_above, + chunks_below=search_tool_config.chunks_below, + full_doc=search_tool_config.full_doc, + evaluation_type=( + LLMEvaluationType.BASIC + if persona.llm_relevance_filter + else LLMEvaluationType.SKIP + ), + ) + tool_dict[db_tool_model.id] = [search_tool] + + # Handle Image Generation Tool + elif tool_cls.__name__ == ImageGenerationTool.__name__: + if not image_generation_tool_config: + image_generation_tool_config = ImageGenerationToolConfig() + + img_generation_llm_config = _get_image_generation_config( + llm, db_session + ) + + tool_dict[db_tool_model.id] = [ + ImageGenerationTool( + api_key=cast(str, img_generation_llm_config.api_key), + 
api_base=img_generation_llm_config.api_base, + api_version=img_generation_llm_config.api_version, + additional_headers=image_generation_tool_config.additional_headers, + model=img_generation_llm_config.model_name, + ) + ] + + # Handle Internet Search Tool + elif tool_cls.__name__ == InternetSearchTool.__name__: + if not internet_search_tool_config: + internet_search_tool_config = InternetSearchToolConfig() + + if not BING_API_KEY: + raise ValueError( + "Internet search tool requires a Bing API key, please contact your Danswer admin to get it added!" + ) + tool_dict[db_tool_model.id] = [ + InternetSearchTool( + api_key=BING_API_KEY, + answer_style_config=internet_search_tool_config.answer_style_config, + prompt_config=prompt_config, + ) + ] + + # Handle custom tools + elif db_tool_model.openapi_schema: + if not custom_tool_config: + custom_tool_config = CustomToolConfig() + + tool_dict[db_tool_model.id] = cast( + list[Tool], + build_custom_tools_from_openapi_schema_and_headers( + db_tool_model.openapi_schema, + dynamic_schema_info=DynamicSchemaInfo( + chat_session_id=custom_tool_config.chat_session_id, + message_id=custom_tool_config.message_id, + ), + custom_headers=(db_tool_model.custom_headers or []) + + ( + header_dict_to_header_list( + custom_tool_config.additional_headers or {} + ) + ), + ), + ) + + tools: list[Tool] = [] + for tool_list in tool_dict.values(): + tools.extend(tool_list) + + # factor in tool definition size when pruning + if search_tool_config: + search_tool_config.document_pruning_config.tool_num_tokens = ( + compute_all_tool_tokens( + tools, + get_tokenizer( + model_name=llm.config.model_name, + provider_type=llm.config.model_provider, + ), + ) + ) + search_tool_config.document_pruning_config.using_tool_message = ( + explicit_tool_calling_supported( + llm.config.model_provider, llm.config.model_name + ) + ) + + return tool_dict diff --git a/backend/danswer/tools/custom/base_tool_types.py b/backend/danswer/tools/tool_implementations/custom/base_tool_types.py similarity index 100% rename from backend/danswer/tools/custom/base_tool_types.py rename to backend/danswer/tools/tool_implementations/custom/base_tool_types.py diff --git a/backend/danswer/tools/custom/custom_tool.py b/backend/danswer/tools/tool_implementations/custom/custom_tool.py similarity index 51% rename from backend/danswer/tools/custom/custom_tool.py rename to backend/danswer/tools/tool_implementations/custom/custom_tool.py index 0272b4ad607..c25d61b3cf3 100644 --- a/backend/danswer/tools/custom/custom_tool.py +++ b/backend/danswer/tools/tool_implementations/custom/custom_tool.py @@ -1,51 +1,85 @@ +import csv import json +import uuid from collections.abc import Generator +from io import BytesIO +from io import StringIO from typing import Any from typing import cast +from typing import Dict +from typing import List import requests from langchain_core.messages import HumanMessage from langchain_core.messages import SystemMessage from pydantic import BaseModel +from requests import JSONDecodeError -from danswer.dynamic_configs.interface import JSON_ro +from danswer.configs.constants import FileOrigin +from danswer.db.engine import get_session_with_default_tenant +from danswer.file_store.file_store import get_default_file_store +from danswer.file_store.models import ChatFileType +from danswer.file_store.models import InMemoryChatFile from danswer.llm.answering.models import PreviousMessage +from danswer.llm.answering.prompts.build import AnswerPromptBuilder from danswer.llm.interfaces import LLM -from 
danswer.tools.custom.base_tool_types import ToolResultType -from danswer.tools.custom.custom_tool_prompts import ( - SHOULD_USE_CUSTOM_TOOL_SYSTEM_PROMPT, -) -from danswer.tools.custom.custom_tool_prompts import SHOULD_USE_CUSTOM_TOOL_USER_PROMPT -from danswer.tools.custom.custom_tool_prompts import TOOL_ARG_SYSTEM_PROMPT -from danswer.tools.custom.custom_tool_prompts import TOOL_ARG_USER_PROMPT -from danswer.tools.custom.custom_tool_prompts import USE_TOOL -from danswer.tools.custom.openapi_parsing import MethodSpec -from danswer.tools.custom.openapi_parsing import openapi_to_method_specs -from danswer.tools.custom.openapi_parsing import openapi_to_url -from danswer.tools.custom.openapi_parsing import REQUEST_BODY -from danswer.tools.custom.openapi_parsing import validate_openapi_schema +from danswer.tools.base_tool import BaseTool +from danswer.tools.message import ToolCallSummary from danswer.tools.models import CHAT_SESSION_ID_PLACEHOLDER from danswer.tools.models import DynamicSchemaInfo from danswer.tools.models import MESSAGE_ID_PLACEHOLDER -from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse +from danswer.tools.models import ToolResponse +from danswer.tools.tool_implementations.custom.custom_tool_prompts import ( + SHOULD_USE_CUSTOM_TOOL_SYSTEM_PROMPT, +) +from danswer.tools.tool_implementations.custom.custom_tool_prompts import ( + SHOULD_USE_CUSTOM_TOOL_USER_PROMPT, +) +from danswer.tools.tool_implementations.custom.custom_tool_prompts import ( + TOOL_ARG_SYSTEM_PROMPT, +) +from danswer.tools.tool_implementations.custom.custom_tool_prompts import ( + TOOL_ARG_USER_PROMPT, +) +from danswer.tools.tool_implementations.custom.custom_tool_prompts import USE_TOOL +from danswer.tools.tool_implementations.custom.openapi_parsing import MethodSpec +from danswer.tools.tool_implementations.custom.openapi_parsing import ( + openapi_to_method_specs, +) +from danswer.tools.tool_implementations.custom.openapi_parsing import openapi_to_url +from danswer.tools.tool_implementations.custom.openapi_parsing import REQUEST_BODY +from danswer.tools.tool_implementations.custom.openapi_parsing import ( + validate_openapi_schema, +) +from danswer.tools.tool_implementations.custom.prompt import ( + build_custom_image_generation_user_prompt, +) +from danswer.utils.headers import header_list_to_header_dict +from danswer.utils.headers import HeaderItemDict from danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro logger = setup_logger() CUSTOM_TOOL_RESPONSE_ID = "custom_tool_response" +class CustomToolFileResponse(BaseModel): + file_ids: List[str] # References to saved images or CSVs + + class CustomToolCallSummary(BaseModel): tool_name: str - tool_result: ToolResultType + response_type: str # e.g., 'json', 'image', 'csv', 'graph' + tool_result: Any # The response data -class CustomTool(Tool): +class CustomTool(BaseTool): def __init__( self, method_spec: MethodSpec, base_url: str, + custom_headers: list[HeaderItemDict] | None = None, ) -> None: self._base_url = base_url self._method_spec = method_spec @@ -53,6 +87,9 @@ def __init__( self._name = self._method_spec.name self._description = self._method_spec.summary + self.headers = ( + header_list_to_header_dict(custom_headers) if custom_headers else {} + ) @property def name(self) -> str: @@ -75,6 +112,12 @@ def build_tool_message_content( self, *args: ToolResponse ) -> str | list[str | dict[str, Any]]: response = cast(CustomToolCallSummary, args[0].response) + + if response.response_type == 
"image" or response.response_type == "csv": + image_response = cast(CustomToolFileResponse, response.tool_result) + return json.dumps({"file_ids": image_response.file_ids}) + + # For JSON or other responses, return as-is return json.dumps(response.tool_result) """For LLMs which do NOT support explicit tool calling""" @@ -142,6 +185,38 @@ def get_args_for_non_tool_calling_llm( ) return None + def _save_and_get_file_references( + self, file_content: bytes | str, content_type: str + ) -> List[str]: + with get_session_with_default_tenant() as db_session: + file_store = get_default_file_store(db_session) + + file_id = str(uuid.uuid4()) + + # Handle both binary and text content + if isinstance(file_content, str): + content = BytesIO(file_content.encode()) + else: + content = BytesIO(file_content) + + file_store.save_file( + file_name=file_id, + content=content, + display_name=file_id, + file_origin=FileOrigin.CHAT_UPLOAD, + file_type=content_type, + file_metadata={ + "content_type": content_type, + }, + ) + + return [file_id] + + def _parse_csv(self, csv_text: str) -> List[Dict[str, Any]]: + csv_file = StringIO(csv_text) + reader = csv.DictReader(csv_file) + return [row for row in reader] + """Actual execution of the tool""" def run(self, **kwargs: Any) -> Generator[ToolResponse, None, None]: @@ -162,21 +237,116 @@ def run(self, **kwargs: Any) -> Generator[ToolResponse, None, None]: url = self._method_spec.build_url(self._base_url, path_params, query_params) method = self._method_spec.method - response = requests.request(method, url, json=request_body) + response = requests.request( + method, url, json=request_body, headers=self.headers + ) + content_type = response.headers.get("Content-Type", "") + + tool_result: Any + response_type: str + if "text/csv" in content_type: + file_ids = self._save_and_get_file_references( + response.content, content_type + ) + tool_result = CustomToolFileResponse(file_ids=file_ids) + response_type = "csv" + + elif "image/" in content_type: + file_ids = self._save_and_get_file_references( + response.content, content_type + ) + tool_result = CustomToolFileResponse(file_ids=file_ids) + response_type = "image" + + else: + try: + tool_result = response.json() + response_type = "json" + except JSONDecodeError: + logger.exception( + f"Failed to parse response as JSON for tool '{self._name}'" + ) + tool_result = response.text + response_type = "text" + + logger.info( + f"Returning tool response for {self._name} with type {response_type}" + ) yield ToolResponse( id=CUSTOM_TOOL_RESPONSE_ID, response=CustomToolCallSummary( - tool_name=self._name, tool_result=response.json() + tool_name=self._name, + response_type=response_type, + tool_result=tool_result, ), ) + def build_next_prompt( + self, + prompt_builder: AnswerPromptBuilder, + tool_call_summary: ToolCallSummary, + tool_responses: list[ToolResponse], + using_tool_calling_llm: bool, + ) -> AnswerPromptBuilder: + response = cast(CustomToolCallSummary, tool_responses[0].response) + + # Handle non-file responses using parent class behavior + if response.response_type not in ["image", "csv"]: + return super().build_next_prompt( + prompt_builder, + tool_call_summary, + tool_responses, + using_tool_calling_llm, + ) + + # Handle image and CSV file responses + file_type = ( + ChatFileType.IMAGE + if response.response_type == "image" + else ChatFileType.CSV + ) + + # Load files from storage + files = [] + with get_session_with_default_tenant() as db_session: + file_store = get_default_file_store(db_session) + + for file_id in 
response.tool_result.file_ids: + try: + file_io = file_store.read_file(file_id, mode="b") + files.append( + InMemoryChatFile( + file_id=file_id, + filename=file_id, + content=file_io.read(), + file_type=file_type, + ) + ) + except Exception: + logger.exception(f"Failed to read file {file_id}") + + # Update prompt with file content + prompt_builder.update_user_prompt( + build_custom_image_generation_user_prompt( + query=prompt_builder.get_user_message_content(), + files=files, + file_type=file_type, + ) + ) + + return prompt_builder + def final_result(self, *args: ToolResponse) -> JSON_ro: - return cast(CustomToolCallSummary, args[0].response).tool_result + response = cast(CustomToolCallSummary, args[0].response) + if isinstance(response.tool_result, CustomToolFileResponse): + return response.tool_result.model_dump() + return response.tool_result -def build_custom_tools_from_openapi_schema( +def build_custom_tools_from_openapi_schema_and_headers( openapi_schema: dict[str, Any], + custom_headers: list[HeaderItemDict] | None = None, dynamic_schema_info: DynamicSchemaInfo | None = None, ) -> list[CustomTool]: if dynamic_schema_info: @@ -195,7 +365,9 @@ def build_custom_tools_from_openapi_schema( url = openapi_to_url(openapi_schema) method_specs = openapi_to_method_specs(openapi_schema) - return [CustomTool(method_spec, url) for method_spec in method_specs] + return [ + CustomTool(method_spec, url, custom_headers) for method_spec in method_specs + ] if __name__ == "__main__": @@ -246,7 +418,7 @@ def build_custom_tools_from_openapi_schema( } validate_openapi_schema(openapi_schema) - tools = build_custom_tools_from_openapi_schema( + tools = build_custom_tools_from_openapi_schema_and_headers( openapi_schema, dynamic_schema_info=None ) diff --git a/backend/danswer/tools/custom/custom_tool_prompts.py b/backend/danswer/tools/tool_implementations/custom/custom_tool_prompts.py similarity index 100% rename from backend/danswer/tools/custom/custom_tool_prompts.py rename to backend/danswer/tools/tool_implementations/custom/custom_tool_prompts.py diff --git a/backend/danswer/tools/custom/openapi_parsing.py b/backend/danswer/tools/tool_implementations/custom/openapi_parsing.py similarity index 100% rename from backend/danswer/tools/custom/openapi_parsing.py rename to backend/danswer/tools/tool_implementations/custom/openapi_parsing.py diff --git a/backend/danswer/tools/tool_implementations/custom/prompt.py b/backend/danswer/tools/tool_implementations/custom/prompt.py new file mode 100644 index 00000000000..9911594a917 --- /dev/null +++ b/backend/danswer/tools/tool_implementations/custom/prompt.py @@ -0,0 +1,25 @@ +from langchain_core.messages import HumanMessage + +from danswer.file_store.models import ChatFileType +from danswer.file_store.models import InMemoryChatFile +from danswer.llm.utils import build_content_with_imgs + + +CUSTOM_IMG_GENERATION_SUMMARY_PROMPT = """ +You have just created the attached {file_type} file in response to the following query: "{query}". + +Can you please summarize it in a sentence or two? Do NOT include image urls or bulleted lists. 
+""" + + +def build_custom_image_generation_user_prompt( + query: str, file_type: ChatFileType, files: list[InMemoryChatFile] | None = None +) -> HumanMessage: + return HumanMessage( + content=build_content_with_imgs( + message=CUSTOM_IMG_GENERATION_SUMMARY_PROMPT.format( + query=query, file_type=file_type.value + ).strip(), + files=files, + ) + ) diff --git a/backend/danswer/tools/images/image_generation_tool.py b/backend/danswer/tools/tool_implementations/images/image_generation_tool.py similarity index 83% rename from backend/danswer/tools/images/image_generation_tool.py rename to backend/danswer/tools/tool_implementations/images/image_generation_tool.py index 6e2515a8e9f..70763fc7896 100644 --- a/backend/danswer/tools/images/image_generation_tool.py +++ b/backend/danswer/tools/tool_implementations/images/image_generation_tool.py @@ -9,16 +9,21 @@ from danswer.chat.chat_utils import combine_message_chain from danswer.configs.model_configs import GEN_AI_HISTORY_CUTOFF -from danswer.dynamic_configs.interface import JSON_ro from danswer.llm.answering.models import PreviousMessage -from danswer.llm.headers import build_llm_extra_headers +from danswer.llm.answering.prompts.build import AnswerPromptBuilder from danswer.llm.interfaces import LLM from danswer.llm.utils import build_content_with_imgs from danswer.llm.utils import message_to_string from danswer.prompts.constants import GENERAL_SEP_PAT +from danswer.tools.message import ToolCallSummary +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse +from danswer.tools.tool_implementations.images.prompt import ( + build_image_generation_user_prompt, +) +from danswer.utils.headers import build_llm_extra_headers from danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel @@ -112,7 +117,10 @@ def tool_definition(self) -> dict: }, "shape": { "type": "string", - "description": "Optional. Image shape: 'square', 'portrait', or 'landscape'", + "description": ( + "Optional - only specify if you want a specific shape." + " Image shape: 'square', 'portrait', or 'landscape'." 
+ ), "enum": [shape.value for shape in ImageShape], }, }, @@ -258,3 +266,34 @@ def final_result(self, *args: ToolResponse) -> JSON_ro: image_generation_response.model_dump() for image_generation_response in image_generation_responses ] + + def build_next_prompt( + self, + prompt_builder: AnswerPromptBuilder, + tool_call_summary: ToolCallSummary, + tool_responses: list[ToolResponse], + using_tool_calling_llm: bool, + ) -> AnswerPromptBuilder: + img_generation_response = cast( + list[ImageGenerationResponse] | None, + next( + ( + response.response + for response in tool_responses + if response.id == IMAGE_GENERATION_RESPONSE_ID + ), + None, + ), + ) + if img_generation_response is None: + raise ValueError("No image generation response found") + + img_urls = [img.url for img in img_generation_response] + prompt_builder.update_user_prompt( + build_image_generation_user_prompt( + query=prompt_builder.get_user_message_content(), + img_urls=img_urls, + ) + ) + + return prompt_builder diff --git a/backend/danswer/tools/images/prompt.py b/backend/danswer/tools/tool_implementations/images/prompt.py similarity index 100% rename from backend/danswer/tools/images/prompt.py rename to backend/danswer/tools/tool_implementations/images/prompt.py diff --git a/backend/danswer/tools/internet_search/internet_search_tool.py b/backend/danswer/tools/tool_implementations/internet_search/internet_search_tool.py similarity index 80% rename from backend/danswer/tools/internet_search/internet_search_tool.py rename to backend/danswer/tools/tool_implementations/internet_search/internet_search_tool.py index 3012eb465f4..7819753a30f 100644 --- a/backend/danswer/tools/internet_search/internet_search_tool.py +++ b/backend/danswer/tools/tool_implementations/internet_search/internet_search_tool.py @@ -5,25 +5,29 @@ from typing import cast import httpx - from danswer.chat.chat_utils import combine_message_chain from danswer.chat.models import LlmDoc from danswer.configs.constants import DocumentSource from danswer.configs.model_configs import GEN_AI_HISTORY_CUTOFF -from danswer.dynamic_configs.interface import JSON_ro +from danswer.context.search.models import SearchDoc +from danswer.llm.answering.models import AnswerStyleConfig from danswer.llm.answering.models import PreviousMessage +from danswer.llm.answering.models import PromptConfig +from danswer.llm.answering.prompts.build import AnswerPromptBuilder from danswer.llm.interfaces import LLM from danswer.llm.utils import message_to_string from danswer.prompts.chat_prompts import INTERNET_SEARCH_QUERY_REPHRASE from danswer.prompts.constants import GENERAL_SEP_PAT -from danswer.search.models import SearchDoc from danswer.secondary_llm_flows.query_expansion import history_based_query_rephrase -from danswer.tools.internet_search.models import InternetSearchResponse -from danswer.tools.internet_search.models import InternetSearchResult -from danswer.tools.search.search_tool import FINAL_CONTEXT_DOCUMENTS_ID +from danswer.tools.message import ToolCallSummary +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse +from danswer.tools.tool_implementations.internet_search.models import InternetSearchResponse +from danswer.tools.tool_implementations.internet_search.models import InternetSearchResult +from danswer.tools.tool_implementations.search_like_tool_utils import build_next_prompt_for_search_like_tool +from danswer.tools.tool_implementations.search_like_tool_utils import FINAL_CONTEXT_DOCUMENTS_ID from 
danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro logger = setup_logger() @@ -97,8 +101,17 @@ class InternetSearchTool(Tool): _DISPLAY_NAME = "[Beta] Internet Search Tool" _DESCRIPTION = "Perform an internet search for up-to-date information." - def __init__(self, api_key: str, num_results: int = 10) -> None: + def __init__( + self, + api_key: str, + answer_style_config: AnswerStyleConfig, + prompt_config: PromptConfig, + num_results: int = 10, + ) -> None: self.api_key = api_key + self.answer_style_config = answer_style_config + self.prompt_config = prompt_config + self.host = "https://api.bing.microsoft.com/v7.0" self.headers = { "Ocp-Apim-Subscription-Key": api_key, @@ -231,3 +244,19 @@ def run(self, **kwargs: str) -> Generator[ToolResponse, None, None]: def final_result(self, *args: ToolResponse) -> JSON_ro: search_response = cast(InternetSearchResponse, args[0].response) return search_response.model_dump() + + def build_next_prompt( + self, + prompt_builder: AnswerPromptBuilder, + tool_call_summary: ToolCallSummary, + tool_responses: list[ToolResponse], + using_tool_calling_llm: bool, + ) -> AnswerPromptBuilder: + return build_next_prompt_for_search_like_tool( + prompt_builder=prompt_builder, + tool_call_summary=tool_call_summary, + tool_responses=tool_responses, + using_tool_calling_llm=using_tool_calling_llm, + answer_style_config=self.answer_style_config, + prompt_config=self.prompt_config + ) diff --git a/backend/danswer/tools/internet_search/models.py b/backend/danswer/tools/tool_implementations/internet_search/models.py similarity index 100% rename from backend/danswer/tools/internet_search/models.py rename to backend/danswer/tools/tool_implementations/internet_search/models.py diff --git a/backend/danswer/tools/search/search_tool.py b/backend/danswer/tools/tool_implementations/search/search_tool.py similarity index 81% rename from backend/danswer/tools/search/search_tool.py rename to backend/danswer/tools/tool_implementations/search/search_tool.py index cbfaf4f3d92..1ea7795d602 100644 --- a/backend/danswer/tools/search/search_tool.py +++ b/backend/danswer/tools/tool_implementations/search/search_tool.py @@ -1,51 +1,60 @@ import json from collections.abc import Generator -from typing import Any -from typing import cast - -from pydantic import BaseModel -from sqlalchemy.orm import Session +from typing import Any, cast from danswer.chat.chat_utils import llm_doc_from_inference_section -from danswer.chat.models import DanswerContext -from danswer.chat.models import DanswerContexts -from danswer.chat.models import LlmDoc -from danswer.chat.models import SectionRelevancePiece -from danswer.configs.chat_configs import CONTEXT_CHUNKS_ABOVE -from danswer.configs.chat_configs import CONTEXT_CHUNKS_BELOW +from danswer.chat.models import ( + DanswerContext, + DanswerContexts, + LlmDoc, + SectionRelevancePiece, +) +from danswer.configs.chat_configs import CONTEXT_CHUNKS_ABOVE, CONTEXT_CHUNKS_BELOW from danswer.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS -from danswer.db.models import Persona -from danswer.db.models import User -from danswer.dynamic_configs.interface import JSON_ro -from danswer.llm.answering.models import ContextualPruningConfig -from danswer.llm.answering.models import DocumentPruningConfig -from danswer.llm.answering.models import PreviousMessage -from danswer.llm.answering.models import PromptConfig +from danswer.context.search.enums import LLMEvaluationType, QueryFlow, SearchType +from 
danswer.context.search.models import ( + IndexFilters, + InferenceSection, + RetrievalDetails, + SearchRequest, +) +from danswer.context.search.pipeline import SearchPipeline +from danswer.db.models import Persona, User +from danswer.llm.answering.llm_response_handler import LLMCall +from danswer.llm.answering.models import ( + AnswerStyleConfig, + ContextualPruningConfig, + DocumentPruningConfig, + PreviousMessage, + PromptConfig, +) +from danswer.llm.answering.prompts.build import AnswerPromptBuilder from danswer.llm.answering.prompts.citations_prompt import compute_max_llm_input_tokens -from danswer.llm.answering.prune_and_merge import prune_and_merge_sections -from danswer.llm.answering.prune_and_merge import prune_sections +from danswer.llm.answering.prune_and_merge import ( + prune_and_merge_sections, + prune_sections, +) from danswer.llm.interfaces import LLM -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import QueryFlow -from danswer.search.enums import SearchType -from danswer.search.models import IndexFilters -from danswer.search.models import InferenceSection -from danswer.search.models import RetrievalDetails -from danswer.search.models import SearchRequest -from danswer.search.pipeline import SearchPipeline from danswer.secondary_llm_flows.choose_search import check_if_need_search from danswer.secondary_llm_flows.query_expansion import history_based_query_rephrase -from danswer.tools.search.search_utils import llm_doc_to_dict +from danswer.tools.message import ToolCallSummary +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse +from danswer.tools.tool_implementations.search.search_utils import llm_doc_to_dict +from danswer.tools.tool_implementations.search_like_tool_utils import ( + FINAL_CONTEXT_DOCUMENTS_ID, + build_next_prompt_for_search_like_tool, +) from danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro +from pydantic import BaseModel +from sqlalchemy.orm import Session logger = setup_logger() SEARCH_RESPONSE_SUMMARY_ID = "search_response_summary" SEARCH_DOC_CONTENT_ID = "search_doc_content" SECTION_RELEVANCE_LIST_ID = "section_relevance_list" -FINAL_CONTEXT_DOCUMENTS_ID = "final_context_documents" SEARCH_EVALUATION_ID = "llm_doc_eval" @@ -85,6 +94,7 @@ def __init__( llm: LLM, fast_llm: LLM, pruning_config: DocumentPruningConfig, + answer_style_config: AnswerStyleConfig, evaluation_type: LLMEvaluationType, # if specified, will not actually run a search and will instead return these # sections. 
Used when the user selects specific docs to talk to @@ -136,6 +146,7 @@ def __init__( num_chunk_multiple = self.chunks_above + self.chunks_below + 1 + self.answer_style_config = answer_style_config self.contextual_pruning_config = ( ContextualPruningConfig.from_doc_pruning_config( num_chunk_multiple=num_chunk_multiple, doc_pruning_config=pruning_config @@ -353,4 +364,37 @@ def final_result(self, *args: ToolResponse) -> JSON_ro: # NOTE: need to do this json.loads(doc.json()) stuff because there are some # subfields that are not serializable by default (datetime) # this forces pydantic to make them JSON serializable for us - return [json.loads(doc.json()) for doc in final_docs] + return [json.loads(doc.model_dump_json()) for doc in final_docs] + + def build_next_prompt( + self, + prompt_builder: AnswerPromptBuilder, + tool_call_summary: ToolCallSummary, + tool_responses: list[ToolResponse], + using_tool_calling_llm: bool, + ) -> AnswerPromptBuilder: + return build_next_prompt_for_search_like_tool( + prompt_builder=prompt_builder, + tool_call_summary=tool_call_summary, + tool_responses=tool_responses, + using_tool_calling_llm=using_tool_calling_llm, + answer_style_config=self.answer_style_config, + prompt_config=self.prompt_config, + user_email=self.user.email if self.user else None, + ) + + """Other utility functions""" + + @classmethod + def get_search_result(cls, llm_call: LLMCall) -> list[LlmDoc] | None: + if not llm_call.tool_call_info: + return None + + for yield_item in llm_call.tool_call_info: + if ( + isinstance(yield_item, ToolResponse) + and yield_item.id == FINAL_CONTEXT_DOCUMENTS_ID + ): + return cast(list[LlmDoc], yield_item.response) + + return None diff --git a/backend/danswer/tools/search/search_utils.py b/backend/danswer/tools/tool_implementations/search/search_utils.py similarity index 94% rename from backend/danswer/tools/search/search_utils.py rename to backend/danswer/tools/tool_implementations/search/search_utils.py index 5076632a694..5eef0b891f0 100644 --- a/backend/danswer/tools/search/search_utils.py +++ b/backend/danswer/tools/tool_implementations/search/search_utils.py @@ -1,6 +1,6 @@ from danswer.chat.models import LlmDoc +from danswer.context.search.models import InferenceSection from danswer.prompts.prompt_utils import clean_up_source -from danswer.search.models import InferenceSection def llm_doc_to_dict(llm_doc: LlmDoc, doc_num: int) -> dict: diff --git a/backend/danswer/tools/tool_implementations/search_like_tool_utils.py b/backend/danswer/tools/tool_implementations/search_like_tool_utils.py new file mode 100644 index 00000000000..04f21ecf59c --- /dev/null +++ b/backend/danswer/tools/tool_implementations/search_like_tool_utils.py @@ -0,0 +1,77 @@ +from typing import cast + +from danswer.chat.models import LlmDoc +from danswer.llm.answering.models import AnswerStyleConfig, PromptConfig +from danswer.llm.answering.prompts.build import AnswerPromptBuilder +from danswer.llm.answering.prompts.citations_prompt import ( + build_citations_system_message, + build_citations_user_message, +) +from danswer.llm.answering.prompts.quotes_prompt import build_quotes_user_message +from danswer.tools.message import ToolCallSummary +from danswer.tools.models import ToolResponse +from langchain_core.messages import HumanMessage + +FINAL_CONTEXT_DOCUMENTS_ID = "final_context_documents" + + +def build_next_prompt_for_search_like_tool( + prompt_builder: AnswerPromptBuilder, + tool_call_summary: ToolCallSummary, + tool_responses: list[ToolResponse], + using_tool_calling_llm: bool, + 
answer_style_config: AnswerStyleConfig, + prompt_config: PromptConfig, + user_email: str | None = None, +) -> AnswerPromptBuilder: + if not using_tool_calling_llm: + final_context_docs_response = next( + response + for response in tool_responses + if response.id == FINAL_CONTEXT_DOCUMENTS_ID + ) + final_context_documents = cast( + list[LlmDoc], final_context_docs_response.response + ) + else: + # if using tool calling llm, then the final context documents are the tool responses + final_context_documents = [] + + if answer_style_config.citation_config: + prompt_builder.update_system_prompt( + build_citations_system_message(prompt_config, user_email) + ) + prompt_builder.update_user_prompt( + build_citations_user_message( + message=prompt_builder.user_message_and_token_cnt[0], + prompt_config=prompt_config, + context_docs=final_context_documents, + all_doc_useful=( + answer_style_config.citation_config.all_docs_useful + if answer_style_config.citation_config + else False + ), + history_message=prompt_builder.single_message_history or "", + ) + ) + elif answer_style_config.quotes_config: + # For Quotes, the system prompt is included in the user prompt + prompt_builder.update_system_prompt(None) + + human_message = HumanMessage(content=prompt_builder.raw_user_message) + + prompt_builder.update_user_prompt( + build_quotes_user_message( + message=human_message, + context_docs=final_context_documents, + history_str=prompt_builder.single_message_history or "", + prompt=prompt_config, + user_email=user_email, + ) + ) + + if using_tool_calling_llm: + prompt_builder.append_message(tool_call_summary.tool_call_request) + prompt_builder.append_message(tool_call_summary.tool_call_result) + + return prompt_builder diff --git a/backend/danswer/tools/tool_runner.py b/backend/danswer/tools/tool_runner.py index 58b94bdb0c8..fb3eb8b9932 100644 --- a/backend/danswer/tools/tool_runner.py +++ b/backend/danswer/tools/tool_runner.py @@ -6,8 +6,8 @@ from danswer.llm.interfaces import LLM from danswer.tools.models import ToolCallFinalResult from danswer.tools.models import ToolCallKickoff +from danswer.tools.models import ToolResponse from danswer.tools.tool import Tool -from danswer.tools.tool import ToolResponse from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel diff --git a/backend/danswer/tools/utils.py b/backend/danswer/tools/utils.py index 9e20105edef..52d60feb912 100644 --- a/backend/danswer/tools/utils.py +++ b/backend/danswer/tools/utils.py @@ -1,5 +1,11 @@ import json +from sqlalchemy.orm import Session + +from danswer.configs.app_configs import AZURE_DALLE_API_KEY +from danswer.db.connector import check_connectors_exist +from danswer.db.document import check_docs_exist +from danswer.db.models import LLMProvider from danswer.natural_language_processing.utils import BaseTokenizer from danswer.tools.tool import Tool @@ -26,3 +32,18 @@ def compute_tool_tokens(tool: Tool, llm_tokenizer: BaseTokenizer) -> int: def compute_all_tool_tokens(tools: list[Tool], llm_tokenizer: BaseTokenizer) -> int: return sum(compute_tool_tokens(tool, llm_tokenizer) for tool in tools) + + +def is_image_generation_available(db_session: Session) -> bool: + providers = db_session.query(LLMProvider).all() + for provider in providers: + if provider.provider == "openai": + return True + + return bool(AZURE_DALLE_API_KEY) + + +def is_document_search_available(db_session: Session) -> bool: + docs_exist = check_docs_exist(db_session) + connectors_exist = check_connectors_exist(db_session) + return docs_exist or 
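Editorial aside: the availability helpers added in backend/danswer/tools/utils.py are intended to gate which built-in tools get exposed. A minimal usage sketch, not part of this patch; the wrapper function and tool names below are illustrative assumptions:

from sqlalchemy.orm import Session

from danswer.tools.utils import (
    is_document_search_available,
    is_image_generation_available,
)


def available_builtin_tool_names(db_session: Session) -> list[str]:
    # Illustrative gating: offer a tool only when its backing resources exist.
    names: list[str] = []
    if is_document_search_available(db_session):
        # True when any documents or connectors exist in this deployment
        names.append("SearchTool")
    if is_image_generation_available(db_session):
        # True when an OpenAI provider is configured or AZURE_DALLE_API_KEY is set
        names.append("ImageGenerationTool")
    return names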
connectors_exist diff --git a/backend/danswer/utils/headers.py b/backend/danswer/utils/headers.py new file mode 100644 index 00000000000..5ccf61a51e1 --- /dev/null +++ b/backend/danswer/utils/headers.py @@ -0,0 +1,79 @@ +from typing import TypedDict + +from fastapi.datastructures import Headers + +from danswer.configs.model_configs import LITELLM_EXTRA_HEADERS +from danswer.configs.model_configs import LITELLM_PASS_THROUGH_HEADERS +from danswer.configs.tool_configs import CUSTOM_TOOL_PASS_THROUGH_HEADERS +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +class HeaderItemDict(TypedDict): + key: str + value: str + + +def clean_header_list(headers_to_clean: list[HeaderItemDict]) -> dict[str, str]: + cleaned_headers: dict[str, str] = {} + for item in headers_to_clean: + key = item["key"] + value = item["value"] + if key in cleaned_headers: + logger.warning( + f"Duplicate header {key} found in custom headers, ignoring..." + ) + continue + cleaned_headers[key] = value + return cleaned_headers + + +def header_dict_to_header_list(header_dict: dict[str, str]) -> list[HeaderItemDict]: + return [{"key": key, "value": value} for key, value in header_dict.items()] + + +def header_list_to_header_dict(header_list: list[HeaderItemDict]) -> dict[str, str]: + return {header["key"]: header["value"] for header in header_list} + + +def get_relevant_headers( + headers: dict[str, str] | Headers, desired_headers: list[str] | None +) -> dict[str, str]: + if not desired_headers: + return {} + + pass_through_headers: dict[str, str] = {} + for key in desired_headers: + if key in headers: + pass_through_headers[key] = headers[key] + else: + # fastapi makes all header keys lowercase, handling that here + lowercase_key = key.lower() + if lowercase_key in headers: + pass_through_headers[lowercase_key] = headers[lowercase_key] + + return pass_through_headers + + +def get_litellm_additional_request_headers( + headers: dict[str, str] | Headers +) -> dict[str, str]: + return get_relevant_headers(headers, LITELLM_PASS_THROUGH_HEADERS) + + +def build_llm_extra_headers( + additional_headers: dict[str, str] | None = None +) -> dict[str, str]: + extra_headers: dict[str, str] = {} + if additional_headers: + extra_headers.update(additional_headers) + if LITELLM_EXTRA_HEADERS: + extra_headers.update(LITELLM_EXTRA_HEADERS) + return extra_headers + + +def get_custom_tool_additional_request_headers( + headers: dict[str, str] | Headers +) -> dict[str, str]: + return get_relevant_headers(headers, CUSTOM_TOOL_PASS_THROUGH_HEADERS) diff --git a/backend/danswer/utils/logger.py b/backend/danswer/utils/logger.py index 96d4ae2a25e..b9c335be9c6 100644 --- a/backend/danswer/utils/logger.py +++ b/backend/danswer/utils/logger.py @@ -1,3 +1,4 @@ +import contextvars import logging import os from collections.abc import MutableMapping @@ -7,13 +8,25 @@ from shared_configs.configs import DEV_LOGGING_ENABLED from shared_configs.configs import LOG_FILE_NAME from shared_configs.configs import LOG_LEVEL +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA from shared_configs.configs import SLACK_CHANNEL_ID +from shared_configs.configs import TENANT_ID_PREFIX +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR logging.addLevelName(logging.INFO + 5, "NOTICE") +pruning_ctx: contextvars.ContextVar[dict[str, Any]] = contextvars.ContextVar( + "pruning_ctx", default=dict() +) -class IndexAttemptSingleton: +doc_permission_sync_ctx: contextvars.ContextVar[ + 
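Editorial aside: the header helpers added in backend/danswer/utils/headers.py centralize pass-through header handling. A small sketch of how they compose; the header names and allow-list here are illustrative, not from the patch:

from danswer.utils.headers import (
    clean_header_list,
    get_relevant_headers,
    header_dict_to_header_list,
)

# Incoming request headers (FastAPI lowercases keys; get_relevant_headers handles that).
incoming = {"authorization": "Bearer abc", "x-team-id": "growth", "accept": "*/*"}

# Only forward the headers an admin explicitly allowed.
forwarded = get_relevant_headers(incoming, desired_headers=["X-Team-Id"])
assert forwarded == {"x-team-id": "growth"}

# Round-trip between the list-of-dicts storage format and a plain dict.
as_list = header_dict_to_header_list(forwarded)
assert clean_header_list(as_list) == forwarded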
dict[str, Any] +] = contextvars.ContextVar("doc_permission_sync_ctx", default=dict()) + + +class TaskAttemptSingleton: """Used to tell if this process is an indexing job, and if so what is the unique identifier for this indexing attempt. For things like the API server, main background job (scheduler), etc. this will not be used.""" @@ -57,14 +70,38 @@ def process( ) -> tuple[str, MutableMapping[str, Any]]: # If this is an indexing job, add the attempt ID to the log message # This helps filter the logs for this specific indexing - attempt_id = IndexAttemptSingleton.get_index_attempt_id() - cc_pair_id = IndexAttemptSingleton.get_connector_credential_pair_id() - - if attempt_id is not None: - msg = f"[Attempt ID: {attempt_id}] {msg}" - - if cc_pair_id is not None: - msg = f"[CC Pair ID: {cc_pair_id}] {msg}" + index_attempt_id = TaskAttemptSingleton.get_index_attempt_id() + cc_pair_id = TaskAttemptSingleton.get_connector_credential_pair_id() + + doc_permission_sync_ctx_dict = doc_permission_sync_ctx.get() + pruning_ctx_dict = pruning_ctx.get() + if len(pruning_ctx_dict) > 0: + if "request_id" in pruning_ctx_dict: + msg = f"[Prune: {pruning_ctx_dict['request_id']}] {msg}" + + if "cc_pair_id" in pruning_ctx_dict: + msg = f"[CC Pair: {pruning_ctx_dict['cc_pair_id']}] {msg}" + elif len(doc_permission_sync_ctx_dict) > 0: + if "request_id" in doc_permission_sync_ctx_dict: + msg = f"[Doc Permissions Sync: {doc_permission_sync_ctx_dict['request_id']}] {msg}" + else: + if index_attempt_id is not None: + msg = f"[Index Attempt: {index_attempt_id}] {msg}" + + if cc_pair_id is not None: + msg = f"[CC Pair: {cc_pair_id}] {msg}" + + # Add tenant information if it differs from default + # This will always be the case for authenticated API requests + if MULTI_TENANT: + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + if tenant_id != POSTGRES_DEFAULT_SCHEMA: + # Strip tenant_ prefix and take first 8 chars for cleaner logs + tenant_display = tenant_id.removeprefix(TENANT_ID_PREFIX) + short_tenant = ( + tenant_display[:8] if len(tenant_display) > 8 else tenant_display + ) + msg = f"[t:{short_tenant}] {msg}" # For Slack Bot, logs the channel relevant to the request channel_id = self.extra.get(SLACK_CHANNEL_ID) if self.extra else None @@ -182,3 +219,25 @@ def setup_logger( logger.notice = lambda msg, *args, **kwargs: logger.log(logging.getLevelName("NOTICE"), msg, *args, **kwargs) # type: ignore return DanswerLoggingAdapter(logger, extra=extra) + + +def print_loggers() -> None: + """Print information about all loggers. 
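Editorial aside: the logging adapter above derives its log prefixes from contextvars instead of explicit arguments. A minimal sketch of how a pruning job might scope its logs; the function and request_id value are illustrative assumptions:

from danswer.utils.logger import pruning_ctx, setup_logger

logger = setup_logger()


def prune_one_cc_pair(cc_pair_id: int) -> None:
    # Scope every log line in this task with the pruning context,
    # without threading IDs through each call site.
    token = pruning_ctx.set({"request_id": "a1b2c3d4", "cc_pair_id": cc_pair_id})
    try:
        # Emitted as: [CC Pair: <id>] [Prune: a1b2c3d4] Starting prune
        logger.info("Starting prune")
    finally:
        pruning_ctx.reset(token)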
Use to debug logging issues.""" + root_logger = logging.getLogger() + loggers: list[logging.Logger | logging.PlaceHolder] = [root_logger] + loggers.extend(logging.Logger.manager.loggerDict.values()) + + for logger in loggers: + if isinstance(logger, logging.PlaceHolder): + # Skip placeholders that aren't actual loggers + continue + + print(f"Logger: '{logger.name}' (Level: {logging.getLevelName(logger.level)})") + if logger.handlers: + for handler in logger.handlers: + print(f" Handler: {handler}") + else: + print(" No handlers") + + print(f" Propagate: {logger.propagate}") + print() diff --git a/backend/danswer/utils/long_term_log.py b/backend/danswer/utils/long_term_log.py new file mode 100644 index 00000000000..4faef454340 --- /dev/null +++ b/backend/danswer/utils/long_term_log.py @@ -0,0 +1,115 @@ +import json +import os +import threading +from datetime import datetime +from pathlib import Path +from typing import Any + +from danswer.utils.logger import setup_logger +from danswer.utils.special_types import JSON_ro + +logger = setup_logger() + +_LOG_FILE_NAME_TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S-%f" + + +class LongTermLogger: + """NOTE: should support a LOT of data AND should be extremely fast, + ideally done in a background thread.""" + + def __init__( + self, + metadata: dict[str, str] | None = None, + log_file_path: str = "/tmp/long_term_log", + max_files_per_category: int = 1000, + ): + self.metadata = metadata + self.log_file_path = Path(log_file_path) + self.max_files_per_category = max_files_per_category + try: + # Create directory if it doesn't exist + os.makedirs(os.path.dirname(log_file_path), exist_ok=True) + except Exception as e: + logger.error(f"Error creating directory for long-term logs: {e}") + + def _cleanup_old_files(self, category_path: Path) -> None: + try: + files = sorted( + category_path.glob("*.json"), + key=lambda x: x.stat().st_mtime, # Sort by modification time + reverse=True, + ) + # Delete oldest files that exceed the limit + for file in files[self.max_files_per_category :]: + try: + file.unlink() + except Exception as e: + logger.error(f"Error deleting old log file {file}: {e}") + except Exception as e: + logger.error(f"Error during log rotation cleanup: {e}") + + def _record(self, message: Any, category: str) -> None: + category_path = self.log_file_path / category + try: + # Create directory if it doesn't exist + os.makedirs(category_path, exist_ok=True) + + # Perform cleanup before writing new file + self._cleanup_old_files(category_path) + + final_record = { + "metadata": self.metadata, + "record": message, + } + + file_path = ( + category_path + / f"{datetime.now().strftime(_LOG_FILE_NAME_TIMESTAMP_FORMAT)}.json" + ) + with open(file_path, "w+") as f: + # default allows us to "ignore" unserializable objects + json.dump(final_record, f, default=lambda x: str(x)) + except Exception as e: + logger.error(f"Error recording log: {e}") + + def record(self, message: JSON_ro, category: str = "default") -> None: + try: + # Run in separate thread to have minimal overhead in main flows + thread = threading.Thread( + target=self._record, args=(message, category), daemon=True + ) + thread.start() + except Exception: + # Should never interfere with normal functions of Danswer + pass + + def fetch_category( + self, + category: str, + start_time: datetime | None = None, + end_time: datetime | None = None, + limit: int = 100, + ) -> list[JSON_ro]: + category_path = self.log_file_path / category + files = list(category_path.glob("*.json")) + + results: list[JSON_ro] = 
[] + for file in files: + # Parse timestamp from filename (YYYY-MM-DD_HH-MM-SS.json) + try: + file_time = datetime.strptime( + file.stem, _LOG_FILE_NAME_TIMESTAMP_FORMAT + ) + + # Skip if outside time range + if start_time and file_time < start_time: + continue + if end_time and file_time > end_time: + continue + + results.append(json.loads(file.read_text())) + except ValueError: + # Skip files that don't match expected format + continue + + return results diff --git a/backend/danswer/connectors/cross_connector_utils/retry_wrapper.py b/backend/danswer/utils/retry_wrapper.py similarity index 79% rename from backend/danswer/connectors/cross_connector_utils/retry_wrapper.py rename to backend/danswer/utils/retry_wrapper.py index 7312d1349f7..2d6d79ca5eb 100644 --- a/backend/danswer/connectors/cross_connector_utils/retry_wrapper.py +++ b/backend/danswer/utils/retry_wrapper.py @@ -22,18 +22,18 @@ def retry_builder( jitter: tuple[float, float] | float = 1, ) -> Callable[[F], F]: """Builds a generic wrapper/decorator for calls to external APIs that - may fail due to rate limiting, flakes, or other reasons. Applies expontential + may fail due to rate limiting, flakes, or other reasons. Applies exponential backoff with jitter to retry the call.""" - @retry( - tries=tries, - delay=delay, - max_delay=max_delay, - backoff=backoff, - jitter=jitter, - logger=cast(Logger, logger), - ) def retry_with_default(func: F) -> F: + @retry( + tries=tries, + delay=delay, + max_delay=max_delay, + backoff=backoff, + jitter=jitter, + logger=cast(Logger, logger), + ) def wrapped_func(*args: list, **kwargs: dict[str, Any]) -> Any: return func(*args, **kwargs) diff --git a/backend/danswer/utils/sitemap.py b/backend/danswer/utils/sitemap.py index ababbec4575..551b2bb3bf0 100644 --- a/backend/danswer/utils/sitemap.py +++ b/backend/danswer/utils/sitemap.py @@ -1,39 +1,78 @@ -from datetime import datetime -from urllib import robotparser +import re +import xml.etree.ElementTree as ET +from typing import Set +from urllib.parse import urljoin -from usp.tree import sitemap_tree_for_homepage # type: ignore +import requests from danswer.utils.logger import setup_logger logger = setup_logger() -def test_url(rp: robotparser.RobotFileParser | None, url: str) -> bool: - if not rp: - return True - else: - return rp.can_fetch("*", url) +def _get_sitemap_locations_from_robots(base_url: str) -> Set[str]: + """Extract sitemap URLs from robots.txt""" + sitemap_urls: set = set() + try: + robots_url = urljoin(base_url, "/robots.txt") + resp = requests.get(robots_url, timeout=10) + if resp.status_code == 200: + for line in resp.text.splitlines(): + if line.lower().startswith("sitemap:"): + sitemap_url = line.split(":", 1)[1].strip() + sitemap_urls.add(sitemap_url) + except Exception as e: + logger.warning(f"Error fetching robots.txt: {e}") + return sitemap_urls + + +def _extract_urls_from_sitemap(sitemap_url: str) -> Set[str]: + """Extract URLs from a sitemap XML file""" + urls: set[str] = set() + try: + resp = requests.get(sitemap_url, timeout=10) + if resp.status_code != 200: + return urls + root = ET.fromstring(resp.content) -def init_robots_txt(site: str) -> robotparser.RobotFileParser: - ts = datetime.now().timestamp() - robots_url = f"{site}/robots.txt?ts={ts}" - rp = robotparser.RobotFileParser() - rp.set_url(robots_url) - rp.read() - return rp + # Handle both regular sitemaps and sitemap indexes + # Remove namespace for easier parsing + namespace = re.match(r"\{.*\}", root.tag) + ns = namespace.group(0) if namespace else "" + + if 
root.tag == f"{ns}sitemapindex": + # This is a sitemap index + for sitemap in root.findall(f".//{ns}loc"): + if sitemap.text: + sub_urls = _extract_urls_from_sitemap(sitemap.text) + urls.update(sub_urls) + else: + # This is a regular sitemap + for url in root.findall(f".//{ns}loc"): + if url.text: + urls.add(url.text) + + except Exception as e: + logger.warning(f"Error processing sitemap {sitemap_url}: {e}") + + return urls def list_pages_for_site(site: str) -> list[str]: - rp: robotparser.RobotFileParser | None = None - try: - rp = init_robots_txt(site) - except Exception: - logger.warning("Failed to load robots.txt") + """Get list of pages from a site's sitemaps""" + site = site.rstrip("/") + all_urls = set() - tree = sitemap_tree_for_homepage(site) + # Try both common sitemap locations + sitemap_paths = ["/sitemap.xml", "/sitemap_index.xml"] + for path in sitemap_paths: + sitemap_url = urljoin(site, path) + all_urls.update(_extract_urls_from_sitemap(sitemap_url)) - pages = [page.url for page in tree.all_pages() if test_url(rp, page.url)] - pages = list(dict.fromkeys(pages)) + # Check robots.txt for additional sitemaps + sitemap_locations = _get_sitemap_locations_from_robots(site) + for sitemap_url in sitemap_locations: + all_urls.update(_extract_urls_from_sitemap(sitemap_url)) - return pages + return list(all_urls) diff --git a/backend/danswer/utils/special_types.py b/backend/danswer/utils/special_types.py new file mode 100644 index 00000000000..ea9ccf21e5b --- /dev/null +++ b/backend/danswer/utils/special_types.py @@ -0,0 +1,7 @@ +from collections.abc import Mapping +from collections.abc import Sequence +from typing import TypeAlias + +JSON_ro: TypeAlias = ( + Mapping[str, "JSON_ro"] | Sequence["JSON_ro"] | str | int | float | bool | None +) diff --git a/backend/danswer/utils/telemetry.py b/backend/danswer/utils/telemetry.py index d8a021877e6..f5fb23ef86f 100644 --- a/backend/danswer/utils/telemetry.py +++ b/backend/danswer/utils/telemetry.py @@ -12,8 +12,8 @@ from danswer.configs.constants import KV_INSTANCE_DOMAIN_KEY from danswer.db.engine import get_sqlalchemy_engine from danswer.db.models import User -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError _DANSWER_TELEMETRY_ENDPOINT = "https://telemetry.danswer.ai/anonymous_telemetry" _CACHED_UUID: str | None = None @@ -34,11 +34,11 @@ def get_or_generate_uuid() -> str: if _CACHED_UUID is not None: return _CACHED_UUID - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() try: _CACHED_UUID = cast(str, kv_store.load(KV_CUSTOMER_UUID_KEY)) - except ConfigNotFoundError: + except KvKeyNotFoundError: _CACHED_UUID = str(uuid.uuid4()) kv_store.store(KV_CUSTOMER_UUID_KEY, _CACHED_UUID, encrypt=True) @@ -51,11 +51,11 @@ def _get_or_generate_instance_domain() -> str | None: if _CACHED_INSTANCE_DOMAIN is not None: return _CACHED_INSTANCE_DOMAIN - kv_store = get_dynamic_config_store() + kv_store = get_kv_store() try: _CACHED_INSTANCE_DOMAIN = cast(str, kv_store.load(KV_INSTANCE_DOMAIN_KEY)) - except ConfigNotFoundError: + except KvKeyNotFoundError: with Session(get_sqlalchemy_engine()) as db_session: first_user = db_session.query(User).first() if first_user: diff --git a/backend/danswer/utils/text_processing.py b/backend/danswer/utils/text_processing.py index 134859d4e74..d26b5f357fb 100644 --- 
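Editorial aside: the rewritten sitemap helper above drops the usp dependency and walks sitemaps directly (note it no longer applies robots.txt allow/deny rules, only its Sitemap: entries). A quick usage sketch with a placeholder domain:

from danswer.utils.sitemap import list_pages_for_site

# Gathers URLs from /sitemap.xml, /sitemap_index.xml and any sitemaps listed in
# robots.txt, recursing into sitemap indexes; duplicates collapse via the set.
pages = list_pages_for_site("https://docs.example.com")
print(f"Discovered {len(pages)} pages")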
a/backend/danswer/utils/text_processing.py +++ b/backend/danswer/utils/text_processing.py @@ -4,6 +4,10 @@ import string from urllib.parse import quote +from danswer.utils.logger import setup_logger + + +logger = setup_logger(__name__) ESCAPE_SEQUENCE_RE = re.compile( r""" @@ -77,7 +81,8 @@ def extract_embedded_json(s: str) -> dict: last_brace_index = s.rfind("}") if first_brace_index == -1 or last_brace_index == -1: - raise ValueError("No valid json found") + logger.warning("No valid json found, assuming answer is entire string") + return {"answer": s, "quotes": []} json_str = s[first_brace_index : last_brace_index + 1] try: @@ -121,6 +126,28 @@ def shared_precompare_cleanup(text: str) -> str: return text +_INITIAL_FILTER = re.compile( + "[" + "\U0000FFF0-\U0000FFFF" # Specials + "\U0001F000-\U0001F9FF" # Emoticons + "\U00002000-\U0000206F" # General Punctuation + "\U00002190-\U000021FF" # Arrows + "\U00002700-\U000027BF" # Dingbats + "]+", + flags=re.UNICODE, +) + + +def clean_text(text: str) -> str: + # Remove specific Unicode ranges that might cause issues + cleaned = _INITIAL_FILTER.sub("", text) + + # Remove any control characters except for newline and tab + cleaned = "".join(ch for ch in cleaned if ch >= " " or ch in "\n\t") + + return cleaned + + def is_valid_email(text: str) -> bool: """Can use a library instead if more detailed checks are needed""" regex = r"^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" diff --git a/backend/danswer/utils/variable_functionality.py b/backend/danswer/utils/variable_functionality.py index 55f296aa8e7..66cb4c3582a 100644 --- a/backend/danswer/utils/variable_functionality.py +++ b/backend/danswer/utils/variable_functionality.py @@ -1,5 +1,6 @@ import functools import importlib +import inspect from typing import Any from typing import TypeVar @@ -16,7 +17,7 @@ def __init__(self) -> None: def set_ee(self) -> None: self._is_ee = True - def get_is_ee_version(self) -> bool: + def is_ee_version(self) -> bool: return self._is_ee @@ -24,7 +25,7 @@ def get_is_ee_version(self) -> bool: def set_is_ee_based_on_env_variable() -> None: - if ENTERPRISE_EDITION_ENABLED and not global_version.get_is_ee_version(): + if ENTERPRISE_EDITION_ENABLED and not global_version.is_ee_version(): logger.notice("Enterprise Edition enabled") global_version.set_ee() @@ -54,7 +55,7 @@ def fetch_versioned_implementation(module: str, attribute: str) -> Any: implementation cannot be found or loaded. """ logger.debug("Fetching versioned implementation for %s.%s", module, attribute) - is_ee = global_version.get_is_ee_version() + is_ee = global_version.is_ee_version() module_full = f"ee.{module}" if is_ee else module try: @@ -119,3 +120,41 @@ def noop_fallback(*args: Any, **kwargs: Any) -> None: Returns: None """ + + +def fetch_ee_implementation_or_noop( + module: str, attribute: str, noop_return_value: Any = None +) -> Any: + """ + Fetches an EE implementation if EE is enabled, otherwise returns a no-op function. + Raises an exception if EE is enabled but the fetch fails. + + Args: + module (str): The name of the module from which to fetch the attribute. + attribute (str): The name of the attribute to fetch from the module. + + Returns: + Any: The fetched EE implementation if successful and EE is enabled, otherwise a no-op function. + + Raises: + Exception: If EE is enabled but the fetch fails. 
+ """ + if not global_version.is_ee_version(): + if inspect.iscoroutinefunction(noop_return_value): + + async def async_noop(*args: Any, **kwargs: Any) -> Any: + return await noop_return_value(*args, **kwargs) + + return async_noop + + else: + + def sync_noop(*args: Any, **kwargs: Any) -> Any: + return noop_return_value + + return sync_noop + try: + return fetch_versioned_implementation(module, attribute) + except Exception as e: + logger.error(f"Failed to fetch implementation for {module}.{attribute}: {e}") + raise diff --git a/backend/ee/danswer/access/access.py b/backend/ee/danswer/access/access.py index 2b3cdb7a9dc..094298677a5 100644 --- a/backend/ee/danswer/access/access.py +++ b/backend/ee/danswer/access/access.py @@ -5,8 +5,11 @@ ) from danswer.access.access import _get_acl_for_user as get_acl_for_user_without_groups from danswer.access.models import DocumentAccess +from danswer.access.utils import prefix_external_group from danswer.access.utils import prefix_user_group +from danswer.db.document import get_documents_by_ids from danswer.db.models import User +from ee.danswer.db.external_perm import fetch_external_groups_for_user from ee.danswer.db.user_group import fetch_user_groups_for_documents from ee.danswer.db.user_group import fetch_user_groups_for_user @@ -17,7 +20,13 @@ def _get_access_for_document( ) -> DocumentAccess: id_to_access = _get_access_for_documents([document_id], db_session) if len(id_to_access) == 0: - return DocumentAccess.build(user_ids=[], user_groups=[], is_public=False) + return DocumentAccess.build( + user_emails=[], + user_groups=[], + external_user_emails=[], + external_user_group_ids=[], + is_public=False, + ) return next(iter(id_to_access.values())) @@ -30,22 +39,48 @@ def _get_access_for_documents( document_ids=document_ids, db_session=db_session, ) - user_group_info = { + user_group_info: dict[str, list[str]] = { document_id: group_names for document_id, group_names in fetch_user_groups_for_documents( db_session=db_session, document_ids=document_ids, ) } + documents = get_documents_by_ids( + db_session=db_session, + document_ids=document_ids, + ) + doc_id_map = {doc.id: doc for doc in documents} - return { - document_id: DocumentAccess( - user_ids=non_ee_access.user_ids, - user_groups=user_group_info.get(document_id, []), # type: ignore - is_public=non_ee_access.is_public, + access_map = {} + for document_id, non_ee_access in non_ee_access_dict.items(): + document = doc_id_map[document_id] + + ext_u_emails = ( + set(document.external_user_emails) + if document.external_user_emails + else set() + ) + + ext_u_groups = ( + set(document.external_user_group_ids) + if document.external_user_group_ids + else set() ) - for document_id, non_ee_access in non_ee_access_dict.items() - } + + # If the document is determined to be "public" externally (through a SYNC connector) + # then it's given the same access level as if it were marked public within Danswer + is_public_anywhere = document.is_public or non_ee_access.is_public + + # To avoid collisions of group namings between connectors, they need to be prefixed + access_map[document_id] = DocumentAccess( + user_emails=non_ee_access.user_emails, + user_groups=set(user_group_info.get(document_id, [])), + is_public=is_public_anywhere, + external_user_emails=ext_u_emails, + external_user_group_ids=ext_u_groups, + ) + return access_map def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]: @@ -56,7 +91,20 @@ def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]: NOTE: is 
imported in danswer.access.access by `fetch_versioned_implementation` DO NOT REMOVE.""" - user_groups = fetch_user_groups_for_user(db_session, user.id) if user else [] - return set( - [prefix_user_group(user_group.name) for user_group in user_groups] - ).union(get_acl_for_user_without_groups(user, db_session)) + db_user_groups = fetch_user_groups_for_user(db_session, user.id) if user else [] + prefixed_user_groups = [ + prefix_user_group(db_user_group.name) for db_user_group in db_user_groups + ] + + db_external_groups = ( + fetch_external_groups_for_user(db_session, user.id) if user else [] + ) + prefixed_external_groups = [ + prefix_external_group(db_external_group.external_user_group_id) + for db_external_group in db_external_groups + ] + + user_acl = set(prefixed_user_groups + prefixed_external_groups) + user_acl.update(get_acl_for_user_without_groups(user, db_session)) + + return user_acl diff --git a/backend/ee/danswer/auth/api_key.py b/backend/ee/danswer/auth/api_key.py deleted file mode 100644 index d4f99d13891..00000000000 --- a/backend/ee/danswer/auth/api_key.py +++ /dev/null @@ -1,53 +0,0 @@ -import secrets -import uuid - -from fastapi import Request -from passlib.hash import sha256_crypt -from pydantic import BaseModel - -from danswer.auth.schemas import UserRole -from ee.danswer.configs.app_configs import API_KEY_HASH_ROUNDS - - -_API_KEY_HEADER_NAME = "Authorization" -_BEARER_PREFIX = "Bearer " -_API_KEY_PREFIX = "dn_" -_API_KEY_LEN = 192 - - -class ApiKeyDescriptor(BaseModel): - api_key_id: int - api_key_display: str - api_key: str | None = None # only present on initial creation - api_key_name: str | None = None - api_key_role: UserRole - - user_id: uuid.UUID - - -def generate_api_key() -> str: - return _API_KEY_PREFIX + secrets.token_urlsafe(_API_KEY_LEN) - - -def hash_api_key(api_key: str) -> str: - # NOTE: no salt is needed, as the API key is randomly generated - # and overlaps are impossible - return sha256_crypt.hash(api_key, salt="", rounds=API_KEY_HASH_ROUNDS) - - -def build_displayable_api_key(api_key: str) -> str: - if api_key.startswith(_API_KEY_PREFIX): - api_key = api_key[len(_API_KEY_PREFIX) :] - - return _API_KEY_PREFIX + api_key[:4] + "********" + api_key[-4:] - - -def get_hashed_api_key_from_request(request: Request) -> str | None: - raw_api_key_header = request.headers.get(_API_KEY_HEADER_NAME) - if raw_api_key_header is None: - return None - - if raw_api_key_header.startswith(_BEARER_PREFIX): - raw_api_key_header = raw_api_key_header[len(_BEARER_PREFIX) :].strip() - - return hash_api_key(raw_api_key_header) diff --git a/backend/ee/danswer/auth/users.py b/backend/ee/danswer/auth/users.py index 18dff6ab064..aab88efa8e4 100644 --- a/backend/ee/danswer/auth/users.py +++ b/backend/ee/danswer/auth/users.py @@ -1,15 +1,16 @@ from fastapi import Depends from fastapi import HTTPException from fastapi import Request -from sqlalchemy.orm import Session +from fastapi import status +from sqlalchemy.ext.asyncio import AsyncSession +from danswer.auth.users import current_admin_user from danswer.configs.app_configs import AUTH_TYPE +from danswer.configs.app_configs import SUPER_CLOUD_API_KEY +from danswer.configs.app_configs import SUPER_USERS from danswer.configs.constants import AuthType -from danswer.db.engine import get_session from danswer.db.models import User from danswer.utils.logger import setup_logger -from ee.danswer.auth.api_key import get_hashed_api_key_from_request -from ee.danswer.db.api_key import fetch_user_for_api_key from ee.danswer.db.saml import 
get_saml_account from ee.danswer.server.seeding import get_seed_config from ee.danswer.utils.secrets import extract_hashed_cookie @@ -25,41 +26,18 @@ def verify_auth_setting() -> None: async def optional_user_( request: Request, user: User | None, - db_session: Session, + async_db_session: AsyncSession, ) -> User | None: # Check if the user has a session cookie from SAML if AUTH_TYPE == AuthType.SAML: saved_cookie = extract_hashed_cookie(request) if saved_cookie: - saml_account = get_saml_account(cookie=saved_cookie, db_session=db_session) + saml_account = await get_saml_account( + cookie=saved_cookie, async_db_session=async_db_session + ) user = saml_account.user if saml_account else None - # check if an API key is present - if user is None: - hashed_api_key = get_hashed_api_key_from_request(request) - if hashed_api_key: - user = fetch_user_for_api_key(hashed_api_key, db_session) - - return user - - -def api_key_dep( - request: Request, db_session: Session = Depends(get_session) -) -> User | None: - if AUTH_TYPE == AuthType.DISABLED: - return None - - hashed_api_key = get_hashed_api_key_from_request(request) - if not hashed_api_key: - raise HTTPException(status_code=401, detail="Missing API key") - - if hashed_api_key: - user = fetch_user_for_api_key(hashed_api_key, db_session) - - if user is None: - raise HTTPException(status_code=401, detail="Invalid API key") - return user @@ -68,3 +46,19 @@ def get_default_admin_user_emails_() -> list[str]: if seed_config and seed_config.admin_user_emails: return seed_config.admin_user_emails return [] + + +async def current_cloud_superuser( + request: Request, + user: User | None = Depends(current_admin_user), +) -> User | None: + api_key = request.headers.get("Authorization", "").replace("Bearer ", "") + if api_key != SUPER_CLOUD_API_KEY: + raise HTTPException(status_code=401, detail="Invalid API key") + + if user and user.email not in SUPER_USERS: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Access denied. 
User must be a cloud superuser to perform this action.", + ) + return user diff --git a/backend/ee/danswer/background/celery/celery_app.py b/backend/ee/danswer/background/celery/apps/primary.py similarity index 58% rename from backend/ee/danswer/background/celery/celery_app.py rename to backend/ee/danswer/background/celery/apps/primary.py index 2b4c96ccb1e..21644228484 100644 --- a/backend/ee/danswer/background/celery/celery_app.py +++ b/backend/ee/danswer/background/celery/apps/primary.py @@ -1,76 +1,68 @@ -from datetime import timedelta - -from sqlalchemy.orm import Session - -from danswer.background.celery.celery_app import celery_app +from danswer.background.celery.apps.primary import celery_app from danswer.background.task_utils import build_celery_task_wrapper from danswer.configs.app_configs import JOB_TIMEOUT from danswer.db.chat import delete_chat_sessions_older_than -from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.engine import get_session_with_tenant from danswer.server.settings.store import load_settings from danswer.utils.logger import setup_logger -from danswer.utils.variable_functionality import global_version from ee.danswer.background.celery_utils import should_perform_chat_ttl_check from ee.danswer.background.task_name_builders import name_chat_ttl_task from ee.danswer.server.reporting.usage_export_generation import create_new_usage_report +from shared_configs.configs import MULTI_TENANT +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR logger = setup_logger() # mark as EE for all tasks in this file -global_version.set_ee() @build_celery_task_wrapper(name_chat_ttl_task) @celery_app.task(soft_time_limit=JOB_TIMEOUT) -def perform_ttl_management_task(retention_limit_days: int) -> None: - with Session(get_sqlalchemy_engine()) as db_session: +def perform_ttl_management_task( + retention_limit_days: int, *, tenant_id: str | None +) -> None: + with get_session_with_tenant(tenant_id) as db_session: delete_chat_sessions_older_than(retention_limit_days, db_session) ##### # Periodic Tasks ##### + + @celery_app.task( name="check_ttl_management_task", soft_time_limit=JOB_TIMEOUT, ) -def check_ttl_management_task() -> None: +def check_ttl_management_task(*, tenant_id: str | None) -> None: """Runs periodically to check if any ttl tasks should be run and adds them to the queue""" + token = None + if MULTI_TENANT and tenant_id is not None: + token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + settings = load_settings() retention_limit_days = settings.maximum_chat_retention_days - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: if should_perform_chat_ttl_check(retention_limit_days, db_session): perform_ttl_management_task.apply_async( - kwargs=dict(retention_limit_days=retention_limit_days), + kwargs=dict( + retention_limit_days=retention_limit_days, tenant_id=tenant_id + ), ) + if token is not None: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) @celery_app.task( name="autogenerate_usage_report_task", soft_time_limit=JOB_TIMEOUT, ) -def autogenerate_usage_report_task() -> None: +def autogenerate_usage_report_task(*, tenant_id: str | None) -> None: """This generates usage report under the /admin/generate-usage/report endpoint""" - with Session(get_sqlalchemy_engine()) as db_session: + with get_session_with_tenant(tenant_id) as db_session: create_new_usage_report( db_session=db_session, user_id=None, period=None, ) - - -##### -# Celery Beat (Periodic Tasks) Settings -##### 
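Editorial aside: check_ttl_management_task above shows the tenant-pinning pattern the EE Celery tasks now follow. A condensed, illustrative sketch; the task name and body are hypothetical, the set/reset token handling mirrors the patch:

from danswer.background.celery.apps.primary import celery_app
from danswer.db.engine import get_session_with_tenant
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR


@celery_app.task(name="example_tenant_scoped_task")
def example_tenant_scoped_task(*, tenant_id: str | None) -> None:
    # Pin the tenant for anything downstream that reads the contextvar,
    # then restore the previous value when the task finishes.
    token = None
    if MULTI_TENANT and tenant_id is not None:
        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
    try:
        with get_session_with_tenant(tenant_id) as db_session:
            ...  # tenant-schema-scoped work runs against this session
    finally:
        if token is not None:
            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)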
-celery_app.conf.beat_schedule = { - "autogenerate-usage-report": { - "task": "autogenerate_usage_report_task", - "schedule": timedelta(days=30), # TODO: change this to config flag - }, - "check-ttl-management": { - "task": "check_ttl_management_task", - "schedule": timedelta(hours=1), - }, - **(celery_app.conf.beat_schedule or {}), -} diff --git a/backend/ee/danswer/background/celery/tasks/beat_schedule.py b/backend/ee/danswer/background/celery/tasks/beat_schedule.py new file mode 100644 index 00000000000..86680e60c7f --- /dev/null +++ b/backend/ee/danswer/background/celery/tasks/beat_schedule.py @@ -0,0 +1,23 @@ +from datetime import timedelta +from typing import Any + +from danswer.background.celery.tasks.beat_schedule import ( + tasks_to_schedule as base_tasks_to_schedule, +) + +ee_tasks_to_schedule = [ + { + "name": "autogenerate_usage_report", + "task": "autogenerate_usage_report_task", + "schedule": timedelta(days=30), # TODO: change this to config flag + }, + { + "name": "check-ttl-management", + "task": "check_ttl_management_task", + "schedule": timedelta(hours=1), + }, +] + + +def get_tasks_to_schedule() -> list[dict[str, Any]]: + return ee_tasks_to_schedule + base_tasks_to_schedule diff --git a/backend/ee/danswer/background/celery/tasks/vespa/tasks.py b/backend/ee/danswer/background/celery/tasks/vespa/tasks.py new file mode 100644 index 00000000000..84d67f0a3ac --- /dev/null +++ b/backend/ee/danswer/background/celery/tasks/vespa/tasks.py @@ -0,0 +1,56 @@ +from typing import cast + +from redis import Redis +from sqlalchemy.orm import Session + +from danswer.background.celery.apps.app_base import task_logger +from danswer.redis.redis_usergroup import RedisUserGroup +from danswer.utils.logger import setup_logger +from ee.danswer.db.user_group import delete_user_group +from ee.danswer.db.user_group import fetch_user_group +from ee.danswer.db.user_group import mark_user_group_as_synced + +logger = setup_logger() + + +def monitor_usergroup_taskset( + tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session +) -> None: + """This function is likely to move in the worker refactor happening next.""" + fence_key = key_bytes.decode("utf-8") + usergroup_id_str = RedisUserGroup.get_id_from_fence_key(fence_key) + if not usergroup_id_str: + task_logger.warning(f"Could not parse usergroup id from {fence_key}") + return + + try: + usergroup_id = int(usergroup_id_str) + except ValueError: + task_logger.exception(f"usergroup_id ({usergroup_id_str}) is not an integer!") + raise + + rug = RedisUserGroup(tenant_id, usergroup_id) + if not rug.fenced: + return + + initial_count = rug.payload + if initial_count is None: + return + + count = cast(int, r.scard(rug.taskset_key)) + task_logger.info( + f"User group sync progress: usergroup_id={usergroup_id} remaining={count} initial={initial_count}" + ) + if count > 0: + return + + user_group = fetch_user_group(db_session=db_session, user_group_id=usergroup_id) + if user_group: + if user_group.is_up_for_deletion: + delete_user_group(db_session=db_session, user_group=user_group) + task_logger.info(f"Deleted usergroup. id='{usergroup_id}'") + else: + mark_user_group_as_synced(db_session=db_session, user_group=user_group) + task_logger.info(f"Synced usergroup. 
id='{usergroup_id}'") + + rug.reset() diff --git a/backend/ee/danswer/background/celery_utils.py b/backend/ee/danswer/background/celery_utils.py index 879487180af..f6fff26cf41 100644 --- a/backend/ee/danswer/background/celery_utils.py +++ b/backend/ee/danswer/background/celery_utils.py @@ -1,18 +1,9 @@ -from typing import cast - -from redis import Redis from sqlalchemy.orm import Session -from danswer.background.celery.celery_app import task_logger -from danswer.background.celery.celery_redis import RedisUserGroup -from danswer.db.engine import get_sqlalchemy_engine from danswer.db.tasks import check_task_is_live_and_not_timed_out from danswer.db.tasks import get_latest_task from danswer.utils.logger import setup_logger from ee.danswer.background.task_name_builders import name_chat_ttl_task -from ee.danswer.db.user_group import delete_user_group -from ee.danswer.db.user_group import fetch_user_group -from ee.danswer.db.user_group import mark_user_group_as_synced logger = setup_logger() @@ -29,47 +20,7 @@ def should_perform_chat_ttl_check( if not latest_task: return True - if latest_task and check_task_is_live_and_not_timed_out(latest_task, db_session): - logger.info("TTL check is already being performed. Skipping.") + if check_task_is_live_and_not_timed_out(latest_task, db_session): + logger.debug(f"{task_name} is already being performed. Skipping.") return False return True - - -def monitor_usergroup_taskset(key_bytes: bytes, r: Redis) -> None: - """This function is likely to move in the worker refactor happening next.""" - key = key_bytes.decode("utf-8") - usergroup_id = RedisUserGroup.get_id_from_fence_key(key) - if not usergroup_id: - task_logger.warning("Could not parse usergroup id from {key}") - return - - rug = RedisUserGroup(usergroup_id) - fence_value = r.get(rug.fence_key) - if fence_value is None: - return - - try: - initial_count = int(cast(int, fence_value)) - except ValueError: - task_logger.error("The value is not an integer.") - return - - count = cast(int, r.scard(rug.taskset_key)) - task_logger.info( - f"User group sync: usergroup_id={usergroup_id} remaining={count} initial={initial_count}" - ) - if count > 0: - return - - with Session(get_sqlalchemy_engine()) as db_session: - user_group = fetch_user_group(db_session=db_session, user_group_id=usergroup_id) - if user_group: - if user_group.is_up_for_deletion: - delete_user_group(db_session=db_session, user_group=user_group) - task_logger.info(f"Deleted usergroup. id='{usergroup_id}'") - else: - mark_user_group_as_synced(db_session=db_session, user_group=user_group) - task_logger.info(f"Synced usergroup. 
id='{usergroup_id}'") - - r.delete(rug.taskset_key) - r.delete(rug.fence_key) diff --git a/backend/ee/danswer/background/permission_sync.py b/backend/ee/danswer/background/permission_sync.py deleted file mode 100644 index c14094b6042..00000000000 --- a/backend/ee/danswer/background/permission_sync.py +++ /dev/null @@ -1,224 +0,0 @@ -import logging -import time -from datetime import datetime - -import dask -from dask.distributed import Client -from dask.distributed import Future -from distributed import LocalCluster -from sqlalchemy.orm import Session - -from danswer.background.indexing.dask_utils import ResourceLogger -from danswer.background.indexing.job_client import SimpleJob -from danswer.background.indexing.job_client import SimpleJobClient -from danswer.configs.app_configs import CLEANUP_INDEXING_JOBS_TIMEOUT -from danswer.configs.app_configs import DASK_JOB_CLIENT_ENABLED -from danswer.configs.constants import DocumentSource -from danswer.configs.constants import POSTGRES_PERMISSIONS_APP_NAME -from danswer.db.engine import get_sqlalchemy_engine -from danswer.db.engine import init_sqlalchemy_engine -from danswer.db.models import PermissionSyncStatus -from danswer.utils.logger import setup_logger -from ee.danswer.configs.app_configs import NUM_PERMISSION_WORKERS -from ee.danswer.connectors.factory import CONNECTOR_PERMISSION_FUNC_MAP -from ee.danswer.db.connector import fetch_sources_with_connectors -from ee.danswer.db.connector_credential_pair import get_cc_pairs_by_source -from ee.danswer.db.permission_sync import create_perm_sync -from ee.danswer.db.permission_sync import expire_perm_sync_timed_out -from ee.danswer.db.permission_sync import get_perm_sync_attempt -from ee.danswer.db.permission_sync import mark_all_inprogress_permission_sync_failed -from shared_configs.configs import LOG_LEVEL - -logger = setup_logger() - -# If the indexing dies, it's most likely due to resource constraints, -# restarting just delays the eventual failure, not useful to the user -dask.config.set({"distributed.scheduler.allowed-failures": 0}) - - -def cleanup_perm_sync_jobs( - existing_jobs: dict[tuple[int, int | DocumentSource], Future | SimpleJob], - # Just reusing the same timeout, fine for now - timeout_hours: int = CLEANUP_INDEXING_JOBS_TIMEOUT, -) -> dict[tuple[int, int | DocumentSource], Future | SimpleJob]: - existing_jobs_copy = existing_jobs.copy() - - with Session(get_sqlalchemy_engine()) as db_session: - # clean up completed jobs - for (attempt_id, details), job in existing_jobs.items(): - perm_sync_attempt = get_perm_sync_attempt( - attempt_id=attempt_id, db_session=db_session - ) - - # do nothing for ongoing jobs that haven't been stopped - if ( - not job.done() - and perm_sync_attempt.status == PermissionSyncStatus.IN_PROGRESS - ): - continue - - if job.status == "error": - logger.error(job.exception()) - - job.release() - del existing_jobs_copy[(attempt_id, details)] - - # clean up in-progress jobs that were never completed - expire_perm_sync_timed_out( - timeout_hours=timeout_hours, - db_session=db_session, - ) - - return existing_jobs_copy - - -def create_group_sync_jobs( - existing_jobs: dict[tuple[int, int | DocumentSource], Future | SimpleJob], - client: Client | SimpleJobClient, -) -> dict[tuple[int, int | DocumentSource], Future | SimpleJob]: - """Creates new relational DB group permission sync job for each source that: - - has permission sync enabled - - has at least 1 connector (enabled or paused) - - has no sync already running - """ - existing_jobs_copy = existing_jobs.copy() 
- sources_w_runs = [ - key[1] - for key in existing_jobs_copy.keys() - if isinstance(key[1], DocumentSource) - ] - with Session(get_sqlalchemy_engine()) as db_session: - sources_w_connector = fetch_sources_with_connectors(db_session) - for source_type in sources_w_connector: - if source_type not in CONNECTOR_PERMISSION_FUNC_MAP: - continue - if source_type in sources_w_runs: - continue - - db_group_fnc, _ = CONNECTOR_PERMISSION_FUNC_MAP[source_type] - perm_sync = create_perm_sync( - source_type=source_type, - group_update=True, - cc_pair_id=None, - db_session=db_session, - ) - - run = client.submit(db_group_fnc, pure=False) - - logger.info( - f"Kicked off group permission sync for source type {source_type}" - ) - - if run: - existing_jobs_copy[(perm_sync.id, source_type)] = run - - return existing_jobs_copy - - -def create_connector_perm_sync_jobs( - existing_jobs: dict[tuple[int, int | DocumentSource], Future | SimpleJob], - client: Client | SimpleJobClient, -) -> dict[tuple[int, int | DocumentSource], Future | SimpleJob]: - """Update Document Index ACL sync job for each cc-pair where: - - source type has permission sync enabled - - has no sync already running - """ - existing_jobs_copy = existing_jobs.copy() - cc_pairs_w_runs = [ - key[1] - for key in existing_jobs_copy.keys() - if isinstance(key[1], DocumentSource) - ] - with Session(get_sqlalchemy_engine()) as db_session: - sources_w_connector = fetch_sources_with_connectors(db_session) - for source_type in sources_w_connector: - if source_type not in CONNECTOR_PERMISSION_FUNC_MAP: - continue - - _, index_sync_fnc = CONNECTOR_PERMISSION_FUNC_MAP[source_type] - - cc_pairs = get_cc_pairs_by_source(source_type, db_session) - - for cc_pair in cc_pairs: - if cc_pair.id in cc_pairs_w_runs: - continue - - perm_sync = create_perm_sync( - source_type=source_type, - group_update=False, - cc_pair_id=cc_pair.id, - db_session=db_session, - ) - - run = client.submit(index_sync_fnc, cc_pair.id, pure=False) - - logger.info(f"Kicked off ACL sync for cc-pair {cc_pair.id}") - - if run: - existing_jobs_copy[(perm_sync.id, cc_pair.id)] = run - - return existing_jobs_copy - - -def permission_loop(delay: int = 60, num_workers: int = NUM_PERMISSION_WORKERS) -> None: - client: Client | SimpleJobClient - if DASK_JOB_CLIENT_ENABLED: - cluster_primary = LocalCluster( - n_workers=num_workers, - threads_per_worker=1, - # there are warning about high memory usage + "Event loop unresponsive" - # which are not relevant to us since our workers are expected to use a - # lot of memory + involve CPU intensive tasks that will not relinquish - # the event loop - silence_logs=logging.ERROR, - ) - client = Client(cluster_primary) - if LOG_LEVEL.lower() == "debug": - client.register_worker_plugin(ResourceLogger()) - else: - client = SimpleJobClient(n_workers=num_workers) - - existing_jobs: dict[tuple[int, int | DocumentSource], Future | SimpleJob] = {} - engine = get_sqlalchemy_engine() - - with Session(engine) as db_session: - # Any jobs still in progress on restart must have died - mark_all_inprogress_permission_sync_failed(db_session) - - while True: - start = time.time() - start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S") - logger.info(f"Running Permission Sync, current UTC time: {start_time_utc}") - - if existing_jobs: - logger.debug( - "Found existing permission sync jobs: " - f"{[(attempt_id, job.status) for attempt_id, job in existing_jobs.items()]}" - ) - - try: - # TODO turn this on when it works - """ - existing_jobs = 
cleanup_perm_sync_jobs(existing_jobs=existing_jobs) - existing_jobs = create_group_sync_jobs( - existing_jobs=existing_jobs, client=client - ) - existing_jobs = create_connector_perm_sync_jobs( - existing_jobs=existing_jobs, client=client - ) - """ - except Exception as e: - logger.exception(f"Failed to run update due to {e}") - sleep_time = delay - (time.time() - start) - if sleep_time > 0: - time.sleep(sleep_time) - - -def update__main() -> None: - logger.notice("Starting Permission Syncing Loop") - init_sqlalchemy_engine(POSTGRES_PERMISSIONS_APP_NAME) - permission_loop() - - -if __name__ == "__main__": - update__main() diff --git a/backend/ee/danswer/background/task_name_builders.py b/backend/ee/danswer/background/task_name_builders.py index 4f1046adbbb..c218cdd3b59 100644 --- a/backend/ee/danswer/background/task_name_builders.py +++ b/backend/ee/danswer/background/task_name_builders.py @@ -1,6 +1,2 @@ -def name_user_group_sync_task(user_group_id: int) -> str: - return f"user_group_sync_task__{user_group_id}" - - -def name_chat_ttl_task(retention_limit_days: int) -> str: +def name_chat_ttl_task(retention_limit_days: int, tenant_id: str | None = None) -> str: return f"chat_ttl_{retention_limit_days}_days" diff --git a/backend/ee/danswer/configs/app_configs.py b/backend/ee/danswer/configs/app_configs.py index 1430a499136..7e1ade5f3a2 100644 --- a/backend/ee/danswer/configs/app_configs.py +++ b/backend/ee/danswer/configs/app_configs.py @@ -7,17 +7,15 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/danswer/configs/saml_config" -##### -# API Key Configs -##### -# refers to the rounds described here: https://passlib.readthedocs.io/en/stable/lib/passlib.hash.sha256_crypt.html -_API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS") -API_KEY_HASH_ROUNDS = ( - int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None -) - - ##### # Auto Permission Sync ##### NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2) + + +STRIPE_SECRET_KEY = os.environ.get("STRIPE_SECRET_KEY") +STRIPE_PRICE_ID = os.environ.get("STRIPE_PRICE") + +OPENAI_DEFAULT_API_KEY = os.environ.get("OPENAI_DEFAULT_API_KEY") +ANTHROPIC_DEFAULT_API_KEY = os.environ.get("ANTHROPIC_DEFAULT_API_KEY") +COHERE_DEFAULT_API_KEY = os.environ.get("COHERE_DEFAULT_API_KEY") diff --git a/backend/ee/danswer/connectors/confluence/perm_sync.py b/backend/ee/danswer/connectors/confluence/perm_sync.py deleted file mode 100644 index 2985b47b0d1..00000000000 --- a/backend/ee/danswer/connectors/confluence/perm_sync.py +++ /dev/null @@ -1,12 +0,0 @@ -from danswer.utils.logger import setup_logger - - -logger = setup_logger() - - -def confluence_update_db_group() -> None: - logger.debug("Not yet implemented group sync for confluence, no-op") - - -def confluence_update_index_acl(cc_pair_id: int) -> None: - logger.debug("Not yet implemented ACL sync for confluence, no-op") diff --git a/backend/ee/danswer/connectors/factory.py b/backend/ee/danswer/connectors/factory.py deleted file mode 100644 index 52f9324948b..00000000000 --- a/backend/ee/danswer/connectors/factory.py +++ /dev/null @@ -1,8 +0,0 @@ -from danswer.configs.constants import DocumentSource -from ee.danswer.connectors.confluence.perm_sync import confluence_update_db_group -from ee.danswer.connectors.confluence.perm_sync import confluence_update_index_acl - - -CONNECTOR_PERMISSION_FUNC_MAP = { - DocumentSource.CONFLUENCE: (confluence_update_db_group, confluence_update_index_acl) -} diff --git 
a/backend/ee/danswer/danswerbot/slack/handlers/handle_standard_answers.py b/backend/ee/danswer/danswerbot/slack/handlers/handle_standard_answers.py index 6807e77135a..e0995acc334 100644 --- a/backend/ee/danswer/danswerbot/slack/handlers/handle_standard_answers.py +++ b/backend/ee/danswer/danswerbot/slack/handlers/handle_standard_answers.py @@ -19,7 +19,7 @@ from danswer.db.chat import get_chat_sessions_by_slack_thread_id from danswer.db.chat import get_or_create_root_message from danswer.db.models import Prompt -from danswer.db.models import SlackBotConfig +from danswer.db.models import SlackChannelConfig from danswer.db.models import StandardAnswer as StandardAnswerModel from danswer.utils.logger import DanswerLoggingAdapter from danswer.utils.logger import setup_logger @@ -80,7 +80,7 @@ def oneoff_standard_answers( def _handle_standard_answers( message_info: SlackMessageInfo, receiver_ids: list[str] | None, - slack_bot_config: SlackBotConfig | None, + slack_channel_config: SlackChannelConfig | None, prompt: Prompt | None, logger: DanswerLoggingAdapter, client: WebClient, @@ -95,12 +95,12 @@ def _handle_standard_answers( we still need to respond to the users. """ # if no channel config, then no standard answers are configured - if not slack_bot_config: + if not slack_channel_config: return False slack_thread_id = message_info.thread_to_respond configured_standard_answer_categories = ( - slack_bot_config.standard_answer_categories if slack_bot_config else [] + slack_channel_config.standard_answer_categories if slack_channel_config else [] ) configured_standard_answers = set( [ @@ -150,7 +150,9 @@ def _handle_standard_answers( db_session=db_session, description="", user_id=None, - persona_id=slack_bot_config.persona.id if slack_bot_config.persona else 0, + persona_id=slack_channel_config.persona.id + if slack_channel_config.persona + else 0, danswerbot_flow=True, slack_thread_id=slack_thread_id, one_shot=True, diff --git a/backend/ee/danswer/db/connector_credential_pair.py b/backend/ee/danswer/db/connector_credential_pair.py index a2172913476..bb91c0de74f 100644 --- a/backend/ee/danswer/db/connector_credential_pair.py +++ b/backend/ee/danswer/db/connector_credential_pair.py @@ -3,6 +3,7 @@ from danswer.configs.constants import DocumentSource from danswer.db.connector_credential_pair import get_connector_credential_pair +from danswer.db.enums import AccessType from danswer.db.models import Connector from danswer.db.models import ConnectorCredentialPair from danswer.db.models import UserGroup__ConnectorCredentialPair @@ -32,14 +33,30 @@ def _delete_connector_credential_pair_user_groups_relationship__no_commit( def get_cc_pairs_by_source( - source_type: DocumentSource, db_session: Session, + source_type: DocumentSource, + only_sync: bool, ) -> list[ConnectorCredentialPair]: - cc_pairs = ( + query = ( db_session.query(ConnectorCredentialPair) .join(ConnectorCredentialPair.connector) .filter(Connector.source == source_type) - .all() ) + if only_sync: + query = query.filter(ConnectorCredentialPair.access_type == AccessType.SYNC) + + cc_pairs = query.all() return cc_pairs + + +def get_all_auto_sync_cc_pairs( + db_session: Session, +) -> list[ConnectorCredentialPair]: + return ( + db_session.query(ConnectorCredentialPair) + .where( + ConnectorCredentialPair.access_type == AccessType.SYNC, + ) + .all() + ) diff --git a/backend/ee/danswer/db/document.py b/backend/ee/danswer/db/document.py index 5a368ea170e..e061db6c75b 100644 --- a/backend/ee/danswer/db/document.py +++ 
b/backend/ee/danswer/db/document.py @@ -1,14 +1,100 @@ -from collections.abc import Sequence +from datetime import datetime +from datetime import timezone from sqlalchemy import select from sqlalchemy.orm import Session -from danswer.db.models import Document +from danswer.access.models import ExternalAccess +from danswer.access.utils import prefix_group_w_source +from danswer.configs.constants import DocumentSource +from danswer.db.models import Document as DbDocument -def fetch_documents_from_ids( - db_session: Session, document_ids: list[str] -) -> Sequence[Document]: - return db_session.scalars( - select(Document).where(Document.id.in_(document_ids)) - ).all() +def upsert_document_external_perms__no_commit( + db_session: Session, + doc_id: str, + external_access: ExternalAccess, + source_type: DocumentSource, +) -> None: + """ + This sets the permissions for a document in postgres. + NOTE: this will replace any existing external access, it will not do a union + """ + document = db_session.scalars( + select(DbDocument).where(DbDocument.id == doc_id) + ).first() + + prefixed_external_groups = [ + prefix_group_w_source( + ext_group_name=group_id, + source=source_type, + ) + for group_id in external_access.external_user_group_ids + ] + + if not document: + # If the document does not exist, still store the external access + # So that if the document is added later, the external access is already stored + document = DbDocument( + id=doc_id, + semantic_id="", + external_user_emails=external_access.external_user_emails, + external_user_group_ids=prefixed_external_groups, + is_public=external_access.is_public, + ) + db_session.add(document) + return + + document.external_user_emails = list(external_access.external_user_emails) + document.external_user_group_ids = prefixed_external_groups + document.is_public = external_access.is_public + + +def upsert_document_external_perms( + db_session: Session, + doc_id: str, + external_access: ExternalAccess, + source_type: DocumentSource, +) -> None: + """ + This sets the permissions for a document in postgres. 
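[Editor's note] A hedged usage sketch of this helper, not part of the diff: the doc id, email, and group values are invented, and the surrounding session handling is assumed; per the __no_commit suffix the caller commits.

from danswer.access.models import ExternalAccess
from danswer.configs.constants import DocumentSource
from ee.danswer.db.document import upsert_document_external_perms__no_commit

def set_doc_perms_example(db_session) -> None:
    # Invented access values for illustration only
    access = ExternalAccess(
        external_user_emails={"alice@example.com"},
        external_user_group_ids={"engineering"},
        is_public=False,
    )
    upsert_document_external_perms__no_commit(
        db_session=db_session,
        doc_id="some-confluence-page-id",  # hypothetical document id
        external_access=access,
        source_type=DocumentSource.CONFLUENCE,
    )
    db_session.commit()  # this variant leaves the commit to the caller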
+ NOTE: this will replace any existing external access, it will not do a union + """ + document = db_session.scalars( + select(DbDocument).where(DbDocument.id == doc_id) + ).first() + + prefixed_external_groups: set[str] = { + prefix_group_w_source( + ext_group_name=group_id, + source=source_type, + ) + for group_id in external_access.external_user_group_ids + } + + if not document: + # If the document does not exist, still store the external access + # So that if the document is added later, the external access is already stored + # The upsert function in the indexing pipeline does not overwrite the permissions fields + document = DbDocument( + id=doc_id, + semantic_id="", + external_user_emails=external_access.external_user_emails, + external_user_group_ids=prefixed_external_groups, + is_public=external_access.is_public, + ) + db_session.add(document) + db_session.commit() + return + + # If the document exists, we need to check if the external access has changed + if ( + external_access.external_user_emails != set(document.external_user_emails or []) + or prefixed_external_groups != set(document.external_user_group_ids or []) + or external_access.is_public != document.is_public + ): + document.external_user_emails = list(external_access.external_user_emails) + document.external_user_group_ids = list(prefixed_external_groups) + document.is_public = external_access.is_public + document.last_modified = datetime.now(timezone.utc) + db_session.commit() diff --git a/backend/ee/danswer/db/external_perm.py b/backend/ee/danswer/db/external_perm.py new file mode 100644 index 00000000000..5411d3c8d34 --- /dev/null +++ b/backend/ee/danswer/db/external_perm.py @@ -0,0 +1,99 @@ +from collections.abc import Sequence +from uuid import UUID + +from pydantic import BaseModel +from sqlalchemy import delete +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.access.utils import prefix_group_w_source +from danswer.configs.constants import DocumentSource +from danswer.db.models import User__ExternalUserGroupId +from danswer.db.users import batch_add_ext_perm_user_if_not_exists + + +class ExternalUserGroup(BaseModel): + id: str + user_emails: list[str] + + +def delete_user__ext_group_for_user__no_commit( + db_session: Session, + user_id: UUID, +) -> None: + db_session.execute( + delete(User__ExternalUserGroupId).where( + User__ExternalUserGroupId.user_id == user_id + ) + ) + + +def delete_user__ext_group_for_cc_pair__no_commit( + db_session: Session, + cc_pair_id: int, +) -> None: + db_session.execute( + delete(User__ExternalUserGroupId).where( + User__ExternalUserGroupId.cc_pair_id == cc_pair_id + ) + ) + + +def replace_user__ext_group_for_cc_pair( + db_session: Session, + cc_pair_id: int, + group_defs: list[ExternalUserGroup], + source: DocumentSource, +) -> None: + """ + This function clears all existing external user group relations for a given cc_pair_id + and replaces them with the new group definitions and commits the changes. 
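[Editor's note] A minimal sketch of the intended call pattern for the replace helper below, not part of the diff: the group ids and emails are invented; in practice the group definitions come from a connector's group sync.

from danswer.configs.constants import DocumentSource
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair

def sync_confluence_groups_example(db_session, cc_pair_id: int) -> None:
    group_defs = [
        ExternalUserGroup(id="confluence-admins", user_emails=["alice@example.com"]),
        ExternalUserGroup(id="confluence-users", user_emails=["alice@example.com", "bob@example.com"]),
    ]
    # Clears the cc-pair's existing external group relations, writes the new ones, and commits
    replace_user__ext_group_for_cc_pair(
        db_session=db_session,
        cc_pair_id=cc_pair_id,
        group_defs=group_defs,
        source=DocumentSource.CONFLUENCE,
    )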
+ """ + + # collect all emails from all groups to batch add all users at once for efficiency + all_group_member_emails = set() + for external_group in group_defs: + for user_email in external_group.user_emails: + all_group_member_emails.add(user_email) + + # batch add users if they don't exist and get their ids + all_group_members = batch_add_ext_perm_user_if_not_exists( + db_session=db_session, emails=list(all_group_member_emails) + ) + + delete_user__ext_group_for_cc_pair__no_commit( + db_session=db_session, + cc_pair_id=cc_pair_id, + ) + + # map emails to ids + email_id_map = {user.email: user.id for user in all_group_members} + + # use these ids to create new external user group relations relating group_id to user_ids + new_external_permissions = [] + for external_group in group_defs: + for user_email in external_group.user_emails: + user_id = email_id_map[user_email] + new_external_permissions.append( + User__ExternalUserGroupId( + user_id=user_id, + external_user_group_id=prefix_group_w_source( + external_group.id, source + ), + cc_pair_id=cc_pair_id, + ) + ) + + db_session.add_all(new_external_permissions) + db_session.commit() + + +def fetch_external_groups_for_user( + db_session: Session, + user_id: UUID, +) -> Sequence[User__ExternalUserGroupId]: + return db_session.scalars( + select(User__ExternalUserGroupId).where( + User__ExternalUserGroupId.user_id == user_id + ) + ).all() diff --git a/backend/ee/danswer/db/permission_sync.py b/backend/ee/danswer/db/permission_sync.py deleted file mode 100644 index 7642bb65321..00000000000 --- a/backend/ee/danswer/db/permission_sync.py +++ /dev/null @@ -1,72 +0,0 @@ -from datetime import timedelta - -from sqlalchemy import func -from sqlalchemy import select -from sqlalchemy import update -from sqlalchemy.exc import NoResultFound -from sqlalchemy.orm import Session - -from danswer.configs.constants import DocumentSource -from danswer.db.models import PermissionSyncRun -from danswer.db.models import PermissionSyncStatus -from danswer.utils.logger import setup_logger - -logger = setup_logger() - - -def mark_all_inprogress_permission_sync_failed( - db_session: Session, -) -> None: - stmt = ( - update(PermissionSyncRun) - .where(PermissionSyncRun.status == PermissionSyncStatus.IN_PROGRESS) - .values(status=PermissionSyncStatus.FAILED) - ) - db_session.execute(stmt) - db_session.commit() - - -def get_perm_sync_attempt(attempt_id: int, db_session: Session) -> PermissionSyncRun: - stmt = select(PermissionSyncRun).where(PermissionSyncRun.id == attempt_id) - try: - return db_session.scalars(stmt).one() - except NoResultFound: - raise ValueError(f"No PermissionSyncRun found with id {attempt_id}") - - -def expire_perm_sync_timed_out( - timeout_hours: int, - db_session: Session, -) -> None: - cutoff_time = func.now() - timedelta(hours=timeout_hours) - - update_stmt = ( - update(PermissionSyncRun) - .where( - PermissionSyncRun.status == PermissionSyncStatus.IN_PROGRESS, - PermissionSyncRun.updated_at < cutoff_time, - ) - .values(status=PermissionSyncStatus.FAILED, error_msg="timed out") - ) - - db_session.execute(update_stmt) - db_session.commit() - - -def create_perm_sync( - source_type: DocumentSource, - group_update: bool, - cc_pair_id: int | None, - db_session: Session, -) -> PermissionSyncRun: - new_run = PermissionSyncRun( - source_type=source_type, - status=PermissionSyncStatus.IN_PROGRESS, - group_update=group_update, - cc_pair_id=cc_pair_id, - ) - - db_session.add(new_run) - db_session.commit() - - return new_run diff --git 
a/backend/ee/danswer/db/query_history.py b/backend/ee/danswer/db/query_history.py index 868afef23ce..8fb77f0a2dd 100644 --- a/backend/ee/danswer/db/query_history.py +++ b/backend/ee/danswer/db/query_history.py @@ -8,6 +8,7 @@ from sqlalchemy.orm import contains_eager from sqlalchemy.orm import joinedload from sqlalchemy.orm import Session +from sqlalchemy.sql.expression import UnaryExpression from danswer.db.models import ChatMessage from danswer.db.models import ChatSession @@ -20,21 +21,22 @@ def fetch_chat_sessions_eagerly_by_time( end: datetime.datetime, db_session: Session, limit: int | None = 500, - initial_id: int | None = None, + initial_time: datetime.datetime | None = None, ) -> list[ChatSession]: - id_order = desc(ChatSession.id) # type: ignore - time_order = desc(ChatSession.time_created) # type: ignore - message_order = asc(ChatMessage.id) # type: ignore + time_order: UnaryExpression = desc(ChatSession.time_created) + message_order: UnaryExpression = asc(ChatMessage.id) filters: list[ColumnElement | BinaryExpression] = [ ChatSession.time_created.between(start, end) ] - if initial_id: - filters.append(ChatSession.id < initial_id) + + if initial_time: + filters.append(ChatSession.time_created > initial_time) + subquery = ( db_session.query(ChatSession.id, ChatSession.time_created) .filter(*filters) - .order_by(id_order, time_order) + .order_by(ChatSession.id, time_order) .distinct(ChatSession.id) .limit(limit) .subquery() @@ -42,7 +44,7 @@ def fetch_chat_sessions_eagerly_by_time( query = ( db_session.query(ChatSession) - .join(subquery, ChatSession.id == subquery.c.id) # type: ignore + .join(subquery, ChatSession.id == subquery.c.id) .outerjoin(ChatMessage, ChatSession.id == ChatMessage.chat_session_id) .options( joinedload(ChatSession.user), diff --git a/backend/ee/danswer/db/saml.py b/backend/ee/danswer/db/saml.py index 6689a7a7e14..a7f0f7e1661 100644 --- a/backend/ee/danswer/db/saml.py +++ b/backend/ee/danswer/db/saml.py @@ -5,11 +5,12 @@ from sqlalchemy import and_ from sqlalchemy import func from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload from sqlalchemy.orm import Session from danswer.configs.app_configs import SESSION_EXPIRE_TIME_SECONDS from danswer.db.models import SamlAccount -from danswer.db.models import User def upsert_saml_account( @@ -44,10 +45,14 @@ def upsert_saml_account( return saml_acc.expires_at -def get_saml_account(cookie: str, db_session: Session) -> SamlAccount | None: +async def get_saml_account( + cookie: str, async_db_session: AsyncSession +) -> SamlAccount | None: + """NOTE: this is async, since it's used during auth + (which is necessarily async due to FastAPI Users)""" stmt = ( select(SamlAccount) - .join(User, User.id == SamlAccount.user_id) # type: ignore + .options(selectinload(SamlAccount.user)) # Use selectinload for collections .where( and_( SamlAccount.encrypted_cookie == cookie, @@ -56,10 +61,12 @@ def get_saml_account(cookie: str, db_session: Session) -> SamlAccount | None: ) ) - result = db_session.execute(stmt) - return result.scalar_one_or_none() + result = await async_db_session.execute(stmt) + return result.scalars().unique().one_or_none() -def expire_saml_account(saml_account: SamlAccount, db_session: Session) -> None: +async def expire_saml_account( + saml_account: SamlAccount, async_db_session: AsyncSession +) -> None: saml_account.expires_at = func.now() - db_session.commit() + await async_db_session.commit() diff --git a/backend/ee/danswer/db/token_limit.py 
b/backend/ee/danswer/db/token_limit.py index 95dd0011853..46f5e2d5e73 100644 --- a/backend/ee/danswer/db/token_limit.py +++ b/backend/ee/danswer/db/token_limit.py @@ -65,64 +65,6 @@ def _add_user_filters( return stmt.where(where_clause) -def fetch_all_user_token_rate_limits( - db_session: Session, - enabled_only: bool = False, - ordered: bool = True, -) -> Sequence[TokenRateLimit]: - query = select(TokenRateLimit).where( - TokenRateLimit.scope == TokenRateLimitScope.USER - ) - - if enabled_only: - query = query.where(TokenRateLimit.enabled.is_(True)) - - if ordered: - query = query.order_by(TokenRateLimit.created_at.desc()) - - return db_session.scalars(query).all() - - -def fetch_all_global_token_rate_limits( - db_session: Session, - enabled_only: bool = False, - ordered: bool = True, -) -> Sequence[TokenRateLimit]: - query = select(TokenRateLimit).where( - TokenRateLimit.scope == TokenRateLimitScope.GLOBAL - ) - - if enabled_only: - query = query.where(TokenRateLimit.enabled.is_(True)) - - if ordered: - query = query.order_by(TokenRateLimit.created_at.desc()) - - token_rate_limits = db_session.scalars(query).all() - return token_rate_limits - - -def fetch_user_group_token_rate_limits( - db_session: Session, - group_id: int, - user: User | None = None, - enabled_only: bool = False, - ordered: bool = True, - get_editable: bool = True, -) -> Sequence[TokenRateLimit]: - stmt = select(TokenRateLimit) - stmt = stmt.where(User__UserGroup.user_group_id == group_id) - stmt = _add_user_filters(stmt, user, get_editable) - - if enabled_only: - stmt = stmt.where(TokenRateLimit.enabled.is_(True)) - - if ordered: - stmt = stmt.order_by(TokenRateLimit.created_at.desc()) - - return db_session.scalars(stmt).all() - - def fetch_all_user_group_token_rate_limits_by_group( db_session: Session, ) -> Sequence[Row[tuple[TokenRateLimit, str]]]: @@ -138,38 +80,6 @@ def fetch_all_user_group_token_rate_limits_by_group( return db_session.execute(query).all() -def insert_user_token_rate_limit( - db_session: Session, - token_rate_limit_settings: TokenRateLimitArgs, -) -> TokenRateLimit: - token_limit = TokenRateLimit( - enabled=token_rate_limit_settings.enabled, - token_budget=token_rate_limit_settings.token_budget, - period_hours=token_rate_limit_settings.period_hours, - scope=TokenRateLimitScope.USER, - ) - db_session.add(token_limit) - db_session.commit() - - return token_limit - - -def insert_global_token_rate_limit( - db_session: Session, - token_rate_limit_settings: TokenRateLimitArgs, -) -> TokenRateLimit: - token_limit = TokenRateLimit( - enabled=token_rate_limit_settings.enabled, - token_budget=token_rate_limit_settings.token_budget, - period_hours=token_rate_limit_settings.period_hours, - scope=TokenRateLimitScope.GLOBAL, - ) - db_session.add(token_limit) - db_session.commit() - - return token_limit - - def insert_user_group_token_rate_limit( db_session: Session, token_rate_limit_settings: TokenRateLimitArgs, @@ -193,34 +103,22 @@ def insert_user_group_token_rate_limit( return token_limit -def update_token_rate_limit( +def fetch_user_group_token_rate_limits( db_session: Session, - token_rate_limit_id: int, - token_rate_limit_settings: TokenRateLimitArgs, -) -> TokenRateLimit: - token_limit = db_session.get(TokenRateLimit, token_rate_limit_id) - if token_limit is None: - raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found") - - token_limit.enabled = token_rate_limit_settings.enabled - token_limit.token_budget = token_rate_limit_settings.token_budget - token_limit.period_hours = 
token_rate_limit_settings.period_hours - db_session.commit() - - return token_limit - + group_id: int, + user: User | None = None, + enabled_only: bool = False, + ordered: bool = True, + get_editable: bool = True, +) -> Sequence[TokenRateLimit]: + stmt = select(TokenRateLimit) + stmt = stmt.where(User__UserGroup.user_group_id == group_id) + stmt = _add_user_filters(stmt, user, get_editable) -def delete_token_rate_limit( - db_session: Session, - token_rate_limit_id: int, -) -> None: - token_limit = db_session.get(TokenRateLimit, token_rate_limit_id) - if token_limit is None: - raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found") + if enabled_only: + stmt = stmt.where(TokenRateLimit.enabled.is_(True)) - db_session.query(TokenRateLimit__UserGroup).filter( - TokenRateLimit__UserGroup.rate_limit_id == token_rate_limit_id - ).delete() + if ordered: + stmt = stmt.order_by(TokenRateLimit.created_at.desc()) - db_session.delete(token_limit) - db_session.commit() + return db_session.scalars(stmt).all() diff --git a/backend/ee/danswer/db/usage_export.py b/backend/ee/danswer/db/usage_export.py index bf53362e97e..074e1ae7d6d 100644 --- a/backend/ee/danswer/db/usage_export.py +++ b/backend/ee/danswer/db/usage_export.py @@ -2,6 +2,7 @@ from collections.abc import Generator from datetime import datetime from typing import IO +from typing import Optional from fastapi_users_db_sqlalchemy import UUID_ID from sqlalchemy.orm import Session @@ -19,11 +20,15 @@ def get_empty_chat_messages_entries__paginated( db_session: Session, period: tuple[datetime, datetime], - limit: int | None = 1, - initial_id: int | None = None, -) -> list[ChatMessageSkeleton]: + limit: int | None = 500, + initial_time: datetime | None = None, +) -> tuple[Optional[datetime], list[ChatMessageSkeleton]]: chat_sessions = fetch_chat_sessions_eagerly_by_time( - period[0], period[1], db_session, limit=limit, initial_id=initial_id + start=period[0], + end=period[1], + db_session=db_session, + limit=limit, + initial_time=initial_time, ) message_skeletons: list[ChatMessageSkeleton] = [] @@ -36,37 +41,47 @@ def get_empty_chat_messages_entries__paginated( flow_type = FlowType.CHAT for message in chat_session.messages: - # only count user messages + # Only count user messages if message.message_type != MessageType.USER: continue message_skeletons.append( ChatMessageSkeleton( - message_id=chat_session.id, + message_id=message.id, chat_session_id=chat_session.id, user_id=str(chat_session.user_id) if chat_session.user_id else None, flow_type=flow_type, time_sent=message.time_sent, ) ) + if len(chat_sessions) == 0: + return None, [] - return message_skeletons + return chat_sessions[0].time_created, message_skeletons def get_all_empty_chat_message_entries( db_session: Session, period: tuple[datetime, datetime], ) -> Generator[list[ChatMessageSkeleton], None, None]: - initial_id = None + initial_time: Optional[datetime] = period[0] + ind = 0 while True: - message_skeletons = get_empty_chat_messages_entries__paginated( - db_session, period, initial_id=initial_id + ind += 1 + + time_created, message_skeletons = get_empty_chat_messages_entries__paginated( + db_session, + period, + initial_time=initial_time, ) + if not message_skeletons: return yield message_skeletons - initial_id = message_skeletons[-1].message_id + + # Update initial_time for the next iteration + initial_time = time_created def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]: diff --git a/backend/ee/danswer/db/user_group.py 
b/backend/ee/danswer/db/user_group.py index ab666f747b5..187f7c7b901 100644 --- a/backend/ee/danswer/db/user_group.py +++ b/backend/ee/danswer/db/user_group.py @@ -11,12 +11,15 @@ from sqlalchemy.orm import Session from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus from danswer.db.models import ConnectorCredentialPair from danswer.db.models import Credential__UserGroup from danswer.db.models import Document from danswer.db.models import DocumentByConnectorCredentialPair +from danswer.db.models import DocumentSet__UserGroup from danswer.db.models import LLMProvider__UserGroup +from danswer.db.models import Persona__UserGroup from danswer.db.models import TokenRateLimit__UserGroup from danswer.db.models import User from danswer.db.models import User__UserGroup @@ -32,19 +35,111 @@ logger = setup_logger() +def _cleanup_user__user_group_relationships__no_commit( + db_session: Session, + user_group_id: int, + user_ids: list[UUID] | None = None, +) -> None: + """NOTE: does not commit the transaction.""" + where_clause = User__UserGroup.user_group_id == user_group_id + if user_ids: + where_clause &= User__UserGroup.user_id.in_(user_ids) + + user__user_group_relationships = db_session.scalars( + select(User__UserGroup).where(where_clause) + ).all() + for user__user_group_relationship in user__user_group_relationships: + db_session.delete(user__user_group_relationship) + + +def _cleanup_credential__user_group_relationships__no_commit( + db_session: Session, + user_group_id: int, +) -> None: + """NOTE: does not commit the transaction.""" + db_session.query(Credential__UserGroup).filter( + Credential__UserGroup.user_group_id == user_group_id + ).delete(synchronize_session=False) + + +def _cleanup_llm_provider__user_group_relationships__no_commit( + db_session: Session, user_group_id: int +) -> None: + """NOTE: does not commit the transaction.""" + db_session.query(LLMProvider__UserGroup).filter( + LLMProvider__UserGroup.user_group_id == user_group_id + ).delete(synchronize_session=False) + + +def _cleanup_persona__user_group_relationships__no_commit( + db_session: Session, user_group_id: int +) -> None: + """NOTE: does not commit the transaction.""" + db_session.query(Persona__UserGroup).filter( + Persona__UserGroup.user_group_id == user_group_id + ).delete(synchronize_session=False) + + +def _cleanup_token_rate_limit__user_group_relationships__no_commit( + db_session: Session, user_group_id: int +) -> None: + """NOTE: does not commit the transaction.""" + token_rate_limit__user_group_relationships = db_session.scalars( + select(TokenRateLimit__UserGroup).where( + TokenRateLimit__UserGroup.user_group_id == user_group_id + ) + ).all() + for ( + token_rate_limit__user_group_relationship + ) in token_rate_limit__user_group_relationships: + db_session.delete(token_rate_limit__user_group_relationship) + + +def _cleanup_user_group__cc_pair_relationships__no_commit( + db_session: Session, user_group_id: int, outdated_only: bool +) -> None: + """NOTE: does not commit the transaction.""" + stmt = select(UserGroup__ConnectorCredentialPair).where( + UserGroup__ConnectorCredentialPair.user_group_id == user_group_id + ) + if outdated_only: + stmt = stmt.where( + UserGroup__ConnectorCredentialPair.is_current == False # noqa: E712 + ) + user_group__cc_pair_relationships = db_session.scalars(stmt) + for user_group__cc_pair_relationship in user_group__cc_pair_relationships: + 
db_session.delete(user_group__cc_pair_relationship) + + +def _cleanup_document_set__user_group_relationships__no_commit( + db_session: Session, user_group_id: int +) -> None: + """NOTE: does not commit the transaction.""" + db_session.execute( + delete(DocumentSet__UserGroup).where( + DocumentSet__UserGroup.user_group_id == user_group_id + ) + ) + + def validate_user_creation_permissions( db_session: Session, user: User | None, - target_group_ids: list[int] | None, - object_is_public: bool | None, + target_group_ids: list[int] | None = None, + object_is_public: bool | None = None, + object_is_perm_sync: bool | None = None, ) -> None: """ + All users can create/edit permission synced objects if they don't specify a group All admin actions are allowed. Prevents non-admins from creating/editing: - public objects - objects with no groups - objects that belong to a group they don't curate """ + if object_is_perm_sync and not target_group_ids: + return + if not user or user.role == UserRole.ADMIN: return @@ -62,8 +157,12 @@ def validate_user_creation_permissions( status_code=400, detail=detail, ) + user_curated_groups = fetch_user_groups_for_user( - db_session=db_session, user_id=user.id, only_curator_groups=True + db_session=db_session, + user_id=user.id, + # Global curators can curate all groups they are a member of + only_curator_groups=user.role != UserRole.GLOBAL_CURATOR, ) user_curated_group_ids = set([group.id for group in user_curated_groups]) target_group_ids_set = set(target_group_ids) @@ -199,7 +298,12 @@ def fetch_documents_for_user_group_paginated( def fetch_user_groups_for_documents( db_session: Session, document_ids: list[str], -) -> Sequence[tuple[int, list[str]]]: +) -> Sequence[tuple[str, list[str]]]: + """ + Fetches all user groups that have access to the given documents.
+ + NOTE: this doesn't include groups if the cc_pair is access type SYNC + """ stmt = ( select(Document.id, func.array_agg(UserGroup.name)) .join( @@ -208,7 +312,11 @@ def fetch_user_groups_for_documents( ) .join( ConnectorCredentialPair, - ConnectorCredentialPair.id == UserGroup__ConnectorCredentialPair.cc_pair_id, + and_( + ConnectorCredentialPair.id + == UserGroup__ConnectorCredentialPair.cc_pair_id, + ConnectorCredentialPair.access_type != AccessType.SYNC, + ), ) .join( DocumentByConnectorCredentialPair, @@ -285,42 +393,6 @@ def insert_user_group(db_session: Session, user_group: UserGroupCreate) -> UserG return db_user_group -def _cleanup_user__user_group_relationships__no_commit( - db_session: Session, - user_group_id: int, - user_ids: list[UUID] | None = None, -) -> None: - """NOTE: does not commit the transaction.""" - where_clause = User__UserGroup.user_group_id == user_group_id - if user_ids: - where_clause &= User__UserGroup.user_id.in_(user_ids) - - user__user_group_relationships = db_session.scalars( - select(User__UserGroup).where(where_clause) - ).all() - for user__user_group_relationship in user__user_group_relationships: - db_session.delete(user__user_group_relationship) - - -def _cleanup_credential__user_group_relationships__no_commit( - db_session: Session, - user_group_id: int, -) -> None: - """NOTE: does not commit the transaction.""" - db_session.query(Credential__UserGroup).filter( - Credential__UserGroup.user_group_id == user_group_id - ).delete(synchronize_session=False) - - -def _cleanup_llm_provider__user_group_relationships__no_commit( - db_session: Session, user_group_id: int -) -> None: - """NOTE: does not commit the transaction.""" - db_session.query(LLMProvider__UserGroup).filter( - LLMProvider__UserGroup.user_group_id == user_group_id - ).delete(synchronize_session=False) - - def _mark_user_group__cc_pair_relationships_outdated__no_commit( db_session: Session, user_group_id: int ) -> None: @@ -349,6 +421,8 @@ def _validate_curator_status__no_commit( .all() ) + # if the user is a curator in any of their groups, set their role to CURATOR + # otherwise, set their role to BASIC if curator_relationships: user.role = UserRole.CURATOR elif user.role == UserRole.CURATOR: @@ -374,6 +448,15 @@ def update_user_curator_relationship( user = fetch_user_by_id(db_session, set_curator_request.user_id) if not user: raise ValueError(f"User with id '{set_curator_request.user_id}' not found") + + if user.role == UserRole.ADMIN: + raise ValueError( + f"User '{user.email}' is an admin and therefore has all permissions " + "of a curator. If you'd like this user to only have curator permissions, " + "you must update their role to BASIC then assign them to be CURATOR in the " + "appropriate groups." 
+ ) + requested_user_groups = fetch_user_groups_for_user( db_session=db_session, user_id=set_curator_request.user_id, @@ -475,21 +558,6 @@ def update_user_group( return db_user_group -def _cleanup_token_rate_limit__user_group_relationships__no_commit( - db_session: Session, user_group_id: int -) -> None: - """NOTE: does not commit the transaction.""" - token_rate_limit__user_group_relationships = db_session.scalars( - select(TokenRateLimit__UserGroup).where( - TokenRateLimit__UserGroup.user_group_id == user_group_id - ) - ).all() - for ( - token_rate_limit__user_group_relationship - ) in token_rate_limit__user_group_relationships: - db_session.delete(token_rate_limit__user_group_relationship) - - def prepare_user_group_for_deletion(db_session: Session, user_group_id: int) -> None: stmt = select(UserGroup).where(UserGroup.id == user_group_id) db_user_group = db_session.scalar(stmt) @@ -498,16 +566,31 @@ def prepare_user_group_for_deletion(db_session: Session, user_group_id: int) -> _check_user_group_is_modifiable(db_user_group) + _mark_user_group__cc_pair_relationships_outdated__no_commit( + db_session=db_session, user_group_id=user_group_id + ) + _cleanup_credential__user_group_relationships__no_commit( db_session=db_session, user_group_id=user_group_id ) _cleanup_user__user_group_relationships__no_commit( db_session=db_session, user_group_id=user_group_id ) - _mark_user_group__cc_pair_relationships_outdated__no_commit( + _cleanup_token_rate_limit__user_group_relationships__no_commit( db_session=db_session, user_group_id=user_group_id ) - _cleanup_token_rate_limit__user_group_relationships__no_commit( + _cleanup_document_set__user_group_relationships__no_commit( + db_session=db_session, user_group_id=user_group_id + ) + _cleanup_persona__user_group_relationships__no_commit( + db_session=db_session, user_group_id=user_group_id + ) + _cleanup_user_group__cc_pair_relationships__no_commit( + db_session=db_session, + user_group_id=user_group_id, + outdated_only=False, + ) + _cleanup_llm_provider__user_group_relationships__no_commit( db_session=db_session, user_group_id=user_group_id ) @@ -516,20 +599,12 @@ def prepare_user_group_for_deletion(db_session: Session, user_group_id: int) -> db_session.commit() -def _cleanup_user_group__cc_pair_relationships__no_commit( - db_session: Session, user_group_id: int, outdated_only: bool -) -> None: - """NOTE: does not commit the transaction.""" - stmt = select(UserGroup__ConnectorCredentialPair).where( - UserGroup__ConnectorCredentialPair.user_group_id == user_group_id - ) - if outdated_only: - stmt = stmt.where( - UserGroup__ConnectorCredentialPair.is_current == False # noqa: E712 - ) - user_group__cc_pair_relationships = db_session.scalars(stmt) - for user_group__cc_pair_relationship in user_group__cc_pair_relationships: - db_session.delete(user_group__cc_pair_relationship) +def delete_user_group(db_session: Session, user_group: UserGroup) -> None: + """ + This assumes that all the fk cleanup has already been done. 
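[Editor's note] A rough sketch of the intended call order for the deletion helpers in this file, not part of the diff: it assumes the caller owns the session and the group object, and skips any sync work the real flow performs between the two steps elsewhere in the codebase.

from ee.danswer.db.user_group import delete_user_group
from ee.danswer.db.user_group import prepare_user_group_for_deletion

def remove_user_group_example(db_session, user_group) -> None:
    # Step 1: detach the group's fk relationships (credentials, users, token rate
    # limits, document sets, personas, cc-pairs, llm providers); this commits.
    prepare_user_group_for_deletion(db_session=db_session, user_group_id=user_group.id)
    # Step 2: with the fk cleanup done, the row itself can be deleted (commits internally).
    delete_user_group(db_session=db_session, user_group=user_group)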
+ """ + db_session.delete(user_group) + db_session.commit() def mark_user_group_as_synced(db_session: Session, user_group: UserGroup) -> None: @@ -541,26 +616,6 @@ def mark_user_group_as_synced(db_session: Session, user_group: UserGroup) -> Non db_session.commit() -def delete_user_group(db_session: Session, user_group: UserGroup) -> None: - _cleanup_llm_provider__user_group_relationships__no_commit( - db_session=db_session, user_group_id=user_group.id - ) - _cleanup_user__user_group_relationships__no_commit( - db_session=db_session, user_group_id=user_group.id - ) - _cleanup_user_group__cc_pair_relationships__no_commit( - db_session=db_session, - user_group_id=user_group.id, - outdated_only=False, - ) - - # need to flush so that we don't get a foreign key error when deleting the user group row - db_session.flush() - - db_session.delete(user_group) - db_session.commit() - - def delete_user_group_cc_pair_relationship__no_commit( cc_pair_id: int, db_session: Session ) -> None: @@ -574,7 +629,7 @@ def delete_user_group_cc_pair_relationship__no_commit( if cc_pair.status != ConnectorCredentialPairStatus.DELETING: raise ValueError( - f"Connector Credential Pair '{cc_pair_id}' is not in the DELETING state" + f"Connector Credential Pair '{cc_pair_id}' is not in the DELETING state. status={cc_pair.status}" ) delete_stmt = delete(UserGroup__ConnectorCredentialPair).where( diff --git a/backend/ee/danswer/external_permissions/__init__.py b/backend/ee/danswer/external_permissions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/ee/danswer/external_permissions/confluence/__init__.py b/backend/ee/danswer/external_permissions/confluence/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/ee/danswer/external_permissions/confluence/doc_sync.py b/backend/ee/danswer/external_permissions/confluence/doc_sync.py new file mode 100644 index 00000000000..d83da900d2c --- /dev/null +++ b/backend/ee/danswer/external_permissions/confluence/doc_sync.py @@ -0,0 +1,284 @@ +""" +Rules defined here: +https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.html +""" +from typing import Any + +from danswer.access.models import DocExternalAccess +from danswer.access.models import ExternalAccess +from danswer.connectors.confluence.connector import ConfluenceConnector +from danswer.connectors.confluence.onyx_confluence import OnyxConfluence +from danswer.connectors.confluence.utils import get_user_email_from_username__server +from danswer.connectors.models import SlimDocument +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +_VIEWSPACE_PERMISSION_TYPE = "VIEWSPACE" +_REQUEST_PAGINATION_LIMIT = 100 + + +def _get_server_space_permissions( + confluence_client: OnyxConfluence, space_key: str +) -> ExternalAccess: + space_permissions = confluence_client.get_space_permissions(space_key=space_key) + + viewspace_permissions = [] + for permission_category in space_permissions: + if permission_category.get("type") == _VIEWSPACE_PERMISSION_TYPE: + viewspace_permissions.extend( + permission_category.get("spacePermissions", []) + ) + + user_names = set() + group_names = set() + for permission in viewspace_permissions: + if user_name := permission.get("userName"): + user_names.add(user_name) + if group_name := permission.get("groupName"): + group_names.add(group_name) + + user_emails = set() + for user_name in user_names: + user_email = 
get_user_email_from_username__server(confluence_client, user_name) + if user_email: + user_emails.add(user_email) + else: + logger.warning(f"Email for user {user_name} not found in Confluence") + + return ExternalAccess( + external_user_emails=user_emails, + external_user_group_ids=group_names, + # TODO: Check if the space is publicly accessible + # Currently, we assume the space is not public + # We need to check if anonymous access is turned on for the site and space + # This information is paywalled so it remains unimplemented + is_public=False, + ) + + +def _get_cloud_space_permissions( + confluence_client: OnyxConfluence, space_key: str +) -> ExternalAccess: + space_permissions_result = confluence_client.get_space( + space_key=space_key, expand="permissions" + ) + space_permissions = space_permissions_result.get("permissions", []) + + user_emails = set() + group_names = set() + is_externally_public = False + for permission in space_permissions: + subs = permission.get("subjects") + if subs: + # If there are subjects, then there are explicit users or groups with access + if email := subs.get("user", {}).get("results", [{}])[0].get("email"): + user_emails.add(email) + if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"): + group_names.add(group_name) + else: + # If there are no subjects, then the permission is for everyone + if permission.get("operation", {}).get( + "operation" + ) == "read" and permission.get("anonymousAccess", False): + # If the permission specifies read access for anonymous users, then + # the space is publicly accessible + is_externally_public = True + + return ExternalAccess( + external_user_emails=user_emails, + external_user_group_ids=group_names, + is_public=is_externally_public, + ) + + +def _get_space_permissions( + confluence_client: OnyxConfluence, + is_cloud: bool, +) -> dict[str, ExternalAccess]: + logger.debug("Getting space permissions") + # Gets all the spaces in the Confluence instance + all_space_keys = [] + start = 0 + while True: + spaces_batch = confluence_client.get_all_spaces( + start=start, limit=_REQUEST_PAGINATION_LIMIT + ) + for space in spaces_batch.get("results", []): + all_space_keys.append(space.get("key")) + + if len(spaces_batch.get("results", [])) < _REQUEST_PAGINATION_LIMIT: + break + + start += len(spaces_batch.get("results", [])) + + # Gets the permissions for each space + logger.debug(f"Got {len(all_space_keys)} spaces from confluence") + space_permissions_by_space_key: dict[str, ExternalAccess] = {} + for space_key in all_space_keys: + if is_cloud: + space_permissions = _get_cloud_space_permissions( + confluence_client=confluence_client, space_key=space_key + ) + else: + space_permissions = _get_server_space_permissions( + confluence_client=confluence_client, space_key=space_key + ) + + # Stores the permissions for each space + space_permissions_by_space_key[space_key] = space_permissions + + return space_permissions_by_space_key + + +def _extract_read_access_restrictions( + confluence_client: OnyxConfluence, restrictions: dict[str, Any] +) -> ExternalAccess | None: + """ + Converts a page's restrictions dict into an ExternalAccess object. 
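[Editor's note] For illustration, a restrictions payload shaped the way the parser below reads it, not part of the diff: the field names mirror the code, while the email and group values are invented.

example_restrictions = {
    "read": {
        "restrictions": {
            "user": {"results": [{"email": "alice@example.com"}]},
            "group": {"results": [{"name": "engineering"}]},
        }
    }
}
# Fed to _extract_read_access_restrictions, this would yield an ExternalAccess with
# external_user_emails={"alice@example.com"}, external_user_group_ids={"engineering"},
# and is_public=False; with empty user and group results it returns None instead.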
+ If there are no restrictions, then return None + """ + read_access = restrictions.get("read", {}) + read_access_restrictions = read_access.get("restrictions", {}) + + # Extract the users with read access + read_access_user = read_access_restrictions.get("user", {}) + read_access_user_jsons = read_access_user.get("results", []) + read_access_user_emails = [] + for user in read_access_user_jsons: + # If the user has an email, then add it to the list + if user.get("email"): + read_access_user_emails.append(user["email"]) + # If the user has a username and not an email, then get the email from Confluence + elif user.get("username"): + email = get_user_email_from_username__server( + confluence_client=confluence_client, user_name=user["username"] + ) + if email: + read_access_user_emails.append(email) + else: + logger.warning( + f"Email for user {user['username']} not found in Confluence" + ) + else: + if user.get("email") is not None: + logger.warning(f"Can't find email for user {user.get('displayName')}") + logger.warning( + "This user needs to make their email accessible in Confluence Settings" + ) + + logger.warning(f"no user email or username for {user}") + + # Extract the groups with read access + read_access_group = read_access_restrictions.get("group", {}) + read_access_group_jsons = read_access_group.get("results", []) + read_access_group_names = [ + group["name"] for group in read_access_group_jsons if group.get("name") + ] + + # If there are no restrictions found, then the page + # inherits the space's restrictions so return None + is_space_public = read_access_user_emails == [] and read_access_group_names == [] + if is_space_public: + return None + + return ExternalAccess( + external_user_emails=set(read_access_user_emails), + external_user_group_ids=set(read_access_group_names), + # there is no way for a page to be individually public if the space isn't public + is_public=False, + ) + + +def _fetch_all_page_restrictions_for_space( + confluence_client: OnyxConfluence, + slim_docs: list[SlimDocument], + space_permissions_by_space_key: dict[str, ExternalAccess], +) -> list[DocExternalAccess]: + """ + For all pages, if a page has restrictions, then use those restrictions. + Otherwise, use the space's restrictions.
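[Editor's note] The fallback applied in the loop below boils down to a few lines; this is a simplified sketch, not part of the diff, with logging and error handling omitted.

def resolve_doc_access_example(page_restrictions, slim_doc, space_permissions_by_space_key):
    # Explicit page-level read restrictions win
    if page_restrictions is not None:
        return page_restrictions
    # Otherwise the page inherits the permissions of its containing space (may be None)
    space_key = (slim_doc.perm_sync_data or {}).get("space_key")
    return space_permissions_by_space_key.get(space_key)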
+ """ + document_restrictions: list[DocExternalAccess] = [] + + for slim_doc in slim_docs: + if slim_doc.perm_sync_data is None: + raise ValueError( + f"No permission sync data found for document {slim_doc.id}" + ) + restrictions = _extract_read_access_restrictions( + confluence_client=confluence_client, + restrictions=slim_doc.perm_sync_data.get("restrictions", {}), + ) + if restrictions: + document_restrictions.append( + DocExternalAccess( + doc_id=slim_doc.id, + external_access=restrictions, + ) + ) + # If there are restrictions, then we don't need to use the space's restrictions + continue + + space_key = slim_doc.perm_sync_data.get("space_key") + if space_permissions := space_permissions_by_space_key.get(space_key): + # If there are no restrictions, then use the space's restrictions + document_restrictions.append( + DocExternalAccess( + doc_id=slim_doc.id, + external_access=space_permissions, + ) + ) + if ( + not space_permissions.is_public + and not space_permissions.external_user_emails + and not space_permissions.external_user_group_ids + ): + logger.warning( + f"Permissions are empty for document: {slim_doc.id}\n" + "This means space permissions may be wrong for" + f" Space key: {space_key}" + ) + continue + + logger.warning(f"No permissions found for document {slim_doc.id}") + + logger.debug("Finished fetching all page restrictions for space") + return document_restrictions + + +def confluence_doc_sync( + cc_pair: ConnectorCredentialPair, +) -> list[DocExternalAccess]: + """ + Adds the external permissions to the documents in postgres. + If the document doesn't already exist in postgres, we create + it in postgres so that when it gets created later, the permissions are + already populated. + """ + logger.debug("Starting confluence doc sync") + confluence_connector = ConfluenceConnector( + **cc_pair.connector.connector_specific_config + ) + confluence_connector.load_credentials(cc_pair.credential.credential_json) + + is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False) + + space_permissions_by_space_key = _get_space_permissions( + confluence_client=confluence_connector.confluence_client, + is_cloud=is_cloud, + ) + + slim_docs = [] + logger.debug("Fetching all slim documents from confluence") + for doc_batch in confluence_connector.retrieve_all_slim_documents(): + logger.debug(f"Got {len(doc_batch)} slim documents from confluence") + slim_docs.extend(doc_batch) + + logger.debug("Fetching all page restrictions for space") + return _fetch_all_page_restrictions_for_space( + confluence_client=confluence_connector.confluence_client, + slim_docs=slim_docs, + space_permissions_by_space_key=space_permissions_by_space_key, + ) diff --git a/backend/ee/danswer/external_permissions/confluence/group_sync.py b/backend/ee/danswer/external_permissions/confluence/group_sync.py new file mode 100644 index 00000000000..8f3f3e43fc6 --- /dev/null +++ b/backend/ee/danswer/external_permissions/confluence/group_sync.py @@ -0,0 +1,64 @@ +from danswer.connectors.confluence.onyx_confluence import build_confluence_client +from danswer.connectors.confluence.onyx_confluence import OnyxConfluence +from danswer.connectors.confluence.utils import get_user_email_from_username__server +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger +from ee.danswer.db.external_perm import ExternalUserGroup + + +logger = setup_logger() + + +def _build_group_member_email_map( + confluence_client: OnyxConfluence, +) -> dict[str, set[str]]: + 
group_member_emails: dict[str, set[str]] = {} + for user_result in confluence_client.paginated_cql_user_retrieval(): + user = user_result.get("user", {}) + if not user: + logger.warning(f"user result missing user field: {user_result}") + continue + email = user.get("email") + if not email: + # This field is only present in Confluence Server + user_name = user.get("username") + # If it is present, try to get the email using a Server-specific method + if user_name: + email = get_user_email_from_username__server( + confluence_client=confluence_client, + user_name=user_name, + ) + if not email: + # If we still don't have an email, skip this user + continue + + for group in confluence_client.paginated_groups_by_user_retrieval(user): + # group name uniqueness is enforced by Confluence, so we can use it as a group ID + group_id = group["name"] + group_member_emails.setdefault(group_id, set()).add(email) + + return group_member_emails + + +def confluence_group_sync( + cc_pair: ConnectorCredentialPair, +) -> list[ExternalUserGroup]: + confluence_client = build_confluence_client( + credentials=cc_pair.credential.credential_json, + is_cloud=cc_pair.connector.connector_specific_config.get("is_cloud", False), + wiki_base=cc_pair.connector.connector_specific_config["wiki_base"], + ) + + group_member_email_map = _build_group_member_email_map( + confluence_client=confluence_client, + ) + danswer_groups: list[ExternalUserGroup] = [] + for group_id, group_member_emails in group_member_email_map.items(): + danswer_groups.append( + ExternalUserGroup( + id=group_id, + user_emails=list(group_member_emails), + ) + ) + + return danswer_groups diff --git a/backend/ee/danswer/external_permissions/gmail/doc_sync.py b/backend/ee/danswer/external_permissions/gmail/doc_sync.py new file mode 100644 index 00000000000..6b72e7ba116 --- /dev/null +++ b/backend/ee/danswer/external_permissions/gmail/doc_sync.py @@ -0,0 +1,63 @@ +from datetime import datetime +from datetime import timezone + +from danswer.access.models import DocExternalAccess +from danswer.access.models import ExternalAccess +from danswer.connectors.gmail.connector import GmailConnector +from danswer.connectors.interfaces import GenerateSlimDocumentOutput +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def _get_slim_doc_generator( + cc_pair: ConnectorCredentialPair, + gmail_connector: GmailConnector, +) -> GenerateSlimDocumentOutput: + current_time = datetime.now(timezone.utc) + start_time = ( + cc_pair.last_time_perm_sync.replace(tzinfo=timezone.utc).timestamp() + if cc_pair.last_time_perm_sync + else 0.0 + ) + + return gmail_connector.retrieve_all_slim_documents( + start=start_time, end=current_time.timestamp() + ) + + +def gmail_doc_sync( + cc_pair: ConnectorCredentialPair, +) -> list[DocExternalAccess]: + """ + Adds the external permissions to the documents in postgres. + If the document doesn't already exist in postgres, we create + it in postgres so that when it gets created later, the permissions are + already populated. + """ + gmail_connector = GmailConnector(**cc_pair.connector.connector_specific_config) + gmail_connector.load_credentials(cc_pair.credential.credential_json) + + slim_doc_generator = _get_slim_doc_generator(cc_pair, gmail_connector) + + document_external_access: list[DocExternalAccess] = [] + for slim_doc_batch in slim_doc_generator: + for slim_doc in slim_doc_batch: + if slim_doc.perm_sync_data is None: + logger.warning(f"No permissions found for 
document {slim_doc.id}") + continue + if user_email := slim_doc.perm_sync_data.get("user_email"): + ext_access = ExternalAccess( + external_user_emails=set([user_email]), + external_user_group_ids=set(), + is_public=False, + ) + document_external_access.append( + DocExternalAccess( + doc_id=slim_doc.id, + external_access=ext_access, + ) + ) + + return document_external_access diff --git a/backend/ee/danswer/external_permissions/google_drive/__init__.py b/backend/ee/danswer/external_permissions/google_drive/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/ee/danswer/external_permissions/google_drive/doc_sync.py b/backend/ee/danswer/external_permissions/google_drive/doc_sync.py new file mode 100644 index 00000000000..10792b0ae7f --- /dev/null +++ b/backend/ee/danswer/external_permissions/google_drive/doc_sync.py @@ -0,0 +1,159 @@ +from datetime import datetime +from datetime import timezone +from typing import Any + +from danswer.access.models import DocExternalAccess +from danswer.access.models import ExternalAccess +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.google_utils.resources import get_drive_service +from danswer.connectors.interfaces import GenerateSlimDocumentOutput +from danswer.connectors.models import SlimDocument +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger + +logger = setup_logger() + +_PERMISSION_ID_PERMISSION_MAP: dict[str, dict[str, Any]] = {} + + +def _get_slim_doc_generator( + cc_pair: ConnectorCredentialPair, + google_drive_connector: GoogleDriveConnector, +) -> GenerateSlimDocumentOutput: + current_time = datetime.now(timezone.utc) + start_time = ( + cc_pair.last_time_perm_sync.replace(tzinfo=timezone.utc).timestamp() + if cc_pair.last_time_perm_sync + else 0.0 + ) + + return google_drive_connector.retrieve_all_slim_documents( + start=start_time, end=current_time.timestamp() + ) + + +def _fetch_permissions_for_permission_ids( + google_drive_connector: GoogleDriveConnector, + permission_ids: list[str], + permission_info: dict[str, Any], +) -> list[dict[str, Any]]: + doc_id = permission_info.get("doc_id") + if not permission_info or not doc_id: + return [] + + # Check cache first for all permission IDs + permissions = [ + _PERMISSION_ID_PERMISSION_MAP[pid] + for pid in permission_ids + if pid in _PERMISSION_ID_PERMISSION_MAP + ] + + # If we found all permissions in cache, return them + if len(permissions) == len(permission_ids): + return permissions + + owner_email = permission_info.get("owner_email") + drive_service = get_drive_service( + creds=google_drive_connector.creds, + user_email=(owner_email or google_drive_connector.primary_admin_email), + ) + + # Otherwise, fetch all permissions and update cache + fetched_permissions = execute_paginated_retrieval( + retrieval_function=drive_service.permissions().list, + list_key="permissions", + fileId=doc_id, + fields="permissions(id, emailAddress, type, domain)", + supportsAllDrives=True, + ) + + permissions_for_doc_id = [] + # Update cache and return all permissions + for permission in fetched_permissions: + permissions_for_doc_id.append(permission) + _PERMISSION_ID_PERMISSION_MAP[permission["id"]] = permission + + return permissions_for_doc_id + + +def _get_permissions_from_slim_doc( + google_drive_connector: GoogleDriveConnector, + slim_doc: SlimDocument, +) -> ExternalAccess: + 
permission_info = slim_doc.perm_sync_data or {} + + permissions_list = permission_info.get("permissions", []) + if not permissions_list: + if permission_ids := permission_info.get("permission_ids"): + permissions_list = _fetch_permissions_for_permission_ids( + google_drive_connector=google_drive_connector, + permission_ids=permission_ids, + permission_info=permission_info, + ) + if not permissions_list: + logger.warning(f"No permissions found for document {slim_doc.id}") + return ExternalAccess( + external_user_emails=set(), + external_user_group_ids=set(), + is_public=False, + ) + + company_domain = google_drive_connector.google_domain + user_emails: set[str] = set() + group_emails: set[str] = set() + public = False + for permission in permissions_list: + permission_type = permission["type"] + if permission_type == "user": + user_emails.add(permission["emailAddress"]) + elif permission_type == "group": + group_emails.add(permission["emailAddress"]) + elif permission_type == "domain" and company_domain: + if permission.get("domain") == company_domain: + public = True + else: + logger.warning( + "Permission is type domain but does not match company domain:" + f"\n {permission}" + ) + elif permission_type == "anyone": + public = True + + return ExternalAccess( + external_user_emails=user_emails, + external_user_group_ids=group_emails, + is_public=public, + ) + + +def gdrive_doc_sync( + cc_pair: ConnectorCredentialPair, +) -> list[DocExternalAccess]: + """ + Adds the external permissions to the documents in postgres. + If the document doesn't already exist in postgres, we create + it in postgres so that when it gets created later, the permissions are + already populated. + """ + google_drive_connector = GoogleDriveConnector( + **cc_pair.connector.connector_specific_config + ) + google_drive_connector.load_credentials(cc_pair.credential.credential_json) + + slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector) + + document_external_accesses = [] + for slim_doc_batch in slim_doc_generator: + for slim_doc in slim_doc_batch: + ext_access = _get_permissions_from_slim_doc( + google_drive_connector=google_drive_connector, + slim_doc=slim_doc, + ) + document_external_accesses.append( + DocExternalAccess( + external_access=ext_access, + doc_id=slim_doc.id, + ) + ) + return document_external_accesses diff --git a/backend/ee/danswer/external_permissions/google_drive/group_sync.py b/backend/ee/danswer/external_permissions/google_drive/group_sync.py new file mode 100644 index 00000000000..0f421f371b9 --- /dev/null +++ b/backend/ee/danswer/external_permissions/google_drive/group_sync.py @@ -0,0 +1,52 @@ +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.google_utils.resources import get_admin_service +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger +from ee.danswer.db.external_perm import ExternalUserGroup + +logger = setup_logger() + + +def gdrive_group_sync( + cc_pair: ConnectorCredentialPair, +) -> list[ExternalUserGroup]: + google_drive_connector = GoogleDriveConnector( + **cc_pair.connector.connector_specific_config + ) + google_drive_connector.load_credentials(cc_pair.credential.credential_json) + admin_service = get_admin_service( + google_drive_connector.creds, google_drive_connector.primary_admin_email + ) + + danswer_groups: list[ExternalUserGroup] = [] + for group in 
execute_paginated_retrieval( + admin_service.groups().list, + list_key="groups", + domain=google_drive_connector.google_domain, + fields="groups(email)", + ): + # The id is the group email + group_email = group["email"] + + # Gather group member emails + group_member_emails: list[str] = [] + for member in execute_paginated_retrieval( + admin_service.members().list, + list_key="members", + groupKey=group_email, + fields="members(email)", + ): + group_member_emails.append(member["email"]) + + if not group_member_emails: + continue + + danswer_groups.append( + ExternalUserGroup( + id=group_email, + user_emails=list(group_member_emails), + ) + ) + + return danswer_groups diff --git a/backend/ee/danswer/external_permissions/slack/doc_sync.py b/backend/ee/danswer/external_permissions/slack/doc_sync.py new file mode 100644 index 00000000000..24c565fc4e5 --- /dev/null +++ b/backend/ee/danswer/external_permissions/slack/doc_sync.py @@ -0,0 +1,155 @@ +from slack_sdk import WebClient + +from danswer.access.models import DocExternalAccess +from danswer.access.models import ExternalAccess +from danswer.connectors.slack.connector import get_channels +from danswer.connectors.slack.connector import make_paginated_slack_api_call_w_retries +from danswer.connectors.slack.connector import SlackPollConnector +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger +from ee.danswer.external_permissions.slack.utils import fetch_user_id_to_email_map + + +logger = setup_logger() + + +def _get_slack_document_ids_and_channels( + cc_pair: ConnectorCredentialPair, +) -> dict[str, list[str]]: + slack_connector = SlackPollConnector(**cc_pair.connector.connector_specific_config) + slack_connector.load_credentials(cc_pair.credential.credential_json) + + slim_doc_generator = slack_connector.retrieve_all_slim_documents() + + channel_doc_map: dict[str, list[str]] = {} + for doc_metadata_batch in slim_doc_generator: + for doc_metadata in doc_metadata_batch: + if doc_metadata.perm_sync_data is None: + continue + channel_id = doc_metadata.perm_sync_data["channel_id"] + if channel_id not in channel_doc_map: + channel_doc_map[channel_id] = [] + channel_doc_map[channel_id].append(doc_metadata.id) + + return channel_doc_map + + +def _fetch_workspace_permissions( + user_id_to_email_map: dict[str, str], +) -> ExternalAccess: + user_emails = set() + for email in user_id_to_email_map.values(): + user_emails.add(email) + return ExternalAccess( + external_user_emails=user_emails, + # No group<->document mapping for slack + external_user_group_ids=set(), + # No way to determine if slack is invite only without enterprise license + is_public=False, + ) + + +def _fetch_channel_permissions( + slack_client: WebClient, + workspace_permissions: ExternalAccess, + user_id_to_email_map: dict[str, str], +) -> dict[str, ExternalAccess]: + channel_permissions = {} + public_channels = get_channels( + client=slack_client, + get_public=True, + get_private=False, + ) + public_channel_ids = [ + channel["id"] for channel in public_channels if "id" in channel + ] + for channel_id in public_channel_ids: + channel_permissions[channel_id] = workspace_permissions + + private_channels = get_channels( + client=slack_client, + get_public=False, + get_private=True, + ) + private_channel_ids = [ + channel["id"] for channel in private_channels if "id" in channel + ] + + for channel_id in private_channel_ids: + # Collect all member ids for the channel via paginated calls + member_ids = [] + for result in 
make_paginated_slack_api_call_w_retries( + slack_client.conversations_members, + channel=channel_id, + ): + member_ids.extend(result.get("members", [])) + + # Collect all member emails for the channel + member_emails = set() + for member_id in member_ids: + member_email = user_id_to_email_map.get(member_id) + + if not member_email: + # If the user is an external user, they won't get returned from the + # users_list call so we need to make a separate call to users_info + # and add them to the user_id_to_email_map + member_info = slack_client.users_info(user=member_id) + member_email = member_info["user"]["profile"].get("email") + if not member_email: + # If no email is found, we skip the user + continue + user_id_to_email_map[member_id] = member_email + + member_emails.add(member_email) + + channel_permissions[channel_id] = ExternalAccess( + external_user_emails=member_emails, + # No group<->document mapping for slack + external_user_group_ids=set(), + # No way to determine if slack is invite only without enterprise license + is_public=False, + ) + + return channel_permissions + + +def slack_doc_sync( + cc_pair: ConnectorCredentialPair, +) -> list[DocExternalAccess]: + """ + Adds the external permissions to the documents in postgres. + If the document doesn't already exist in postgres, we create + it in postgres so that when it gets created later, the permissions are + already populated + """ + slack_client = WebClient( + token=cc_pair.credential.credential_json["slack_bot_token"] + ) + user_id_to_email_map = fetch_user_id_to_email_map(slack_client) + channel_doc_map = _get_slack_document_ids_and_channels( + cc_pair=cc_pair, + ) + workspace_permissions = _fetch_workspace_permissions( + user_id_to_email_map=user_id_to_email_map, + ) + channel_permissions = _fetch_channel_permissions( + slack_client=slack_client, + workspace_permissions=workspace_permissions, + user_id_to_email_map=user_id_to_email_map, + ) + + document_external_accesses = [] + for channel_id, ext_access in channel_permissions.items(): + doc_ids = channel_doc_map.get(channel_id) + if not doc_ids: + # No documents found for the channel_id + continue + + for doc_id in doc_ids: + document_external_accesses.append( + DocExternalAccess( + external_access=ext_access, + doc_id=doc_id, + ) + ) + return document_external_accesses diff --git a/backend/ee/danswer/external_permissions/slack/group_sync.py b/backend/ee/danswer/external_permissions/slack/group_sync.py new file mode 100644 index 00000000000..780f619e464 --- /dev/null +++ b/backend/ee/danswer/external_permissions/slack/group_sync.py @@ -0,0 +1,73 @@ +""" +THIS IS NOT USEFUL OR USED FOR PERMISSION SYNCING +WHEN USERGROUPS ARE ADDED TO A CHANNEL, IT JUST RESOLVES ALL THE USERS TO THAT CHANNEL +SO WHEN CHECKING IF A USER CAN ACCESS A DOCUMENT, WE ONLY NEED TO CHECK THEIR EMAIL +THERE IS NO USERGROUP <-> DOCUMENT PERMISSION MAPPING +""" +from slack_sdk import WebClient + +from danswer.connectors.slack.connector import make_paginated_slack_api_call_w_retries +from danswer.db.models import ConnectorCredentialPair +from danswer.utils.logger import setup_logger +from ee.danswer.db.external_perm import ExternalUserGroup +from ee.danswer.external_permissions.slack.utils import fetch_user_id_to_email_map + +logger = setup_logger() + + +def _get_slack_group_ids( + slack_client: WebClient, +) -> list[str]: + group_ids = [] + for result in make_paginated_slack_api_call_w_retries(slack_client.usergroups_list): + for group in result.get("usergroups", []): 
group_ids.append(group.get("id")) + return group_ids + + +def _get_slack_group_members_email( + slack_client: WebClient, + group_name: str, + user_id_to_email_map: dict[str, str], +) -> list[str]: + group_member_emails = [] + for result in make_paginated_slack_api_call_w_retries( + slack_client.usergroups_users_list, usergroup=group_name + ): + for member_id in result.get("users", []): + member_email = user_id_to_email_map.get(member_id) + if not member_email: + # If the user is an external user, they won't get returned from the + # users_list call so we need to make a separate call to users_info + member_info = slack_client.users_info(user=member_id) + member_email = member_info["user"]["profile"].get("email") + if not member_email: + # If no email is found, we skip the user + continue + user_id_to_email_map[member_id] = member_email + group_member_emails.append(member_email) + + return group_member_emails + + +def slack_group_sync( + cc_pair: ConnectorCredentialPair, +) -> list[ExternalUserGroup]: + slack_client = WebClient( + token=cc_pair.credential.credential_json["slack_bot_token"] + ) + user_id_to_email_map = fetch_user_id_to_email_map(slack_client) + + danswer_groups: list[ExternalUserGroup] = [] + for group_name in _get_slack_group_ids(slack_client): + group_member_emails = _get_slack_group_members_email( + slack_client=slack_client, + group_name=group_name, + user_id_to_email_map=user_id_to_email_map, + ) + if not group_member_emails: + continue + danswer_groups.append( + ExternalUserGroup(id=group_name, user_emails=group_member_emails) + ) + return danswer_groups diff --git a/backend/ee/danswer/external_permissions/slack/utils.py b/backend/ee/danswer/external_permissions/slack/utils.py new file mode 100644 index 00000000000..a6a049aee03 --- /dev/null +++ b/backend/ee/danswer/external_permissions/slack/utils.py @@ -0,0 +1,18 @@ +from slack_sdk import WebClient + +from danswer.connectors.slack.connector import make_paginated_slack_api_call_w_retries + + +def fetch_user_id_to_email_map( + slack_client: WebClient, +) -> dict[str, str]: + user_id_to_email_map = {} + for user_info in make_paginated_slack_api_call_w_retries( + slack_client.users_list, + ): + for user in user_info.get("members", []): + if user.get("profile", {}).get("email"): + user_id_to_email_map[user.get("id")] = user.get("profile", {}).get( + "email" + ) + return user_id_to_email_map diff --git a/backend/ee/danswer/external_permissions/sync_params.py b/backend/ee/danswer/external_permissions/sync_params.py new file mode 100644 index 00000000000..43c8a78122c --- /dev/null +++ b/backend/ee/danswer/external_permissions/sync_params.py @@ -0,0 +1,67 @@ +from collections.abc import Callable + +from danswer.access.models import DocExternalAccess +from danswer.configs.constants import DocumentSource +from danswer.db.models import ConnectorCredentialPair +from ee.danswer.db.external_perm import ExternalUserGroup +from ee.danswer.external_permissions.confluence.doc_sync import confluence_doc_sync +from ee.danswer.external_permissions.confluence.group_sync import confluence_group_sync +from ee.danswer.external_permissions.gmail.doc_sync import gmail_doc_sync +from ee.danswer.external_permissions.google_drive.doc_sync import gdrive_doc_sync +from ee.danswer.external_permissions.google_drive.group_sync import gdrive_group_sync +from ee.danswer.external_permissions.slack.doc_sync import slack_doc_sync + +# Defining the input/output types for the sync functions +DocSyncFuncType = Callable[ + [ + ConnectorCredentialPair, 
+ ], + list[DocExternalAccess], +] + +GroupSyncFuncType = Callable[ + [ + ConnectorCredentialPair, + ], + list[ExternalUserGroup], +] + +# These functions update: +# - the user_email <-> document mapping +# - the external_user_group_id <-> document mapping +# in postgres without committing +# THIS ONE IS NECESSARY FOR AUTO SYNC TO WORK +DOC_PERMISSIONS_FUNC_MAP: dict[DocumentSource, DocSyncFuncType] = { + DocumentSource.GOOGLE_DRIVE: gdrive_doc_sync, + DocumentSource.CONFLUENCE: confluence_doc_sync, + DocumentSource.SLACK: slack_doc_sync, + DocumentSource.GMAIL: gmail_doc_sync, +} + +# These functions update: +# - the user_email <-> external_user_group_id mapping +# in postgres without committing +# THIS ONE IS OPTIONAL ON AN APP BY APP BASIS +GROUP_PERMISSIONS_FUNC_MAP: dict[DocumentSource, GroupSyncFuncType] = { + DocumentSource.GOOGLE_DRIVE: gdrive_group_sync, + DocumentSource.CONFLUENCE: confluence_group_sync, +} + + +# If nothing is specified here, we run the doc_sync every time the celery beat runs +DOC_PERMISSION_SYNC_PERIODS: dict[DocumentSource, int] = { + # Polling is not supported so we fetch all doc permissions every 5 minutes + DocumentSource.CONFLUENCE: 5 * 60, + DocumentSource.SLACK: 5 * 60, +} + +# If nothing is specified here, we run the group_sync every time the celery beat runs +EXTERNAL_GROUP_SYNC_PERIODS: dict[DocumentSource, int] = { + # Polling is not supported so we fetch all group permissions every 5 minutes + DocumentSource.GOOGLE_DRIVE: 5 * 60, + DocumentSource.CONFLUENCE: 5 * 60, +} + + +def check_if_valid_sync_source(source_type: DocumentSource) -> bool: + return source_type in DOC_PERMISSIONS_FUNC_MAP diff --git a/backend/ee/danswer/main.py b/backend/ee/danswer/main.py index 7d150107c75..198f945b8da 100644 --- a/backend/ee/danswer/main.py +++ b/backend/ee/danswer/main.py @@ -1,7 +1,9 @@ from fastapi import FastAPI +from httpx_oauth.clients.google import GoogleOAuth2 from httpx_oauth.clients.openid import OpenID from danswer.auth.users import auth_backend +from danswer.auth.users import create_danswer_oauth_router from danswer.auth.users import fastapi_users from danswer.configs.app_configs import AUTH_TYPE from danswer.configs.app_configs import OAUTH_CLIENT_ID @@ -15,7 +17,6 @@ from danswer.utils.variable_functionality import global_version from ee.danswer.configs.app_configs import OPENID_CONFIG_URL from ee.danswer.server.analytics.api import router as analytics_router -from ee.danswer.server.api_key.api import router as api_key_router from ee.danswer.server.auth_check import check_ee_router_auth from ee.danswer.server.enterprise_settings.api import ( admin_router as enterprise_settings_admin_router, @@ -24,6 +25,7 @@ basic_router as enterprise_settings_router, ) from ee.danswer.server.manage.standard_answer import router as standard_answer_router +from ee.danswer.server.middleware.tenant_tracking import add_tenant_id_middleware from ee.danswer.server.query_and_chat.chat_backend import ( router as chat_router, ) @@ -34,11 +36,13 @@ from ee.danswer.server.reporting.usage_export_api import router as usage_export_router from ee.danswer.server.saml import router as saml_router from ee.danswer.server.seeding import seed_db +from ee.danswer.server.tenants.api import router as tenants_router from ee.danswer.server.token_rate_limits.api import ( router as token_rate_limit_settings_router, ) from ee.danswer.server.user_group.api import router as user_group_router from ee.danswer.utils.encryption import test_encryption +from shared_configs.configs import MULTI_TENANT 
logger = setup_logger() @@ -52,10 +56,38 @@ def get_application() -> FastAPI: application = get_application_base() + if MULTI_TENANT: + add_tenant_id_middleware(application, logger) + + if AUTH_TYPE == AuthType.CLOUD: + oauth_client = GoogleOAuth2(OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET) + include_router_with_global_prefix_prepended( + application, + create_danswer_oauth_router( + oauth_client, + auth_backend, + USER_AUTH_SECRET, + associate_by_email=True, + is_verified_by_default=True, + # Points the user back to the login page + redirect_url=f"{WEB_DOMAIN}/auth/oauth/callback", + ), + prefix="/auth/oauth", + tags=["auth"], + ) + + # Need basic auth router for `logout` endpoint + include_router_with_global_prefix_prepended( + application, + fastapi_users.get_logout_router(auth_backend), + prefix="/auth", + tags=["auth"], + ) + if AUTH_TYPE == AuthType.OIDC: include_router_with_global_prefix_prepended( application, - fastapi_users.get_oauth_router( + create_danswer_oauth_router( OpenID(OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET, OPENID_CONFIG_URL), auth_backend, USER_AUTH_SECRET, @@ -66,6 +98,7 @@ def get_application() -> FastAPI: prefix="/auth/oidc", tags=["auth"], ) + # need basic auth router for `logout` endpoint include_router_with_global_prefix_prepended( application, @@ -82,8 +115,6 @@ def get_application() -> FastAPI: # Analytics endpoints include_router_with_global_prefix_prepended(application, analytics_router) include_router_with_global_prefix_prepended(application, query_history_router) - # Api key management - include_router_with_global_prefix_prepended(application, api_key_router) # EE only backend APIs include_router_with_global_prefix_prepended(application, query_router) include_router_with_global_prefix_prepended(application, chat_router) @@ -99,6 +130,10 @@ def get_application() -> FastAPI: include_router_with_global_prefix_prepended(application, enterprise_settings_router) include_router_with_global_prefix_prepended(application, usage_export_router) + if MULTI_TENANT: + # Tenant management + include_router_with_global_prefix_prepended(application, tenants_router) + # Ensure all routes have auth enabled or are explicitly marked as public check_ee_router_auth(application) diff --git a/backend/ee/danswer/seeding/load_docs.py b/backend/ee/danswer/seeding/load_docs.py new file mode 100644 index 00000000000..31047423c0d --- /dev/null +++ b/backend/ee/danswer/seeding/load_docs.py @@ -0,0 +1,45 @@ +import json +import os +from typing import cast +from typing import List + +from cohere import Client + +from ee.danswer.configs.app_configs import COHERE_DEFAULT_API_KEY + +Embedding = List[float] + + +def load_processed_docs(cohere_enabled: bool) -> list[dict]: + base_path = os.path.join(os.getcwd(), "danswer", "seeding") + + if cohere_enabled and COHERE_DEFAULT_API_KEY: + initial_docs_path = os.path.join(base_path, "initial_docs_cohere.json") + processed_docs = json.load(open(initial_docs_path)) + + cohere_client = Client(api_key=COHERE_DEFAULT_API_KEY) + embed_model = "embed-english-v3.0" + + for doc in processed_docs: + title_embed_response = cohere_client.embed( + texts=[doc["title"]], + model=embed_model, + input_type="search_document", + ) + content_embed_response = cohere_client.embed( + texts=[doc["content"]], + model=embed_model, + input_type="search_document", + ) + + doc["title_embedding"] = cast( + List[Embedding], title_embed_response.embeddings + )[0] + doc["content_embedding"] = cast( + List[Embedding], content_embed_response.embeddings + )[0] + else: + initial_docs_path = 
os.path.join(base_path, "initial_docs.json") + processed_docs = json.load(open(initial_docs_path)) + + return processed_docs diff --git a/backend/ee/danswer/server/enterprise_settings/models.py b/backend/ee/danswer/server/enterprise_settings/models.py index c770fbd73e7..df8f022a402 100644 --- a/backend/ee/danswer/server/enterprise_settings/models.py +++ b/backend/ee/danswer/server/enterprise_settings/models.py @@ -1,3 +1,4 @@ +from typing import Any from typing import List from pydantic import BaseModel @@ -6,8 +7,20 @@ class NavigationItem(BaseModel): link: str - icon: str title: str + # Right now must be one of the FA icons + icon: str | None = None + # NOTE: SVG must not have a width / height specified + # This is the actual SVG as a string. Done this way to reduce + # complexity / having to store additional "logos" in Postgres + svg_logo: str | None = None + + @classmethod + def model_validate(cls, *args: Any, **kwargs: Any) -> "NavigationItem": + instance = super().model_validate(*args, **kwargs) + if bool(instance.icon) == bool(instance.svg_logo): + raise ValueError("Exactly one of fa_icon or svg_logo must be specified") + return instance class EnterpriseSettings(BaseModel): diff --git a/backend/ee/danswer/server/enterprise_settings/store.py b/backend/ee/danswer/server/enterprise_settings/store.py index 30b72d5d2e8..74706e0f769 100644 --- a/backend/ee/danswer/server/enterprise_settings/store.py +++ b/backend/ee/danswer/server/enterprise_settings/store.py @@ -11,9 +11,9 @@ from danswer.configs.constants import FileOrigin from danswer.configs.constants import KV_CUSTOM_ANALYTICS_SCRIPT_KEY from danswer.configs.constants import KV_ENTERPRISE_SETTINGS_KEY -from danswer.dynamic_configs.factory import get_dynamic_config_store -from danswer.dynamic_configs.interface import ConfigNotFoundError from danswer.file_store.file_store import get_default_file_store +from danswer.key_value_store.factory import get_kv_store +from danswer.key_value_store.interface import KvKeyNotFoundError from danswer.utils.logger import setup_logger from ee.danswer.server.enterprise_settings.models import AnalyticsScriptUpload from ee.danswer.server.enterprise_settings.models import EnterpriseSettings @@ -23,12 +23,12 @@ def load_settings() -> EnterpriseSettings: - dynamic_config_store = get_dynamic_config_store() + dynamic_config_store = get_kv_store() try: settings = EnterpriseSettings( **cast(dict, dynamic_config_store.load(KV_ENTERPRISE_SETTINGS_KEY)) ) - except ConfigNotFoundError: + except KvKeyNotFoundError: settings = EnterpriseSettings() dynamic_config_store.store(KV_ENTERPRISE_SETTINGS_KEY, settings.model_dump()) @@ -36,17 +36,17 @@ def load_settings() -> EnterpriseSettings: def store_settings(settings: EnterpriseSettings) -> None: - get_dynamic_config_store().store(KV_ENTERPRISE_SETTINGS_KEY, settings.model_dump()) + get_kv_store().store(KV_ENTERPRISE_SETTINGS_KEY, settings.model_dump()) _CUSTOM_ANALYTICS_SECRET_KEY = os.environ.get("CUSTOM_ANALYTICS_SECRET_KEY") def load_analytics_script() -> str | None: - dynamic_config_store = get_dynamic_config_store() + dynamic_config_store = get_kv_store() try: return cast(str, dynamic_config_store.load(KV_CUSTOM_ANALYTICS_SCRIPT_KEY)) - except ConfigNotFoundError: + except KvKeyNotFoundError: return None @@ -57,9 +57,7 @@ def store_analytics_script(analytics_script_upload: AnalyticsScriptUpload) -> No ): raise ValueError("Invalid secret key") - get_dynamic_config_store().store( - KV_CUSTOM_ANALYTICS_SCRIPT_KEY, analytics_script_upload.script - ) + 
get_kv_store().store(KV_CUSTOM_ANALYTICS_SCRIPT_KEY, analytics_script_upload.script) _LOGO_FILENAME = "__logo__" diff --git a/backend/ee/danswer/server/middleware/tenant_tracking.py b/backend/ee/danswer/server/middleware/tenant_tracking.py new file mode 100644 index 00000000000..92cc37f10bf --- /dev/null +++ b/backend/ee/danswer/server/middleware/tenant_tracking.py @@ -0,0 +1,75 @@ +import logging +from collections.abc import Awaitable +from collections.abc import Callable + +import jwt +from fastapi import FastAPI +from fastapi import HTTPException +from fastapi import Request +from fastapi import Response + +from danswer.auth.api_key import extract_tenant_from_api_key_header +from danswer.configs.app_configs import USER_AUTH_SECRET +from danswer.db.engine import is_valid_schema_name +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR + + +def add_tenant_id_middleware(app: FastAPI, logger: logging.LoggerAdapter) -> None: + @app.middleware("http") + async def set_tenant_id( + request: Request, call_next: Callable[[Request], Awaitable[Response]] + ) -> Response: + try: + tenant_id = ( + _get_tenant_id_from_request(request, logger) + if MULTI_TENANT + else POSTGRES_DEFAULT_SCHEMA + ) + CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + return await call_next(request) + + except Exception as e: + logger.error(f"Error in tenant ID middleware: {str(e)}") + raise + + +def _get_tenant_id_from_request(request: Request, logger: logging.LoggerAdapter) -> str: + # First check for API key + tenant_id = extract_tenant_from_api_key_header(request) + if tenant_id is not None: + return tenant_id + + # Check for cookie-based auth + token = request.cookies.get("fastapiusersauth") + if not token: + return POSTGRES_DEFAULT_SCHEMA + + try: + payload = jwt.decode( + token, + USER_AUTH_SECRET, + audience=["fastapi-users:auth"], + algorithms=["HS256"], + ) + tenant_id_from_payload = payload.get("tenant_id", POSTGRES_DEFAULT_SCHEMA) + + # Since payload.get() can return None, ensure we have a string + tenant_id = ( + str(tenant_id_from_payload) + if tenant_id_from_payload is not None + else POSTGRES_DEFAULT_SCHEMA + ) + + if not is_valid_schema_name(tenant_id): + raise HTTPException(status_code=400, detail="Invalid tenant ID format") + + return tenant_id + + except jwt.InvalidTokenError: + return POSTGRES_DEFAULT_SCHEMA + + except Exception as e: + logger.error(f"Unexpected error in set_tenant_id_middleware: {str(e)}") + raise HTTPException(status_code=500, detail="Internal server error") diff --git a/backend/ee/danswer/server/query_and_chat/chat_backend.py b/backend/ee/danswer/server/query_and_chat/chat_backend.py index f5b56b7d43e..ef707cbfb24 100644 --- a/backend/ee/danswer/server/query_and_chat/chat_backend.py +++ b/backend/ee/danswer/server/query_and_chat/chat_backend.py @@ -18,6 +18,9 @@ from danswer.chat.process_message import stream_chat_message_objects from danswer.configs.constants import MessageType from danswer.configs.danswerbot_configs import DANSWER_BOT_TARGET_CHUNK_PERCENTAGE +from danswer.context.search.models import OptionalSearchSetting +from danswer.context.search.models import RetrievalDetails +from danswer.context.search.models import SavedSearchDoc from danswer.db.chat import create_chat_session from danswer.db.chat import create_new_chat_message from danswer.db.chat import get_or_create_root_message @@ -27,9 +30,6 @@ from danswer.llm.utils import get_max_input_tokens from 
danswer.natural_language_processing.utils import get_tokenizer from danswer.one_shot_answer.qa_utils import combine_message_thread -from danswer.search.models import OptionalSearchSetting -from danswer.search.models import RetrievalDetails -from danswer.search.models import SavedSearchDoc from danswer.secondary_llm_flows.query_expansion import thread_based_query_rephrase from danswer.server.query_and_chat.models import ChatMessageDetail from danswer.server.query_and_chat.models import CreateChatMessageRequest @@ -176,12 +176,14 @@ def handle_simplified_chat_message( chunks_above=0, chunks_below=0, full_doc=chat_message_req.full_doc, + structured_response_format=chat_message_req.structured_response_format, ) packets = stream_chat_message_objects( new_msg_req=full_chat_msg_info, user=user, db_session=db_session, + enforce_chat_session_id_for_search_docs=False, ) return _convert_packet_stream_to_response(packets) @@ -201,7 +203,7 @@ def handle_send_message_simple_with_history( raise HTTPException(status_code=400, detail="Messages cannot be zero length") # This is a sanity check to make sure the chat history is valid - # It must start with a user message and alternate between user and assistant + # It must start with a user message and alternate beteen user and assistant expected_role = MessageType.USER for msg in req.messages: if not msg.message: @@ -295,12 +297,14 @@ def handle_send_message_simple_with_history( chunks_above=0, chunks_below=0, full_doc=req.full_doc, + structured_response_format=req.structured_response_format, ) packets = stream_chat_message_objects( new_msg_req=full_chat_msg_info, user=user, db_session=db_session, + enforce_chat_session_id_for_search_docs=False, ) return _convert_packet_stream_to_response(packets) diff --git a/backend/ee/danswer/server/query_and_chat/models.py b/backend/ee/danswer/server/query_and_chat/models.py index ec9db73ecff..1fd37a21145 100644 --- a/backend/ee/danswer/server/query_and_chat/models.py +++ b/backend/ee/danswer/server/query_and_chat/models.py @@ -1,14 +1,16 @@ +from uuid import UUID + from pydantic import BaseModel from pydantic import Field from danswer.configs.constants import DocumentSource +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import SearchType +from danswer.context.search.models import ChunkContext +from danswer.context.search.models import RerankingDetails +from danswer.context.search.models import RetrievalDetails +from danswer.context.search.models import SavedSearchDoc from danswer.one_shot_answer.models import ThreadMessage -from danswer.search.enums import LLMEvaluationType -from danswer.search.enums import SearchType -from danswer.search.models import ChunkContext -from danswer.search.models import RerankingDetails -from danswer.search.models import RetrievalDetails -from danswer.search.models import SavedSearchDoc from ee.danswer.server.manage.models import StandardAnswer @@ -36,7 +38,7 @@ class BasicCreateChatMessageRequest(ChunkContext): Note, for simplicity this option only allows for a single linear chain of messages """ - chat_session_id: int + chat_session_id: UUID # New message contents message: str # Defaults to using retrieval with no additional filters @@ -46,6 +48,9 @@ class BasicCreateChatMessageRequest(ChunkContext): query_override: str | None = None # If search_doc_ids provided, then retrieval options are unused search_doc_ids: list[int] | None = None + # only works if using an OpenAI model. 
See the following for more details: + # https://platform.openai.com/docs/guides/structured-outputs/introduction + structured_response_format: dict | None = None class BasicCreateChatMessageWithHistoryRequest(ChunkContext): @@ -58,6 +63,9 @@ class BasicCreateChatMessageWithHistoryRequest(ChunkContext): skip_rerank: bool | None = None # If search_doc_ids provided, then retrieval options are unused search_doc_ids: list[int] | None = None + # only works if using an OpenAI model. See the following for more details: + # https://platform.openai.com/docs/guides/structured-outputs/introduction + structured_response_format: dict | None = None class SimpleDoc(BaseModel): diff --git a/backend/ee/danswer/server/query_and_chat/query_backend.py b/backend/ee/danswer/server/query_and_chat/query_backend.py index 59e61ba12df..0b380d5d3f7 100644 --- a/backend/ee/danswer/server/query_and_chat/query_backend.py +++ b/backend/ee/danswer/server/query_and_chat/query_backend.py @@ -6,6 +6,12 @@ from danswer.auth.users import current_user from danswer.configs.danswerbot_configs import DANSWER_BOT_TARGET_CHUNK_PERCENTAGE +from danswer.context.search.models import SavedSearchDocWithContent +from danswer.context.search.models import SearchRequest +from danswer.context.search.pipeline import SearchPipeline +from danswer.context.search.utils import dedupe_documents +from danswer.context.search.utils import drop_llm_indices +from danswer.context.search.utils import relevant_sections_to_indices from danswer.db.engine import get_session from danswer.db.models import User from danswer.db.persona import get_persona_by_id @@ -19,12 +25,6 @@ from danswer.one_shot_answer.answer_question import get_search_answer from danswer.one_shot_answer.models import DirectQARequest from danswer.one_shot_answer.models import OneShotQAResponse -from danswer.search.models import SavedSearchDocWithContent -from danswer.search.models import SearchRequest -from danswer.search.pipeline import SearchPipeline -from danswer.search.utils import dedupe_documents -from danswer.search.utils import drop_llm_indices -from danswer.search.utils import relevant_sections_to_indices from danswer.utils.logger import setup_logger from ee.danswer.danswerbot.slack.handlers.handle_standard_answers import ( oneoff_standard_answers, diff --git a/backend/ee/danswer/server/query_and_chat/token_limit.py b/backend/ee/danswer/server/query_and_chat/token_limit.py index 538458fb63f..5c11a0a8e69 100644 --- a/backend/ee/danswer/server/query_and_chat/token_limit.py +++ b/backend/ee/danswer/server/query_and_chat/token_limit.py @@ -12,7 +12,8 @@ from sqlalchemy import select from sqlalchemy.orm import Session -from danswer.db.engine import get_session_context_manager +from danswer.db.api_key import is_api_key_email_address +from danswer.db.engine import get_session_with_tenant from danswer.db.models import ChatMessage from danswer.db.models import ChatSession from danswer.db.models import TokenRateLimit @@ -20,29 +21,28 @@ from danswer.db.models import User from danswer.db.models import User__UserGroup from danswer.db.models import UserGroup +from danswer.db.token_limit import fetch_all_user_token_rate_limits from danswer.server.query_and_chat.token_limit import _get_cutoff_time from danswer.server.query_and_chat.token_limit import _is_rate_limited from danswer.server.query_and_chat.token_limit import _user_is_rate_limited_by_global from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel -from ee.danswer.db.api_key import is_api_key_email_address -from 
ee.danswer.db.token_limit import fetch_all_user_token_rate_limits -def _check_token_rate_limits(user: User | None) -> None: +def _check_token_rate_limits(user: User | None, tenant_id: str | None) -> None: if user is None: # Unauthenticated users are only rate limited by global settings - _user_is_rate_limited_by_global() + _user_is_rate_limited_by_global(tenant_id) elif is_api_key_email_address(user.email): # API keys are only rate limited by global settings - _user_is_rate_limited_by_global() + _user_is_rate_limited_by_global(tenant_id) else: run_functions_tuples_in_parallel( [ - (_user_is_rate_limited, (user.id,)), - (_user_is_rate_limited_by_group, (user.id,)), - (_user_is_rate_limited_by_global, ()), + (_user_is_rate_limited, (user.id, tenant_id)), + (_user_is_rate_limited_by_group, (user.id, tenant_id)), + (_user_is_rate_limited_by_global, (tenant_id,)), ] ) @@ -52,8 +52,8 @@ def _check_token_rate_limits(user: User | None) -> None: """ -def _user_is_rate_limited(user_id: UUID) -> None: - with get_session_context_manager() as db_session: +def _user_is_rate_limited(user_id: UUID, tenant_id: str | None) -> None: + with get_session_with_tenant(tenant_id) as db_session: user_rate_limits = fetch_all_user_token_rate_limits( db_session=db_session, enabled_only=True, ordered=False ) @@ -93,8 +93,8 @@ def _fetch_user_usage( """ -def _user_is_rate_limited_by_group(user_id: UUID) -> None: - with get_session_context_manager() as db_session: +def _user_is_rate_limited_by_group(user_id: UUID, tenant_id: str | None) -> None: + with get_session_with_tenant(tenant_id) as db_session: group_rate_limits = _fetch_all_user_group_rate_limits(user_id, db_session) if group_rate_limits: diff --git a/backend/ee/danswer/server/query_and_chat/utils.py b/backend/ee/danswer/server/query_and_chat/utils.py index beb970fd1b8..be5507b01c2 100644 --- a/backend/ee/danswer/server/query_and_chat/utils.py +++ b/backend/ee/danswer/server/query_and_chat/utils.py @@ -12,7 +12,9 @@ from danswer.db.models import User from danswer.db.persona import get_prompts_by_ids from danswer.one_shot_answer.models import PersonaConfig -from danswer.tools.custom.custom_tool import build_custom_tools_from_openapi_schema +from danswer.tools.tool_implementations.custom.custom_tool import ( + build_custom_tools_from_openapi_schema_and_headers, +) def create_temporary_persona( @@ -58,7 +60,7 @@ def create_temporary_persona( for schema in persona_config.custom_tools_openapi: tools = cast( list[Tool], - build_custom_tools_from_openapi_schema(schema), + build_custom_tools_from_openapi_schema_and_headers(schema), ) persona.tools.extend(tools) diff --git a/backend/ee/danswer/server/query_history/api.py b/backend/ee/danswer/server/query_history/api.py index 3fc0a98153a..df6175cf271 100644 --- a/backend/ee/danswer/server/query_history/api.py +++ b/backend/ee/danswer/server/query_history/api.py @@ -4,6 +4,7 @@ from datetime import timedelta from datetime import timezone from typing import Literal +from uuid import UUID from fastapi import APIRouter from fastapi import Depends @@ -19,10 +20,13 @@ from danswer.configs.constants import QAFeedbackType from danswer.configs.constants import SessionType from danswer.db.chat import get_chat_session_by_id +from danswer.db.chat import get_chat_sessions_by_user from danswer.db.engine import get_session from danswer.db.models import ChatMessage from danswer.db.models import ChatSession from danswer.db.models import User +from danswer.server.query_and_chat.models import ChatSessionDetails +from 
danswer.server.query_and_chat.models import ChatSessionsResponse from ee.danswer.db.query_history import fetch_chat_sessions_eagerly_by_time router = APIRouter() @@ -83,29 +87,32 @@ def build(cls, message: ChatMessage) -> "MessageSnapshot": class ChatSessionMinimal(BaseModel): - id: int + id: UUID user_email: str name: str | None first_user_message: str first_ai_message: str - persona_name: str | None + assistant_id: int | None + assistant_name: str | None time_created: datetime feedback_type: QAFeedbackType | Literal["mixed"] | None flow_type: SessionType + conversation_length: int class ChatSessionSnapshot(BaseModel): - id: int + id: UUID user_email: str name: str | None messages: list[MessageSnapshot] - persona_name: str | None + assistant_id: int | None + assistant_name: str | None time_created: datetime flow_type: SessionType class QuestionAnswerPairSnapshot(BaseModel): - chat_session_id: int + chat_session_id: UUID # 1-indexed message number in the chat_session # e.g. the first message pair in the chat_session is 1, the second is 2, etc. message_pair_num: int @@ -142,7 +149,7 @@ def from_chat_session_snapshot( retrieved_documents=ai_message.documents, feedback_type=ai_message.feedback_type, feedback_text=ai_message.feedback_text, - persona_name=chat_session_snapshot.persona_name, + persona_name=chat_session_snapshot.assistant_name, user_email=get_display_email(chat_session_snapshot.user_email), time_created=user_message.time_created, flow_type=chat_session_snapshot.flow_type, @@ -253,12 +260,20 @@ def fetch_and_process_chat_session_history_minimal( name=chat_session.description, first_user_message=first_user_message, first_ai_message=first_ai_message, - persona_name=chat_session.persona.name - if chat_session.persona - else None, + assistant_id=chat_session.persona_id, + assistant_name=( + chat_session.persona.name if chat_session.persona else None + ), time_created=chat_session.time_created, feedback_type=feedback_type, flow_type=flow_type, + conversation_length=len( + [ + m + for m in chat_session.messages + if m.message_type != MessageType.SYSTEM + ] + ), ) ) @@ -323,12 +338,43 @@ def snapshot_from_chat_session( for message in messages if message.message_type != MessageType.SYSTEM ], - persona_name=chat_session.persona.name if chat_session.persona else None, + assistant_id=chat_session.persona_id, + assistant_name=chat_session.persona.name if chat_session.persona else None, time_created=chat_session.time_created, flow_type=flow_type, ) +@router.get("/admin/chat-sessions") +def get_user_chat_sessions( + user_id: UUID, + _: User | None = Depends(current_admin_user), + db_session: Session = Depends(get_session), +) -> ChatSessionsResponse: + try: + chat_sessions = get_chat_sessions_by_user( + user_id=user_id, deleted=False, db_session=db_session, limit=0 + ) + + except ValueError: + raise ValueError("Chat session does not exist or has been deleted") + + return ChatSessionsResponse( + sessions=[ + ChatSessionDetails( + id=chat.id, + name=chat.description, + persona_id=chat.persona_id, + time_created=chat.time_created.isoformat(), + shared_status=chat.shared_status, + folder_id=chat.folder_id, + current_alternate_model=chat.current_alternate_model, + ) + for chat in chat_sessions + ] + ) + + @router.get("/admin/chat-session-history") def get_chat_session_history( feedback_type: QAFeedbackType | None = None, @@ -350,7 +396,7 @@ def get_chat_session_history( @router.get("/admin/chat-session-history/{chat_session_id}") def get_chat_session_admin( - chat_session_id: int, + chat_session_id: 
UUID, _: User | None = Depends(current_admin_user), db_session: Session = Depends(get_session), ) -> ChatSessionSnapshot: @@ -381,12 +427,14 @@ def get_chat_session_admin( @router.get("/admin/query-history-csv") def get_query_history_as_csv( _: User | None = Depends(current_admin_user), + start: datetime | None = None, + end: datetime | None = None, db_session: Session = Depends(get_session), ) -> StreamingResponse: complete_chat_session_history = fetch_and_process_chat_session_history( db_session=db_session, - start=datetime.fromtimestamp(0, tz=timezone.utc), - end=datetime.now(tz=timezone.utc), + start=start or datetime.fromtimestamp(0, tz=timezone.utc), + end=end or datetime.now(tz=timezone.utc), feedback_type=None, limit=None, ) diff --git a/backend/ee/danswer/server/reporting/usage_export_models.py b/backend/ee/danswer/server/reporting/usage_export_models.py index 98d9021f816..21cd104e862 100644 --- a/backend/ee/danswer/server/reporting/usage_export_models.py +++ b/backend/ee/danswer/server/reporting/usage_export_models.py @@ -1,5 +1,6 @@ from datetime import datetime from enum import Enum +from uuid import UUID from pydantic import BaseModel @@ -14,7 +15,7 @@ class FlowType(str, Enum): class ChatMessageSkeleton(BaseModel): message_id: int - chat_session_id: int + chat_session_id: UUID user_id: str | None flow_type: FlowType time_sent: datetime diff --git a/backend/ee/danswer/server/saml.py b/backend/ee/danswer/server/saml.py index 38966c15756..20c786af144 100644 --- a/backend/ee/danswer/server/saml.py +++ b/backend/ee/danswer/server/saml.py @@ -12,7 +12,7 @@ from fastapi_users.password import PasswordHelper from onelogin.saml2.auth import OneLogin_Saml2_Auth # type: ignore from pydantic import BaseModel -from pydantic import EmailStr +from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.orm import Session from danswer.auth.schemas import UserCreate @@ -61,11 +61,10 @@ async def upsert_saml_user(email: str) -> User: user: User = await user_manager.create( UserCreate( - email=EmailStr(email), + email=email, password=hashed_pass, is_verified=True, role=role, - has_web_login=True, ) ) @@ -172,15 +171,19 @@ async def saml_login_callback( @router.post("/logout") -def saml_logout( +async def saml_logout( request: Request, - db_session: Session = Depends(get_session), + async_db_session: AsyncSession = Depends(get_async_session), ) -> None: saved_cookie = extract_hashed_cookie(request) if saved_cookie: - saml_account = get_saml_account(cookie=saved_cookie, db_session=db_session) + saml_account = await get_saml_account( + cookie=saved_cookie, async_db_session=async_db_session + ) if saml_account: - expire_saml_account(saml_account, db_session) + await expire_saml_account( + saml_account=saml_account, async_db_session=async_db_session + ) return diff --git a/backend/ee/danswer/server/seeding.py b/backend/ee/danswer/server/seeding.py index ab6c4b017f9..7aa87379221 100644 --- a/backend/ee/danswer/server/seeding.py +++ b/backend/ee/danswer/server/seeding.py @@ -1,13 +1,18 @@ +import json import os +from copy import deepcopy +from typing import List +from typing import Optional from pydantic import BaseModel from sqlalchemy.orm import Session +from danswer.context.search.enums import RecencyBiasSetting from danswer.db.engine import get_session_context_manager from danswer.db.llm import update_default_provider from danswer.db.llm import upsert_llm_provider +from danswer.db.models import Tool from danswer.db.persona import upsert_persona -from danswer.search.enums import 
RecencyBiasSetting from danswer.server.features.persona.models import CreatePersonaRequest from danswer.server.manage.llm.models import LLMProviderUpsertRequest from danswer.server.settings.models import Settings @@ -18,6 +23,7 @@ ) from ee.danswer.server.enterprise_settings.models import AnalyticsScriptUpload from ee.danswer.server.enterprise_settings.models import EnterpriseSettings +from ee.danswer.server.enterprise_settings.models import NavigationItem from ee.danswer.server.enterprise_settings.store import store_analytics_script from ee.danswer.server.enterprise_settings.store import ( store_settings as store_ee_settings, @@ -25,11 +31,28 @@ from ee.danswer.server.enterprise_settings.store import upload_logo +class CustomToolSeed(BaseModel): + name: str + description: str + definition_path: str + custom_headers: Optional[List[dict]] = None + display_name: Optional[str] = None + in_code_tool_id: Optional[str] = None + user_id: Optional[str] = None + + logger = setup_logger() _SEED_CONFIG_ENV_VAR_NAME = "ENV_SEED_CONFIGURATION" +class NavigationItemSeed(BaseModel): + link: str + title: str + # NOTE: SVG at this path must not have a width / height specified + svg_path: str + + class SeedConfiguration(BaseModel): llms: list[LLMProviderUpsertRequest] | None = None admin_user_emails: list[str] | None = None @@ -37,18 +60,60 @@ class SeedConfiguration(BaseModel): personas: list[CreatePersonaRequest] | None = None settings: Settings | None = None enterprise_settings: EnterpriseSettings | None = None + + # allows for specifying custom navigation items that have your own custom SVG logos + nav_item_overrides: list[NavigationItemSeed] | None = None + # Use existing `CUSTOM_ANALYTICS_SECRET_KEY` for reference analytics_script_path: str | None = None + custom_tools: List[CustomToolSeed] | None = None def _parse_env() -> SeedConfiguration | None: seed_config_str = os.getenv(_SEED_CONFIG_ENV_VAR_NAME) if not seed_config_str: return None - seed_config = SeedConfiguration.parse_raw(seed_config_str) + seed_config = SeedConfiguration.model_validate_json(seed_config_str) return seed_config +def _seed_custom_tools(db_session: Session, tools: List[CustomToolSeed]) -> None: + if tools: + logger.notice("Seeding Custom Tools") + for tool in tools: + try: + logger.debug(f"Attempting to seed tool: {tool.name}") + logger.debug(f"Reading definition from: {tool.definition_path}") + with open(tool.definition_path, "r") as file: + file_content = file.read() + if not file_content.strip(): + raise ValueError("File is empty") + openapi_schema = json.loads(file_content) + db_tool = Tool( + name=tool.name, + description=tool.description, + openapi_schema=openapi_schema, + custom_headers=tool.custom_headers, + display_name=tool.display_name, + in_code_tool_id=tool.in_code_tool_id, + user_id=tool.user_id, + ) + db_session.add(db_tool) + logger.debug(f"Successfully added tool: {tool.name}") + except FileNotFoundError: + logger.error( + f"Definition file not found for tool {tool.name}: {tool.definition_path}" + ) + except json.JSONDecodeError as e: + logger.error( + f"Invalid JSON in definition file for tool {tool.name}: {str(e)}" + ) + except Exception as e: + logger.error(f"Failed to seed tool {tool.name}: {str(e)}") + db_session.commit() + logger.notice(f"Successfully seeded {len(tools)} Custom Tools") + + def _seed_llms( db_session: Session, llm_upsert_requests: list[LLMProviderUpsertRequest] ) -> None: @@ -85,6 +150,7 @@ def _seed_personas(db_session: Session, personas: list[CreatePersonaRequest]) -> 
is_public=persona.is_public, db_session=db_session, tool_ids=persona.tool_ids, + display_priority=persona.display_priority, ) @@ -99,9 +165,35 @@ def _seed_settings(settings: Settings) -> None: def _seed_enterprise_settings(seed_config: SeedConfiguration) -> None: - if seed_config.enterprise_settings is not None: + if ( + seed_config.enterprise_settings is not None + or seed_config.nav_item_overrides is not None + ): + final_enterprise_settings = ( + deepcopy(seed_config.enterprise_settings) + if seed_config.enterprise_settings + else EnterpriseSettings() + ) + + final_nav_items = final_enterprise_settings.custom_nav_items + if seed_config.nav_item_overrides is not None: + final_nav_items = [] + for item in seed_config.nav_item_overrides: + with open(item.svg_path, "r") as file: + svg_content = file.read().strip() + + final_nav_items.append( + NavigationItem( + link=item.link, + title=item.title, + svg_logo=svg_content, + ) + ) + + final_enterprise_settings.custom_nav_items = final_nav_items + logger.notice("Seeding enterprise settings") - store_ee_settings(seed_config.enterprise_settings) + store_ee_settings(final_enterprise_settings) def _seed_logo(db_session: Session, logo_path: str | None) -> None: @@ -146,6 +238,8 @@ def seed_db() -> None: _seed_personas(db_session, seed_config.personas) if seed_config.settings is not None: _seed_settings(seed_config.settings) + if seed_config.custom_tools is not None: + _seed_custom_tools(db_session, seed_config.custom_tools) _seed_logo(db_session, seed_config.seeded_logo_path) _seed_enterprise_settings(seed_config) diff --git a/backend/ee/danswer/server/tenants/__init__.py b/backend/ee/danswer/server/tenants/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/ee/danswer/server/tenants/access.py b/backend/ee/danswer/server/tenants/access.py new file mode 100644 index 00000000000..255e6c0ea94 --- /dev/null +++ b/backend/ee/danswer/server/tenants/access.py @@ -0,0 +1,53 @@ +from datetime import datetime +from datetime import timedelta + +import jwt +from fastapi import HTTPException +from fastapi import Request + +from danswer.configs.app_configs import DATA_PLANE_SECRET +from danswer.configs.app_configs import EXPECTED_API_KEY +from danswer.configs.app_configs import JWT_ALGORITHM +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +def generate_data_plane_token() -> str: + if DATA_PLANE_SECRET is None: + raise ValueError("DATA_PLANE_SECRET is not set") + + payload = { + "iss": "data_plane", + "exp": datetime.utcnow() + timedelta(minutes=5), + "iat": datetime.utcnow(), + "scope": "api_access", + } + + token = jwt.encode(payload, DATA_PLANE_SECRET, algorithm=JWT_ALGORITHM) + return token + + +async def control_plane_dep(request: Request) -> None: + api_key = request.headers.get("X-API-KEY") + if api_key != EXPECTED_API_KEY: + logger.warning("Invalid API key") + raise HTTPException(status_code=401, detail="Invalid API key") + + auth_header = request.headers.get("Authorization") + if not auth_header or not auth_header.startswith("Bearer "): + logger.warning("Invalid authorization header") + raise HTTPException(status_code=401, detail="Invalid authorization header") + + token = auth_header.split(" ")[1] + try: + payload = jwt.decode(token, DATA_PLANE_SECRET, algorithms=[JWT_ALGORITHM]) + if payload.get("scope") != "tenant:create": + logger.warning("Insufficient permissions") + raise HTTPException(status_code=403, detail="Insufficient permissions") + except jwt.ExpiredSignatureError: + 
logger.warning("Token has expired") + raise HTTPException(status_code=401, detail="Token has expired") + except jwt.InvalidTokenError: + logger.warning("Invalid token") + raise HTTPException(status_code=401, detail="Invalid token") diff --git a/backend/ee/danswer/server/tenants/api.py b/backend/ee/danswer/server/tenants/api.py new file mode 100644 index 00000000000..8c1331c15a6 --- /dev/null +++ b/backend/ee/danswer/server/tenants/api.py @@ -0,0 +1,116 @@ +import stripe +from fastapi import APIRouter +from fastapi import Depends +from fastapi import HTTPException +from fastapi import Response + +from danswer.auth.users import auth_backend +from danswer.auth.users import current_admin_user +from danswer.auth.users import get_jwt_strategy +from danswer.auth.users import User +from danswer.configs.app_configs import WEB_DOMAIN +from danswer.db.engine import get_session_with_tenant +from danswer.db.notification import create_notification +from danswer.db.users import get_user_by_email +from danswer.server.settings.store import load_settings +from danswer.server.settings.store import store_settings +from danswer.utils.logger import setup_logger +from ee.danswer.auth.users import current_cloud_superuser +from ee.danswer.configs.app_configs import STRIPE_SECRET_KEY +from ee.danswer.server.tenants.access import control_plane_dep +from ee.danswer.server.tenants.billing import fetch_billing_information +from ee.danswer.server.tenants.billing import fetch_tenant_stripe_information +from ee.danswer.server.tenants.models import BillingInformation +from ee.danswer.server.tenants.models import ImpersonateRequest +from ee.danswer.server.tenants.models import ProductGatingRequest +from ee.danswer.server.tenants.user_mapping import get_tenant_id_for_email +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR + +stripe.api_key = STRIPE_SECRET_KEY + +logger = setup_logger() +router = APIRouter(prefix="/tenants") + + +@router.post("/product-gating") +def gate_product( + product_gating_request: ProductGatingRequest, _: None = Depends(control_plane_dep) +) -> None: + """ + Gating the product means that the product is not available to the tenant. + They will be directed to the billing page. 
+ We gate the product when + 1) User has ended free trial without adding payment method + 2) User's card has declined + """ + tenant_id = product_gating_request.tenant_id + token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + settings = load_settings() + settings.product_gating = product_gating_request.product_gating + store_settings(settings) + + if product_gating_request.notification: + with get_session_with_tenant(tenant_id) as db_session: + create_notification(None, product_gating_request.notification, db_session) + + if token is not None: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + + +@router.get("/billing-information", response_model=BillingInformation) +async def billing_information( + _: User = Depends(current_admin_user), +) -> BillingInformation: + logger.info("Fetching billing information") + return BillingInformation( + **fetch_billing_information(CURRENT_TENANT_ID_CONTEXTVAR.get()) + ) + + +@router.post("/create-customer-portal-session") +async def create_customer_portal_session(_: User = Depends(current_admin_user)) -> dict: + try: + # Fetch tenant_id and current tenant's information + tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() + stripe_info = fetch_tenant_stripe_information(tenant_id) + stripe_customer_id = stripe_info.get("stripe_customer_id") + if not stripe_customer_id: + raise HTTPException(status_code=400, detail="Stripe customer ID not found") + logger.info(stripe_customer_id) + portal_session = stripe.billing_portal.Session.create( + customer=stripe_customer_id, + return_url=f"{WEB_DOMAIN}/admin/cloud-settings", + ) + logger.info(portal_session) + return {"url": portal_session.url} + except Exception as e: + logger.exception("Failed to create customer portal session") + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/impersonate") +async def impersonate_user( + impersonate_request: ImpersonateRequest, + _: User = Depends(current_cloud_superuser), +) -> Response: + """Allows a cloud superuser to impersonate another user by generating an impersonation JWT token""" + tenant_id = get_tenant_id_for_email(impersonate_request.email) + + with get_session_with_tenant(tenant_id) as tenant_session: + user_to_impersonate = get_user_by_email( + impersonate_request.email, tenant_session + ) + if user_to_impersonate is None: + raise HTTPException(status_code=404, detail="User not found") + token = await get_jwt_strategy().write_token(user_to_impersonate) + + response = await auth_backend.transport.get_login_response(token) + response.set_cookie( + key="fastapiusersauth", + value=token, + httponly=True, + secure=True, + samesite="lax", + ) + return response diff --git a/backend/ee/danswer/server/tenants/billing.py b/backend/ee/danswer/server/tenants/billing.py new file mode 100644 index 00000000000..681ac835e5f --- /dev/null +++ b/backend/ee/danswer/server/tenants/billing.py @@ -0,0 +1,67 @@ +from typing import cast + +import requests +import stripe + +from danswer.configs.app_configs import CONTROL_PLANE_API_BASE_URL +from danswer.utils.logger import setup_logger +from ee.danswer.configs.app_configs import STRIPE_PRICE_ID +from ee.danswer.configs.app_configs import STRIPE_SECRET_KEY +from ee.danswer.server.tenants.access import generate_data_plane_token + +stripe.api_key = STRIPE_SECRET_KEY + +logger = setup_logger() + + +def fetch_tenant_stripe_information(tenant_id: str) -> dict: + token = generate_data_plane_token() + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + url = 
f"{CONTROL_PLANE_API_BASE_URL}/tenant-stripe-information" + params = {"tenant_id": tenant_id} + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() + return response.json() + + +def fetch_billing_information(tenant_id: str) -> dict: + logger.info("Fetching billing information") + token = generate_data_plane_token() + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + url = f"{CONTROL_PLANE_API_BASE_URL}/billing-information" + params = {"tenant_id": tenant_id} + response = requests.get(url, headers=headers, params=params) + response.raise_for_status() + billing_info = response.json() + return billing_info + + +def register_tenant_users(tenant_id: str, number_of_users: int) -> stripe.Subscription: + """ + Send a request to the control service to register the number of users for a tenant. + """ + if not STRIPE_PRICE_ID: + raise Exception("STRIPE_PRICE_ID is not set") + + response = fetch_tenant_stripe_information(tenant_id) + stripe_subscription_id = cast(str, response.get("stripe_subscription_id")) + + subscription = stripe.Subscription.retrieve(stripe_subscription_id) + updated_subscription = stripe.Subscription.modify( + stripe_subscription_id, + items=[ + { + "id": subscription["items"]["data"][0].id, + "price": STRIPE_PRICE_ID, + "quantity": number_of_users, + } + ], + metadata={"tenant_id": str(tenant_id)}, + ) + return updated_subscription diff --git a/backend/ee/danswer/server/tenants/models.py b/backend/ee/danswer/server/tenants/models.py new file mode 100644 index 00000000000..c372418f6a4 --- /dev/null +++ b/backend/ee/danswer/server/tenants/models.py @@ -0,0 +1,41 @@ +from pydantic import BaseModel + +from danswer.configs.constants import NotificationType +from danswer.server.settings.models import GatingType + + +class CheckoutSessionCreationRequest(BaseModel): + quantity: int + + +class CreateTenantRequest(BaseModel): + tenant_id: str + initial_admin_email: str + + +class ProductGatingRequest(BaseModel): + tenant_id: str + product_gating: GatingType + notification: NotificationType | None = None + + +class BillingInformation(BaseModel): + seats: int + subscription_status: str + billing_start: str + billing_end: str + payment_method_enabled: bool + + +class CheckoutSessionCreationResponse(BaseModel): + id: str + + +class ImpersonateRequest(BaseModel): + email: str + + +class TenantCreationPayload(BaseModel): + tenant_id: str + email: str + referral_source: str | None = None diff --git a/backend/ee/danswer/server/tenants/provisioning.py b/backend/ee/danswer/server/tenants/provisioning.py new file mode 100644 index 00000000000..1cc07210e56 --- /dev/null +++ b/backend/ee/danswer/server/tenants/provisioning.py @@ -0,0 +1,269 @@ +import asyncio +import logging +import uuid + +import aiohttp # Async HTTP client +from fastapi import HTTPException +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.auth.users import exceptions +from danswer.configs.app_configs import CONTROL_PLANE_API_BASE_URL +from danswer.db.engine import get_session_with_tenant +from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.llm import update_default_provider +from danswer.db.llm import upsert_cloud_embedding_provider +from danswer.db.llm import upsert_llm_provider +from danswer.db.models import IndexModelStatus +from danswer.db.models import SearchSettings +from danswer.db.models import UserTenantMapping +from danswer.llm.llm_provider_options import ANTHROPIC_MODEL_NAMES +from 
danswer.llm.llm_provider_options import ANTHROPIC_PROVIDER_NAME +from danswer.llm.llm_provider_options import OPEN_AI_MODEL_NAMES +from danswer.llm.llm_provider_options import OPENAI_PROVIDER_NAME +from danswer.server.manage.embedding.models import CloudEmbeddingProviderCreationRequest +from danswer.server.manage.llm.models import LLMProviderUpsertRequest +from danswer.setup import setup_danswer +from ee.danswer.configs.app_configs import ANTHROPIC_DEFAULT_API_KEY +from ee.danswer.configs.app_configs import COHERE_DEFAULT_API_KEY +from ee.danswer.configs.app_configs import OPENAI_DEFAULT_API_KEY +from ee.danswer.server.tenants.access import generate_data_plane_token +from ee.danswer.server.tenants.models import TenantCreationPayload +from ee.danswer.server.tenants.schema_management import create_schema_if_not_exists +from ee.danswer.server.tenants.schema_management import drop_schema +from ee.danswer.server.tenants.schema_management import run_alembic_migrations +from ee.danswer.server.tenants.user_mapping import add_users_to_tenant +from ee.danswer.server.tenants.user_mapping import get_tenant_id_for_email +from ee.danswer.server.tenants.user_mapping import user_owns_a_tenant +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.configs import TENANT_ID_PREFIX +from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR +from shared_configs.enums import EmbeddingProvider + +logger = logging.getLogger(__name__) + + +async def get_or_create_tenant_id( + email: str, referral_source: str | None = None +) -> str: + """Get existing tenant ID for an email or create a new tenant if none exists.""" + if not MULTI_TENANT: + return POSTGRES_DEFAULT_SCHEMA + + try: + tenant_id = get_tenant_id_for_email(email) + except exceptions.UserNotExists: + # If tenant does not exist and in Multi tenant mode, provision a new tenant + try: + tenant_id = await create_tenant(email, referral_source) + except Exception as e: + logger.error(f"Tenant provisioning failed: {e}") + raise HTTPException(status_code=500, detail="Failed to provision tenant.") + + if not tenant_id: + raise HTTPException( + status_code=401, detail="User does not belong to an organization" + ) + + return tenant_id + + +async def create_tenant(email: str, referral_source: str | None = None) -> str: + tenant_id = TENANT_ID_PREFIX + str(uuid.uuid4()) + try: + # Provision tenant on data plane + await provision_tenant(tenant_id, email) + # Notify control plane + await notify_control_plane(tenant_id, email, referral_source) + except Exception as e: + logger.error(f"Tenant provisioning failed: {e}") + await rollback_tenant_provisioning(tenant_id) + raise HTTPException(status_code=500, detail="Failed to provision tenant.") + return tenant_id + + +async def provision_tenant(tenant_id: str, email: str) -> None: + if not MULTI_TENANT: + raise HTTPException(status_code=403, detail="Multi-tenancy is not enabled") + + if user_owns_a_tenant(email): + raise HTTPException( + status_code=409, detail="User already belongs to an organization" + ) + + logger.info(f"Provisioning tenant: {tenant_id}") + token = None + + try: + if not create_schema_if_not_exists(tenant_id): + logger.info(f"Created schema for tenant {tenant_id}") + else: + logger.info(f"Schema already exists for tenant {tenant_id}") + + token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id) + + # Await the Alembic migrations + await asyncio.to_thread(run_alembic_migrations, tenant_id) + + with 
get_session_with_tenant(tenant_id) as db_session: + configure_default_api_keys(db_session) + + current_search_settings = ( + db_session.query(SearchSettings) + .filter_by(status=IndexModelStatus.FUTURE) + .first() + ) + cohere_enabled = ( + current_search_settings is not None + and current_search_settings.provider_type == EmbeddingProvider.COHERE + ) + setup_danswer(db_session, tenant_id, cohere_enabled=cohere_enabled) + + add_users_to_tenant([email], tenant_id) + + except Exception as e: + logger.exception(f"Failed to create tenant {tenant_id}") + raise HTTPException( + status_code=500, detail=f"Failed to create tenant: {str(e)}" + ) + finally: + if token is not None: + CURRENT_TENANT_ID_CONTEXTVAR.reset(token) + + +async def notify_control_plane( + tenant_id: str, email: str, referral_source: str | None = None +) -> None: + logger.info("Fetching billing information") + token = generate_data_plane_token() + headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + payload = TenantCreationPayload( + tenant_id=tenant_id, email=email, referral_source=referral_source + ) + + async with aiohttp.ClientSession() as session: + async with session.post( + f"{CONTROL_PLANE_API_BASE_URL}/tenants/create", + headers=headers, + json=payload.model_dump(), + ) as response: + if response.status != 200: + error_text = await response.text() + logger.error(f"Control plane tenant creation failed: {error_text}") + raise Exception( + f"Failed to create tenant on control plane: {error_text}" + ) + + +async def rollback_tenant_provisioning(tenant_id: str) -> None: + # Logic to rollback tenant provisioning on data plane + logger.info(f"Rolling back tenant provisioning for tenant_id: {tenant_id}") + try: + # Drop the tenant's schema to rollback provisioning + drop_schema(tenant_id) + # Remove tenant mapping + with Session(get_sqlalchemy_engine()) as db_session: + db_session.query(UserTenantMapping).filter( + UserTenantMapping.tenant_id == tenant_id + ).delete() + db_session.commit() + except Exception as e: + logger.error(f"Failed to rollback tenant provisioning: {e}") + + +def configure_default_api_keys(db_session: Session) -> None: + if OPENAI_DEFAULT_API_KEY: + open_provider = LLMProviderUpsertRequest( + name="OpenAI", + provider=OPENAI_PROVIDER_NAME, + api_key=OPENAI_DEFAULT_API_KEY, + default_model_name="gpt-4", + fast_default_model_name="gpt-4o-mini", + model_names=OPEN_AI_MODEL_NAMES, + ) + try: + full_provider = upsert_llm_provider(open_provider, db_session) + update_default_provider(full_provider.id, db_session) + except Exception as e: + logger.error(f"Failed to configure OpenAI provider: {e}") + else: + logger.error( + "OPENAI_DEFAULT_API_KEY not set, skipping OpenAI provider configuration" + ) + + if ANTHROPIC_DEFAULT_API_KEY: + anthropic_provider = LLMProviderUpsertRequest( + name="Anthropic", + provider=ANTHROPIC_PROVIDER_NAME, + api_key=ANTHROPIC_DEFAULT_API_KEY, + default_model_name="claude-3-5-sonnet-20241022", + fast_default_model_name="claude-3-5-sonnet-20241022", + model_names=ANTHROPIC_MODEL_NAMES, + ) + try: + full_provider = upsert_llm_provider(anthropic_provider, db_session) + update_default_provider(full_provider.id, db_session) + except Exception as e: + logger.error(f"Failed to configure Anthropic provider: {e}") + else: + logger.error( + "ANTHROPIC_DEFAULT_API_KEY not set, skipping Anthropic provider configuration" + ) + + if COHERE_DEFAULT_API_KEY: + cloud_embedding_provider = CloudEmbeddingProviderCreationRequest( + 
provider_type=EmbeddingProvider.COHERE, + api_key=COHERE_DEFAULT_API_KEY, + ) + + try: + logger.info("Attempting to upsert Cohere cloud embedding provider") + upsert_cloud_embedding_provider(db_session, cloud_embedding_provider) + logger.info("Successfully upserted Cohere cloud embedding provider") + + logger.info("Updating search settings with Cohere embedding model details") + query = ( + select(SearchSettings) + .where(SearchSettings.status == IndexModelStatus.FUTURE) + .order_by(SearchSettings.id.desc()) + ) + result = db_session.execute(query) + current_search_settings = result.scalars().first() + + if current_search_settings: + current_search_settings.model_name = ( + "embed-english-v3.0" # Cohere's latest model as of now + ) + current_search_settings.model_dim = ( + 1024 # Cohere's embed-english-v3.0 dimension + ) + current_search_settings.provider_type = EmbeddingProvider.COHERE + current_search_settings.index_name = ( + "danswer_chunk_cohere_embed_english_v3_0" + ) + current_search_settings.query_prefix = "" + current_search_settings.passage_prefix = "" + db_session.commit() + else: + raise RuntimeError( + "No search settings specified, DB is not in a valid state" + ) + logger.info("Fetching updated search settings to verify changes") + updated_query = ( + select(SearchSettings) + .where(SearchSettings.status == IndexModelStatus.PRESENT) + .order_by(SearchSettings.id.desc()) + ) + updated_result = db_session.execute(updated_query) + updated_result.scalars().first() + + except Exception: + logger.exception("Failed to configure Cohere embedding provider") + else: + logger.info( + "COHERE_DEFAULT_API_KEY not set, skipping Cohere embedding provider configuration" + ) diff --git a/backend/ee/danswer/server/tenants/schema_management.py b/backend/ee/danswer/server/tenants/schema_management.py new file mode 100644 index 00000000000..9be4e79f984 --- /dev/null +++ b/backend/ee/danswer/server/tenants/schema_management.py @@ -0,0 +1,76 @@ +import logging +import os +from types import SimpleNamespace + +from sqlalchemy import text +from sqlalchemy.orm import Session +from sqlalchemy.schema import CreateSchema + +from alembic import command +from alembic.config import Config +from danswer.db.engine import build_connection_string +from danswer.db.engine import get_sqlalchemy_engine + +logger = logging.getLogger(__name__) + + +def run_alembic_migrations(schema_name: str) -> None: + logger.info(f"Starting Alembic migrations for schema: {schema_name}") + + try: + current_dir = os.path.dirname(os.path.abspath(__file__)) + root_dir = os.path.abspath(os.path.join(current_dir, "..", "..", "..", "..")) + alembic_ini_path = os.path.join(root_dir, "alembic.ini") + + # Configure Alembic + alembic_cfg = Config(alembic_ini_path) + alembic_cfg.set_main_option("sqlalchemy.url", build_connection_string()) + alembic_cfg.set_main_option( + "script_location", os.path.join(root_dir, "alembic") + ) + + # Ensure that logging isn't broken + alembic_cfg.attributes["configure_logger"] = False + + # Mimic command-line options by adding 'cmd_opts' to the config + alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore + alembic_cfg.cmd_opts.x = [f"schema={schema_name}"] # type: ignore + + # Run migrations programmatically + command.upgrade(alembic_cfg, "head") + + # Run migrations programmatically + logger.info( + f"Alembic migrations completed successfully for schema: {schema_name}" + ) + + except Exception as e: + logger.exception(f"Alembic migration failed for schema {schema_name}: {str(e)}") + raise + + +def 
create_schema_if_not_exists(tenant_id: str) -> bool: + with Session(get_sqlalchemy_engine()) as db_session: + with db_session.begin(): + result = db_session.execute( + text( + "SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema_name" + ), + {"schema_name": tenant_id}, + ) + schema_exists = result.scalar() is not None + if not schema_exists: + stmt = CreateSchema(tenant_id) + db_session.execute(stmt) + return True + return False + + +def drop_schema(tenant_id: str) -> None: + if not tenant_id.isidentifier(): + raise ValueError("Invalid tenant_id.") + with get_sqlalchemy_engine().connect() as connection: + connection.execute( + text("DROP SCHEMA IF EXISTS %(schema_name)s CASCADE"), + {"schema_name": tenant_id}, + ) diff --git a/backend/ee/danswer/server/tenants/user_mapping.py b/backend/ee/danswer/server/tenants/user_mapping.py new file mode 100644 index 00000000000..cf0e5ec5f21 --- /dev/null +++ b/backend/ee/danswer/server/tenants/user_mapping.py @@ -0,0 +1,70 @@ +import logging + +from fastapi_users import exceptions +from sqlalchemy import select +from sqlalchemy.orm import Session + +from danswer.db.engine import get_session_with_tenant +from danswer.db.engine import get_sqlalchemy_engine +from danswer.db.models import UserTenantMapping +from shared_configs.configs import MULTI_TENANT +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA + +logger = logging.getLogger(__name__) + + +def get_tenant_id_for_email(email: str) -> str: + if not MULTI_TENANT: + return POSTGRES_DEFAULT_SCHEMA + # Implement logic to get tenant_id from the mapping table + with Session(get_sqlalchemy_engine()) as db_session: + result = db_session.execute( + select(UserTenantMapping.tenant_id).where(UserTenantMapping.email == email) + ) + tenant_id = result.scalar_one_or_none() + if tenant_id is None: + raise exceptions.UserNotExists() + return tenant_id + + +def user_owns_a_tenant(email: str) -> bool: + with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session: + result = ( + db_session.query(UserTenantMapping) + .filter(UserTenantMapping.email == email) + .first() + ) + return result is not None + + +def add_users_to_tenant(emails: list[str], tenant_id: str) -> None: + with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session: + try: + for email in emails: + db_session.add(UserTenantMapping(email=email, tenant_id=tenant_id)) + except Exception: + logger.exception(f"Failed to add users to tenant {tenant_id}") + db_session.commit() + + +def remove_users_from_tenant(emails: list[str], tenant_id: str) -> None: + with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session: + try: + mappings_to_delete = ( + db_session.query(UserTenantMapping) + .filter( + UserTenantMapping.email.in_(emails), + UserTenantMapping.tenant_id == tenant_id, + ) + .all() + ) + + for mapping in mappings_to_delete: + db_session.delete(mapping) + + db_session.commit() + except Exception as e: + logger.exception( + f"Failed to remove users from tenant {tenant_id}: {str(e)}" + ) + db_session.rollback() diff --git a/backend/ee/danswer/server/token_rate_limits/api.py b/backend/ee/danswer/server/token_rate_limits/api.py index 97f1f15faed..5006b34cf11 100644 --- a/backend/ee/danswer/server/token_rate_limits/api.py +++ b/backend/ee/danswer/server/token_rate_limits/api.py @@ -8,14 +8,14 @@ from danswer.auth.users import current_curator_or_admin_user from danswer.db.engine import get_session from danswer.db.models import User +from danswer.db.token_limit import fetch_all_user_token_rate_limits 
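# --- Illustrative sketch, not part of the diff: how the provisioning helpers introduced
# above (schema_management.py and user_mapping.py) compose. The async provision_tenant()
# in provisioning.py is the real entry point; this is only a condensed, synchronous
# walkthrough, and the helper name provision_tenant_sketch is hypothetical.
from ee.danswer.server.tenants.schema_management import (
    create_schema_if_not_exists,
    drop_schema,
    run_alembic_migrations,
)
from ee.danswer.server.tenants.user_mapping import add_users_to_tenant


def provision_tenant_sketch(tenant_id: str, admin_email: str) -> None:
    try:
        # 1) Ensure the per-tenant Postgres schema exists
        create_schema_if_not_exists(tenant_id)
        # 2) Bring that schema up to the latest Alembic revision
        #    (the schema name is passed through as an "-x schema=<tenant_id>" option)
        run_alembic_migrations(tenant_id)
        # 3) Record the user -> tenant mapping in the default schema
        add_users_to_tenant([admin_email], tenant_id)
    except Exception:
        # Undo the half-provisioned tenant, mirroring rollback_tenant_provisioning()
        drop_schema(tenant_id)
        raise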
+from danswer.db.token_limit import insert_user_token_rate_limit from danswer.server.query_and_chat.token_limit import any_rate_limit_exists from danswer.server.token_rate_limits.models import TokenRateLimitArgs from danswer.server.token_rate_limits.models import TokenRateLimitDisplay from ee.danswer.db.token_limit import fetch_all_user_group_token_rate_limits_by_group -from ee.danswer.db.token_limit import fetch_all_user_token_rate_limits from ee.danswer.db.token_limit import fetch_user_group_token_rate_limits from ee.danswer.db.token_limit import insert_user_group_token_rate_limit -from ee.danswer.db.token_limit import insert_user_token_rate_limit router = APIRouter(prefix="/admin/token-rate-limits") diff --git a/backend/model_server/encoders.py b/backend/model_server/encoders.py index 860151b3dc4..c72be9e4ac3 100644 --- a/backend/model_server/encoders.py +++ b/backend/model_server/encoders.py @@ -1,5 +1,5 @@ import json -from typing import Any +from typing import cast from typing import Optional import httpx @@ -10,6 +10,8 @@ from fastapi import APIRouter from fastapi import HTTPException from google.oauth2 import service_account # type: ignore +from litellm import embedding +from litellm.exceptions import RateLimitError from retry import retry from sentence_transformers import CrossEncoder # type: ignore from sentence_transformers import SentenceTransformer # type: ignore @@ -24,7 +26,9 @@ from model_server.constants import EmbeddingModelTextType from model_server.constants import EmbeddingProvider from model_server.utils import simple_log_function_time +from shared_configs.configs import API_BASED_EMBEDDING_TIMEOUT from shared_configs.configs import INDEXING_ONLY +from shared_configs.configs import OPENAI_EMBEDDING_TIMEOUT from shared_configs.enums import EmbedTextType from shared_configs.enums import RerankerProvider from shared_configs.model_server_models import Embedding @@ -52,48 +56,29 @@ _COHERE_MAX_INPUT_LEN = 96 -def _initialize_client( - api_key: str, provider: EmbeddingProvider, model: str | None = None -) -> Any: - if provider == EmbeddingProvider.OPENAI: - return openai.OpenAI(api_key=api_key) - elif provider == EmbeddingProvider.COHERE: - return CohereClient(api_key=api_key) - elif provider == EmbeddingProvider.VOYAGE: - return voyageai.Client(api_key=api_key) - elif provider == EmbeddingProvider.GOOGLE: - credentials = service_account.Credentials.from_service_account_info( - json.loads(api_key) - ) - project_id = json.loads(api_key)["project_id"] - vertexai.init(project=project_id, credentials=credentials) - return TextEmbeddingModel.from_pretrained(model or DEFAULT_VERTEX_MODEL) - else: - raise ValueError(f"Unsupported provider: {provider}") - - class CloudEmbedding: def __init__( self, api_key: str, provider: EmbeddingProvider, - # Only for Google as is needed on client setup - model: str | None = None, + api_url: str | None = None, + api_version: str | None = None, ) -> None: self.provider = provider - self.client = _initialize_client(api_key, self.provider, model) + self.api_key = api_key + self.api_url = api_url + self.api_version = api_version def _embed_openai(self, texts: list[str], model: str | None) -> list[Embedding]: if not model: model = DEFAULT_OPENAI_MODEL - # OpenAI does not seem to provide truncation option, however - # the context lengths used by Danswer currently are smaller than the max token length - # for OpenAI embeddings so it's not a big deal + client = openai.OpenAI(api_key=self.api_key, timeout=OPENAI_EMBEDDING_TIMEOUT) + final_embeddings: 
list[Embedding] = [] try: for text_batch in batch_list(texts, _OPENAI_MAX_INPUT_LEN): - response = self.client.embeddings.create(input=text_batch, model=model) + response = client.embeddings.create(input=text_batch, model=model) final_embeddings.extend( [embedding.embedding for embedding in response.data] ) @@ -114,17 +99,19 @@ def _embed_cohere( if not model: model = DEFAULT_COHERE_MODEL + client = CohereClient(api_key=self.api_key, timeout=API_BASED_EMBEDDING_TIMEOUT) + final_embeddings: list[Embedding] = [] for text_batch in batch_list(texts, _COHERE_MAX_INPUT_LEN): # Does not use the same tokenizer as the Danswer API server but it's approximately the same # empirically it's only off by a very few tokens so it's not a big deal - response = self.client.embed( + response = client.embed( texts=text_batch, model=model, input_type=embedding_type, truncate="END", ) - final_embeddings.extend(response.embeddings) + final_embeddings.extend(cast(list[Embedding], response.embeddings)) return final_embeddings def _embed_voyage( @@ -133,23 +120,45 @@ def _embed_voyage( if not model: model = DEFAULT_VOYAGE_MODEL - # Similar to Cohere, the API server will do approximate size chunking - # it's acceptable to miss by a few tokens - response = self.client.embed( + client = voyageai.Client( + api_key=self.api_key, timeout=API_BASED_EMBEDDING_TIMEOUT + ) + + response = client.embed( texts, model=model, input_type=embedding_type, - truncation=True, # Also this is default + truncation=True, ) return response.embeddings + def _embed_azure(self, texts: list[str], model: str | None) -> list[Embedding]: + response = embedding( + model=model, + input=texts, + timeout=API_BASED_EMBEDDING_TIMEOUT, + api_key=self.api_key, + api_base=self.api_url, + api_version=self.api_version, + ) + embeddings = [embedding["embedding"] for embedding in response.data] + + return embeddings + def _embed_vertex( self, texts: list[str], model: str | None, embedding_type: str ) -> list[Embedding]: if not model: model = DEFAULT_VERTEX_MODEL - embeddings = self.client.get_embeddings( + credentials = service_account.Credentials.from_service_account_info( + json.loads(self.api_key) + ) + project_id = json.loads(self.api_key)["project_id"] + vertexai.init(project=project_id, credentials=credentials) + client = TextEmbeddingModel.from_pretrained(model) + + embeddings = client.get_embeddings( [ TextEmbeddingInput( text, @@ -161,6 +170,33 @@ def _embed_vertex( ) return [embedding.values for embedding in embeddings] + def _embed_litellm_proxy( + self, texts: list[str], model_name: str | None + ) -> list[Embedding]: + if not model_name: + raise ValueError("Model name is required for LiteLLM proxy embedding.") + + if not self.api_url: + raise ValueError("API URL is required for LiteLLM proxy embedding.") + + headers = ( + {} if not self.api_key else {"Authorization": f"Bearer {self.api_key}"} + ) + + with httpx.Client() as client: + response = client.post( + self.api_url, + json={ + "model": model_name, + "input": texts, + }, + headers=headers, + timeout=API_BASED_EMBEDDING_TIMEOUT, + ) + response.raise_for_status() + result = response.json() + return [embedding["embedding"] for embedding in result["data"]] + @retry(tries=_RETRY_TRIES, delay=_RETRY_DELAY) def embed( self, @@ -168,31 +204,34 @@ def embed( texts: list[str], text_type: EmbedTextType, model_name: str | None = None, + deployment_name: str | None = None, ) -> list[Embedding]: - try: - if self.provider == EmbeddingProvider.OPENAI: - return self._embed_openai(texts, model_name) - 
embedding_type = EmbeddingModelTextType.get_type(self.provider, text_type) - if self.provider == EmbeddingProvider.COHERE: - return self._embed_cohere(texts, model_name, embedding_type) - elif self.provider == EmbeddingProvider.VOYAGE: - return self._embed_voyage(texts, model_name, embedding_type) - elif self.provider == EmbeddingProvider.GOOGLE: - return self._embed_vertex(texts, model_name, embedding_type) - else: - raise ValueError(f"Unsupported provider: {self.provider}") - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"Error embedding text with {self.provider}: {str(e)}", - ) + if self.provider == EmbeddingProvider.OPENAI: + return self._embed_openai(texts, model_name) + elif self.provider == EmbeddingProvider.AZURE: + return self._embed_azure(texts, f"azure/{deployment_name}") + elif self.provider == EmbeddingProvider.LITELLM: + return self._embed_litellm_proxy(texts, model_name) + + embedding_type = EmbeddingModelTextType.get_type(self.provider, text_type) + if self.provider == EmbeddingProvider.COHERE: + return self._embed_cohere(texts, model_name, embedding_type) + elif self.provider == EmbeddingProvider.VOYAGE: + return self._embed_voyage(texts, model_name, embedding_type) + elif self.provider == EmbeddingProvider.GOOGLE: + return self._embed_vertex(texts, model_name, embedding_type) + else: + raise ValueError(f"Unsupported provider: {self.provider}") @staticmethod def create( - api_key: str, provider: EmbeddingProvider, model: str | None = None + api_key: str, + provider: EmbeddingProvider, + api_url: str | None = None, + api_version: str | None = None, ) -> "CloudEmbedding": logger.debug(f"Creating Embedding instance for provider: {provider}") - return CloudEmbedding(api_key, provider, model) + return CloudEmbedding(api_key, provider, api_url, api_version) def get_embedding_model( @@ -235,36 +274,19 @@ def get_local_reranking_model( return _RERANK_MODEL -def embed_with_litellm_proxy( - texts: list[str], api_url: str, model_name: str, api_key: str | None -) -> list[Embedding]: - headers = {} if not api_key else {"Authorization": f"Bearer {api_key}"} - - with httpx.Client() as client: - response = client.post( - api_url, - json={ - "model": model_name, - "input": texts, - }, - headers=headers, - ) - response.raise_for_status() - result = response.json() - return [embedding["embedding"] for embedding in result["data"]] - - @simple_log_function_time() def embed_text( texts: list[str], text_type: EmbedTextType, model_name: str | None, + deployment_name: str | None, max_context_length: int, normalize_embeddings: bool, api_key: str | None, provider_type: EmbeddingProvider | None, prefix: str | None, api_url: str | None, + api_version: str | None, ) -> list[Embedding]: logger.info(f"Embedding {len(texts)} texts with provider: {provider_type}") @@ -276,23 +298,7 @@ def embed_text( logger.error("No texts provided for embedding") raise ValueError("No texts provided for embedding.") - if provider_type == EmbeddingProvider.LITELLM: - logger.debug(f"Using LiteLLM proxy for embedding with URL: {api_url}") - if not api_url: - logger.error("API URL not provided for LiteLLM proxy") - raise ValueError("API URL is required for LiteLLM proxy embedding.") - try: - return embed_with_litellm_proxy( - texts=texts, - api_url=api_url, - model_name=model_name or "", - api_key=api_key, - ) - except Exception as e: - logger.exception(f"Error during LiteLLM proxy embedding: {str(e)}") - raise - - elif provider_type is not None: + if provider_type is not None: logger.debug(f"Using 
cloud provider {provider_type} for embedding") if api_key is None: logger.error("API key not provided for cloud model") @@ -306,11 +312,15 @@ def embed_text( ) cloud_model = CloudEmbedding( - api_key=api_key, provider=provider_type, model=model_name + api_key=api_key, + provider=provider_type, + api_url=api_url, + api_version=api_version, ) embeddings = cloud_model.embed( texts=texts, model_name=model_name, + deployment_name=deployment_name, text_type=text_type, ) @@ -404,15 +414,22 @@ async def process_embed_request( embeddings = embed_text( texts=embed_request.texts, model_name=embed_request.model_name, + deployment_name=embed_request.deployment_name, max_context_length=embed_request.max_context_length, normalize_embeddings=embed_request.normalize_embeddings, api_key=embed_request.api_key, provider_type=embed_request.provider_type, text_type=embed_request.text_type, api_url=embed_request.api_url, + api_version=embed_request.api_version, prefix=prefix, ) return EmbedResponse(embeddings=embeddings) + except RateLimitError as e: + raise HTTPException( + status_code=429, + detail=str(e), + ) except Exception as e: exception_detail = f"Error during embedding process:\n{str(e)}" logger.exception(exception_detail) diff --git a/backend/model_server/main.py b/backend/model_server/main.py index 5c7979475c7..ce9cc724a98 100644 --- a/backend/model_server/main.py +++ b/backend/model_server/main.py @@ -4,9 +4,12 @@ from contextlib import asynccontextmanager from pathlib import Path +import sentry_sdk import torch import uvicorn from fastapi import FastAPI +from sentry_sdk.integrations.fastapi import FastApiIntegration +from sentry_sdk.integrations.starlette import StarletteIntegration from transformers import logging as transformer_logging # type:ignore from danswer import __version__ @@ -19,6 +22,7 @@ from shared_configs.configs import MIN_THREADS_ML_MODELS from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST from shared_configs.configs import MODEL_SERVER_PORT +from shared_configs.configs import SENTRY_DSN os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1" @@ -81,6 +85,15 @@ def get_model_app() -> FastAPI: application = FastAPI( title="Danswer Model Server", version=__version__, lifespan=lifespan ) + if SENTRY_DSN: + sentry_sdk.init( + dsn=SENTRY_DSN, + integrations=[StarletteIntegration(), FastApiIntegration()], + traces_sample_rate=0.1, + ) + logger.info("Sentry initialized") + else: + logger.debug("Sentry DSN not provided, skipping Sentry initialization") application.include_router(management_router) application.include_router(encoders_router) diff --git a/backend/pyproject.toml b/backend/pyproject.toml index a9cf3650e13..d32255d9f65 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -4,6 +4,14 @@ mypy_path = "$MYPY_CONFIG_FILE_DIR" explicit_package_bases = true disallow_untyped_defs = true +[[tool.mypy.overrides]] +module = "alembic.versions.*" +disable_error_code = ["var-annotated"] + +[[tool.mypy.overrides]] +module = "alembic_tenants.versions.*" +disable_error_code = ["var-annotated"] + [tool.ruff] ignore = [] line-length = 130 diff --git a/backend/pytest.ini b/backend/pytest.ini index db3dbf8b00d..954a0274064 100644 --- a/backend/pytest.ini +++ b/backend/pytest.ini @@ -1,4 +1,8 @@ [pytest] pythonpath = . 
markers = - slow: marks tests as slow \ No newline at end of file + slow: marks tests as slow +filterwarnings = + ignore::DeprecationWarning + ignore::cryptography.utils.CryptographyDeprecationWarning + \ No newline at end of file diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt index dbad39347a0..32f9a9e0abc 100644 --- a/backend/requirements/default.txt +++ b/backend/requirements/default.txt @@ -1,10 +1,10 @@ aiohttp==3.10.2 alembic==1.10.4 asyncpg==0.27.0 -atlassian-python-api==3.37.0 -beautifulsoup4==4.12.2 +atlassian-python-api==3.41.16 +beautifulsoup4==4.12.3 boto3==1.34.84 -celery==5.3.4 +celery==5.5.0b4 chardet==5.2.0 dask==2023.8.1 ddtrace==2.6.5 @@ -19,16 +19,19 @@ google-auth-oauthlib==1.0.0 # GPT4All library has issues running on Macs and python:3.11.4-slim-bookworm # will reintroduce this when library version catches up # gpt4all==2.0.2 -httpcore==0.16.3 -httpx[http2]==0.23.3 -httpx-oauth==0.11.2 +httpcore==1.0.5 +httpx[http2]==0.27.0 +httpx-oauth==0.15.1 huggingface-hub==0.20.1 jira==3.5.1 jsonref==1.1.0 +trafilatura==1.12.2 langchain==0.1.17 langchain-core==0.1.50 langchain-text-splitters==0.0.1 -litellm==1.43.18 +litellm==1.50.2 +lxml==5.3.0 +lxml_html_clean==0.2.2 llama-index==0.9.45 Mako==1.2.4 msal==1.28.0 @@ -45,7 +48,7 @@ PyGithub==1.58.2 python-dateutil==2.8.2 python-gitlab==3.9.0 python-pptx==0.6.23 -pypdf==3.17.0 +pypdf==4.3.0 pytest-mock==3.12.0 pytest-playwright==0.3.2 python-docx==1.1.2 @@ -57,7 +60,6 @@ requests==2.32.2 requests-oauthlib==1.3.1 retry==0.9.2 # This pulls in py which is in CVE-2022-42969, must remove py from image rfc3986==1.5.0 -rt==3.1.2 simple-salesforce==1.12.6 slack-sdk==3.20.2 SQLAlchemy[mypy]==2.0.15 @@ -66,10 +68,12 @@ supervisor==4.2.5 tiktoken==0.7.0 timeago==1.0.16 transformers==4.39.2 +unstructured==0.15.1 +unstructured-client==0.25.4 uvicorn==0.21.1 zulip==0.8.2 hubspot-api-client==8.1.0 -zenpy==2.0.41 +asana==5.0.8 dropbox==11.36.2 boto3-stubs[s3]==1.34.133 ultimate_sitemap_parser==0.5 @@ -77,3 +81,8 @@ pyairtable==3.0.0a3 anthropic[vertex]==0.36.1 google-cloud-aiplatform==1.70.0 openai==1.53.0 +stripe==10.12.0 +urllib3==2.2.3 +mistune==0.8.4 +sentry-sdk==2.14.0 +prometheus_client==0.21.0 diff --git a/backend/requirements/dev.txt b/backend/requirements/dev.txt index 881920af7f2..27304dbef37 100644 --- a/backend/requirements/dev.txt +++ b/backend/requirements/dev.txt @@ -11,6 +11,7 @@ types-beautifulsoup4==4.12.0.3 types-html5lib==1.1.11.13 types-oauthlib==3.2.0.9 types-setuptools==68.0.0.3 +types-Pillow==10.2.0.20240822 types-passlib==1.7.7.20240106 types-psutil==5.9.5.17 types-psycopg2==2.9.21.10 @@ -20,4 +21,10 @@ types-regex==2023.3.23.1 types-requests==2.28.11.17 types-retry==0.9.9.3 types-urllib3==1.26.25.11 -boto3-stubs[s3]==1.34.133 \ No newline at end of file +trafilatura==1.12.2 +lxml==5.3.0 +lxml_html_clean==0.2.2 +boto3-stubs[s3]==1.34.133 +pandas==2.2.3 +pandas-stubs==2.2.3.241009 +cohere==5.6.1 \ No newline at end of file diff --git a/backend/requirements/ee.txt b/backend/requirements/ee.txt index 0717e3a67e7..18dc3200bdc 100644 --- a/backend/requirements/ee.txt +++ b/backend/requirements/ee.txt @@ -1 +1,2 @@ python3-saml==1.15.0 +cohere==5.6.1 \ No newline at end of file diff --git a/backend/requirements/model_server.txt b/backend/requirements/model_server.txt index 410abc7edaf..0710780cfd0 100644 --- a/backend/requirements/model_server.txt +++ b/backend/requirements/model_server.txt @@ -1,9 +1,8 @@ -cohere==5.6.1 einops==0.8.0 +cohere==5.6.1 fastapi==0.109.2 
google-cloud-aiplatform==1.58.0 numpy==1.26.4 -openai==1.53.0 pydantic==2.8.2 retry==0.9.2 safetensors==0.4.2 @@ -11,4 +10,6 @@ sentence-transformers==2.6.1 transformers==4.39.2 uvicorn==0.21.1 voyageai==0.2.3 -torch==2.5.1 \ No newline at end of file +torch==2.5.1 +litellm==1.50.2 +sentry-sdk[fastapi,celery,starlette]==2.14.0 diff --git a/backend/scripts/add_connector_creation_script.py b/backend/scripts/add_connector_creation_script.py new file mode 100644 index 00000000000..9a1944080c0 --- /dev/null +++ b/backend/scripts/add_connector_creation_script.py @@ -0,0 +1,148 @@ +from typing import Any +from typing import Dict + +import requests + +API_SERVER_URL = "http://localhost:3000" # Adjust this to your Danswer server URL +HEADERS = {"Content-Type": "application/json"} +API_KEY = "danswer-api-key" # API key here, if auth is enabled + + +def create_connector( + name: str, + source: str, + input_type: str, + connector_specific_config: Dict[str, Any], + is_public: bool = True, + groups: list[int] | None = None, +) -> Dict[str, Any]: + connector_update_request = { + "name": name, + "source": source, + "input_type": input_type, + "connector_specific_config": connector_specific_config, + "is_public": is_public, + "groups": groups or [], + } + + response = requests.post( + url=f"{API_SERVER_URL}/api/manage/admin/connector", + json=connector_update_request, + headers=HEADERS, + ) + response.raise_for_status() + return response.json() + + +def create_credential( + name: str, + source: str, + credential_json: Dict[str, Any], + is_public: bool = True, + groups: list[int] | None = None, +) -> Dict[str, Any]: + credential_request = { + "name": name, + "source": source, + "credential_json": credential_json, + "admin_public": is_public, + "groups": groups or [], + } + + response = requests.post( + url=f"{API_SERVER_URL}/api/manage/credential", + json=credential_request, + headers=HEADERS, + ) + response.raise_for_status() + return response.json() + + +def create_cc_pair( + connector_id: int, + credential_id: int, + name: str, + access_type: str = "public", + groups: list[int] | None = None, +) -> Dict[str, Any]: + cc_pair_request = { + "name": name, + "access_type": access_type, + "groups": groups or [], + } + + response = requests.put( + url=f"{API_SERVER_URL}/api/manage/connector/{connector_id}/credential/{credential_id}", + json=cc_pair_request, + headers=HEADERS, + ) + response.raise_for_status() + return response.json() + + +def main() -> None: + # Create a Web connector + web_connector = create_connector( + name="Example Web Connector", + source="web", + input_type="load_state", + connector_specific_config={ + "base_url": "https://example.com", + "web_connector_type": "recursive", + }, + ) + print(f"Created Web Connector: {web_connector}") + + # Create a credential for the Web connector + web_credential = create_credential( + name="Example Web Credential", + source="web", + credential_json={}, # Web connectors typically don't need credentials + is_public=True, + ) + print(f"Created Web Credential: {web_credential}") + + # Create CC pair for Web connector + web_cc_pair = create_cc_pair( + connector_id=web_connector["id"], + credential_id=web_credential["id"], + name="Example Web CC Pair", + access_type="public", + ) + print(f"Created Web CC Pair: {web_cc_pair}") + + # Create a GitHub connector + github_connector = create_connector( + name="Example GitHub Connector", + source="github", + input_type="poll", + connector_specific_config={ + "repo_owner": "example-owner", + "repo_name": "example-repo", 
+ "include_prs": True, + "include_issues": True, + }, + ) + print(f"Created GitHub Connector: {github_connector}") + + # Create a credential for the GitHub connector + github_credential = create_credential( + name="Example GitHub Credential", + source="github", + credential_json={"github_access_token": "your_github_access_token_here"}, + is_public=True, + ) + print(f"Created GitHub Credential: {github_credential}") + + # Create CC pair for GitHub connector + github_cc_pair = create_cc_pair( + connector_id=github_connector["id"], + credential_id=github_credential["id"], + name="Example GitHub CC Pair", + access_type="public", + ) + print(f"Created GitHub CC Pair: {github_cc_pair}") + + +if __name__ == "__main__": + main() diff --git a/backend/scripts/chat_feedback_dump.py b/backend/scripts/chat_feedback_dump.py new file mode 100644 index 00000000000..f0d6d3cbb37 --- /dev/null +++ b/backend/scripts/chat_feedback_dump.py @@ -0,0 +1,239 @@ +# This file is used to demonstrate how to use the backend APIs directly +# to query out feedback for all messages +import argparse +import logging +from logging import getLogger +from typing import Any +from uuid import UUID + +import requests + +from danswer.server.manage.models import AllUsersResponse +from danswer.server.query_and_chat.models import ChatSessionsResponse +from ee.danswer.server.query_history.api import ChatSessionSnapshot + +# Configure the logger +logging.basicConfig( + level=logging.INFO, # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", # Log format + handlers=[logging.StreamHandler()], # Output logs to console +) + +logger = getLogger(__name__) + +# uncomment the following pydantic models if you need the script to be independent +# from pydantic import BaseModel +# from datetime import datetime +# from enum import Enum + +# class UserRole(str, Enum): +# """ +# User roles +# - Basic can't perform any admin actions +# - Admin can perform all admin actions +# - Curator can perform admin actions for +# groups they are curators of +# - Global Curator can perform admin actions +# for all groups they are a member of +# """ + +# BASIC = "basic" +# ADMIN = "admin" +# CURATOR = "curator" +# GLOBAL_CURATOR = "global_curator" + + +# class UserStatus(str, Enum): +# LIVE = "live" +# INVITED = "invited" +# DEACTIVATED = "deactivated" + + +# class FullUserSnapshot(BaseModel): +# id: UUID +# email: str +# role: UserRole +# status: UserStatus + + +# class InvitedUserSnapshot(BaseModel): +# email: str + + +# class AllUsersResponse(BaseModel): +# accepted: list[FullUserSnapshot] +# invited: list[InvitedUserSnapshot] +# accepted_pages: int +# invited_pages: int + + +# class ChatSessionSharedStatus(str, Enum): +# PUBLIC = "public" +# PRIVATE = "private" + + +# class ChatSessionDetails(BaseModel): +# id: UUID +# name: str +# persona_id: int | None = None +# time_created: str +# shared_status: ChatSessionSharedStatus +# folder_id: int | None = None +# current_alternate_model: str | None = None + + +# class ChatSessionsResponse(BaseModel): +# sessions: list[ChatSessionDetails] + + +# class SessionType(str, Enum): +# CHAT = "Chat" +# SEARCH = "Search" +# SLACK = "Slack" + + +# class AbridgedSearchDoc(BaseModel): +# """A subset of the info present in `SearchDoc`""" + +# document_id: str +# semantic_identifier: str +# link: str | None + + +# class QAFeedbackType(str, Enum): +# LIKE = "like" # User likes the answer, used for metrics +# DISLIKE = "dislike" # User dislikes the answer, used 
for metrics + + +# class MessageType(str, Enum): +# # Using OpenAI standards, Langchain equivalent shown in comment +# # System message is always constructed on the fly, not saved +# SYSTEM = "system" # SystemMessage +# USER = "user" # HumanMessage +# ASSISTANT = "assistant" # AIMessage + + +# class MessageSnapshot(BaseModel): +# message: str +# message_type: MessageType +# documents: list[AbridgedSearchDoc] +# feedback_type: QAFeedbackType | None +# feedback_text: str | None +# time_created: datetime + + +# class ChatSessionSnapshot(BaseModel): +# id: UUID +# user_email: str +# name: str | None +# messages: list[MessageSnapshot] +# persona_name: str | None +# time_created: datetime +# flow_type: SessionType + + +def create_new_chat_session(danswer_url: str, api_key: str | None) -> int: + headers = {"Authorization": f"Bearer {api_key}"} if api_key else None + session_endpoint = danswer_url + "/api/chat/create-chat-session" + + response = requests.get(session_endpoint, headers=headers) + response.raise_for_status() + + new_session_id = response.json()["chat_session_id"] + return new_session_id + + +def manage_users(danswer_url: str, headers: dict[str, str] | None) -> AllUsersResponse: + endpoint = danswer_url + "/manage/users" + + response = requests.get( + endpoint, + headers=headers, + ) + response.raise_for_status() + + all_users = AllUsersResponse(**response.json()) + return all_users + + +def get_chat_sessions( + danswer_url: str, headers: dict[str, str] | None, user_id: UUID +) -> ChatSessionsResponse: + endpoint = danswer_url + "/admin/chat-sessions" + + params: dict[str, Any] = {"user_id": user_id} + response = requests.get( + endpoint, + params=params, + headers=headers, + ) + response.raise_for_status() + + sessions = ChatSessionsResponse(**response.json()) + return sessions + + +def get_session_history( + danswer_url: str, headers: dict[str, str] | None, session_id: UUID +) -> ChatSessionSnapshot: + endpoint = danswer_url + f"/admin/chat-session-history/{session_id}" + + response = requests.get( + endpoint, + headers=headers, + ) + response.raise_for_status() + + sessions = ChatSessionSnapshot(**response.json()) + return sessions + + +def process_all_chat_feedback(danswer_url: str, api_key: str | None) -> None: + headers = {"Authorization": f"Bearer {api_key}"} if api_key else None + + all_users = manage_users(danswer_url, headers) + if not all_users: + raise RuntimeError("manage_users returned None") + + logger.info(f"Accepted users: {len(all_users.accepted)}") + + user_ids: list[UUID] = [user.id for user in all_users.accepted] + + for user_id in user_ids: + r_sessions = get_chat_sessions(danswer_url, headers, user_id) + logger.info(f"user={user_id} num_sessions={len(r_sessions.sessions)}") + for session in r_sessions.sessions: + try: + s = get_session_history(danswer_url, headers, session.id) + except requests.exceptions.HTTPError: + logger.exception("get_session_history failed.") + + for m in s.messages: + logger.info( + f"user={user_id} " + f"session={session.id} " + f"message={m.message} " + f"feedback_type={m.feedback_type} " + f"feedback_text={m.feedback_text}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Sample API Usage - Chat Feedback") + parser.add_argument( + "--url", + type=str, + default="http://localhost:8080", + help="Danswer URL, should point to Danswer nginx.", + ) + + # Not needed if Auth is disabled? 
+ # Or for Danswer MIT Edition API key must be replaced with session cookie + parser.add_argument( + "--api-key", + type=str, + help="Danswer Admin Level API key", + ) + + args = parser.parse_args() + process_all_chat_feedback(danswer_url=args.url, api_key=args.api_key) diff --git a/backend/scripts/dev_run_background_jobs.py b/backend/scripts/dev_run_background_jobs.py index 96fdc21156b..6abb5fad8a1 100644 --- a/backend/scripts/dev_run_background_jobs.py +++ b/backend/scripts/dev_run_background_jobs.py @@ -1,5 +1,3 @@ -import argparse -import os import subprocess import threading @@ -17,91 +15,119 @@ def monitor_process(process_name: str, process: subprocess.Popen) -> None: break -def run_jobs(exclude_indexing: bool) -> None: - cmd_worker = [ +def run_jobs() -> None: + # command setup + cmd_worker_primary = [ "celery", "-A", - "ee.danswer.background.celery.celery_app", + "danswer.background.celery.versioned_apps.primary", "worker", "--pool=threads", "--concurrency=6", + "--prefetch-multiplier=1", "--loglevel=INFO", + "--hostname=primary@%n", "-Q", - "celery,vespa_metadata_sync,connector_deletion", + "celery", + ] + + cmd_worker_light = [ + "celery", + "-A", + "danswer.background.celery.versioned_apps.light", + "worker", + "--pool=threads", + "--concurrency=16", + "--prefetch-multiplier=8", + "--loglevel=INFO", + "--hostname=light@%n", + "-Q", + "vespa_metadata_sync,connector_deletion,doc_permissions_upsert", + ] + + cmd_worker_heavy = [ + "celery", + "-A", + "danswer.background.celery.versioned_apps.heavy", + "worker", + "--pool=threads", + "--concurrency=6", + "--prefetch-multiplier=1", + "--loglevel=INFO", + "--hostname=heavy@%n", + "-Q", + "connector_pruning,connector_doc_permissions_sync,connector_external_group_sync", + ] + + cmd_worker_indexing = [ + "celery", + "-A", + "danswer.background.celery.versioned_apps.indexing", + "worker", + "--pool=threads", + "--concurrency=1", + "--prefetch-multiplier=1", + "--loglevel=INFO", + "--hostname=indexing@%n", + "--queues=connector_indexing", ] cmd_beat = [ "celery", "-A", - "ee.danswer.background.celery.celery_app", + "danswer.background.celery.versioned_apps.beat", "beat", "--loglevel=INFO", ] - worker_process = subprocess.Popen( - cmd_worker, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + # spawn processes + worker_primary_process = subprocess.Popen( + cmd_worker_primary, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + ) + + worker_light_process = subprocess.Popen( + cmd_worker_light, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + ) + + worker_heavy_process = subprocess.Popen( + cmd_worker_heavy, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True + ) + + worker_indexing_process = subprocess.Popen( + cmd_worker_indexing, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True ) + beat_process = subprocess.Popen( cmd_beat, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True ) - worker_thread = threading.Thread( - target=monitor_process, args=("WORKER", worker_process) + # monitor threads + worker_primary_thread = threading.Thread( + target=monitor_process, args=("PRIMARY", worker_primary_process) + ) + worker_light_thread = threading.Thread( + target=monitor_process, args=("LIGHT", worker_light_process) + ) + worker_heavy_thread = threading.Thread( + target=monitor_process, args=("HEAVY", worker_heavy_process) + ) + worker_indexing_thread = threading.Thread( + target=monitor_process, args=("INDEX", worker_indexing_process) ) beat_thread = threading.Thread(target=monitor_process, 
args=("BEAT", beat_process)) - worker_thread.start() + worker_primary_thread.start() + worker_light_thread.start() + worker_heavy_thread.start() + worker_indexing_thread.start() beat_thread.start() - if not exclude_indexing: - update_env = os.environ.copy() - update_env["PYTHONPATH"] = "." - cmd_indexing = ["python", "danswer/background/update.py"] - - indexing_process = subprocess.Popen( - cmd_indexing, - env=update_env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - - indexing_thread = threading.Thread( - target=monitor_process, args=("INDEXING", indexing_process) - ) - - indexing_thread.start() - indexing_thread.join() - try: - update_env = os.environ.copy() - update_env["PYTHONPATH"] = "." - cmd_perm_sync = ["python", "ee/danswer/background/permission_sync.py"] - - indexing_process = subprocess.Popen( - cmd_perm_sync, - env=update_env, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) - - perm_sync_thread = threading.Thread( - target=monitor_process, args=("INDEXING", indexing_process) - ) - perm_sync_thread.start() - perm_sync_thread.join() - except Exception: - pass - - worker_thread.join() + worker_primary_thread.join() + worker_light_thread.join() + worker_heavy_thread.join() + worker_indexing_thread.join() beat_thread.join() if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Run background jobs.") - parser.add_argument( - "--no-indexing", action="store_true", help="Do not run indexing process" - ) - args = parser.parse_args() - - run_jobs(args.no_indexing) + run_jobs() diff --git a/backend/scripts/document_seeding_prep.py b/backend/scripts/document_seeding_prep.py new file mode 100644 index 00000000000..d853b68ccb4 --- /dev/null +++ b/backend/scripts/document_seeding_prep.py @@ -0,0 +1,240 @@ +# This script preps the documents used for initially seeding the index. It handles the embedding so that the +# documents can be added to the index with minimal processing. +import json + +from pydantic import BaseModel +from sentence_transformers import SentenceTransformer # type: ignore + + +class SeedPresaveDocument(BaseModel): + url: str + title: str + content: str + title_embedding: list[float] + content_embedding: list[float] + chunk_ind: int = 0 + + +# Be sure to use the default embedding model +model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True) +tokenizer = model.tokenizer + +# This is easier than cleaning up the crawl, needs to be updated if the sites are changed +overview_title = "Use Cases Overview" +overview = ( + "How to leverage Danswer in your organization\n\n" + "Danswer Overview\n" + "Danswer is the AI Assistant connected to your organization's docs, apps, and people. " + "Danswer makes Generative AI more versatile for work by enabling new types of questions like " + '"What is the most common feature request we\'ve heard from customers this month". ' + "Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, " + "Danswer makes it possible to ask these questions in natural language and get back answers in seconds.\n\n" + "Danswer can connect to +30 different tools and the use cases are not limited to the ones in the following pages. " + "The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\n" + "Common Getting Started Questions:\n\n" + "Why are these docs connected in my Danswer deployment?\n" + "Answer: This is just an example of how connectors work in Danswer. 
You can connect up your own team's knowledge " + "and you will be able to ask questions unique to your organization. Danswer will keep all of the knowledge up to date " + "and in sync with your connected applications.\n\n" + "Is my data being sent anywhere when I connect it up to Danswer?\n" + "Answer: No! Danswer is built with data security as our highest priority. We open sourced it so our users can know " + "exactly what is going on with their data. By default all of the document processing happens within Danswer. " + "The only time it is sent outward is for the GenAI call to generate answers.\n\n" + "Where is the feature for auto sync-ing document level access permissions from all connected sources?\n" + "Answer: This falls under the Enterprise Edition set of Danswer features built on top of the MIT/community edition. " + "If you are on Danswer Cloud, you have access to them by default. If you're running it yourself, reach out to the " + "Danswer team to receive access." +) + +enterprise_search_title = "Enterprise Search" +enterprise_search_1 = ( + "Value of Enterprise Search with Danswer\n\n" + "What is Enterprise Search and why is it Important?\n" + "An Enterprise Search system gives team members a single place to access all of the disparate knowledge " + "of an organization. Critical information is saved across a host of channels like call transcripts with " + "prospects, engineering design docs, IT runbooks, customer support email exchanges, project management " + "tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\n" + "Since it quickly becomes infeasible to check across every source, decisions get made on incomplete " + "information, employee satisfaction decreases, and the most valuable members of your team are tied up " + "with constant distractions as junior teammates are unable to unblock themselves. Danswer solves this " + "problem by letting anyone on the team access all of the knowledge across your organization in a " + "permissioned and secure way. Users can ask questions in natural language and get back answers and " + "documents across all of the connected sources instantly.\n\n" + "What's the real cost?\n" + "A typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of " + "incomplete or incorrect information can be extremely high. Customer support/success that isn't able " + "to find the reference to similar cases could cause hours or even days of delay leading to lower " + "customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had " + "previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar " + "feature had previously been built could result in weeks of wasted development time and tech debt with " + "duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark " + "- inefficient and mistake prone." +) + +enterprise_search_2 = ( + "More than Search\n" + "When analyzing the entire corpus of knowledge within your company is as easy as asking a question " + "in a search bar, your entire team can stay informed and up to date. Danswer also makes it trivial " + "to identify where knowledge is well documented and where it is lacking. Team members who are centers " + "of knowledge can begin to effectively document their expertise since it is no longer being thrown into " + "a black hole. 
All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\n" + "With Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar " + "cases for your customer support team to reference, Danswer breaks down the issue and explains it so that even " + "the most junior members can understand it. This in turn lets them give the most holistic and technically accurate " + "response possible to your customers. On the other end, even the super stars of your sales team will not be able " + "to review 10 hours of transcripts before hopping on that critical call, but Danswer can easily parse through it " + "in mere seconds and give crucial context to help your team close." +) + +ai_platform_title = "AI Platform" +ai_platform = ( + "Build AI Agents powered by the knowledge and workflows specific to your organization.\n\n" + "Beyond Answers\n" + "Agents enabled by generative AI and reasoning capable models are helping teams to automate their work. " + "Danswer is helping teams make it happen. Danswer provides out of the box user chat sessions, attaching custom tools, " + "handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\n" + "Danswer as a platform is not a no-code agent builder. We are made by developers for developers and this gives your " + "team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\n" + "Flexibility and Extensibility\n" + "Danswer is open source and completely whitebox. This not only gives transparency to what happens within the system " + "but also means that your team can directly modify the source code to suit your unique needs." +) + +customer_support_title = "Customer Support" +customer_support = ( + "Help your customer support team instantly answer any question across your entire product.\n\n" + "AI Enabled Support\n" + "Customer support agents have one of the highest breadth jobs. They field requests that cover the entire surface " + "area of the product and need to help your users find success on extremely short timelines. " + "Because they're not the same people who designed or built the system, they often lack the depth of understanding " + "needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team " + "optimize the speed and quality of these critical customer-facing interactions.\n\n" + "The Importance of Context\n" + "There are two critical components of AI copilots for customer support. The first is that the AI system needs to be " + "connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the " + "knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as " + "pull requests in a code repository. The second critical component is the ability of the AI system to break down " + "difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able " + "to chat back and forth with the system to build a better understanding.\n\n" + "Danswer takes care of both of these. The system connects up to over 30+ different applications and the knowledge is " + "pulled in constantly so that the information access is always up to date." 
+) + +sales_title = "Sales" +sales = ( + "Keep your team up to date on every conversation and update so they can close.\n\n" + "Recall Every Detail\n" + "Being able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide " + "more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through " + 'hours of transcripts in preparation for a call, your team can now ask Danswer "What specific features was ACME ' + "interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, " + "Danswer provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant " + "information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building " + 'topics by asking "What rapport building topic did we chat about in the last call with ACME".\n\n' + "Know Every Product Update\n" + "It is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question " + "that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an " + "authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the " + "prospect (and as we know, time is the biggest killer of deals). With Danswer, it is even possible to get answers live " + 'on call because of how fast accessing information becomes. A question like "Have we shipped the Microsoft AD ' + 'integration yet?" can now be answered in seconds meaning that prospects can get answers while on the call instead of ' + "asynchronously and sales cycles are reduced as a result." +) + +operations_title = "Operations" +operations = ( + "Double the productivity of your Ops teams like IT, HR, etc.\n\n" + "Automatically Resolve Tickets\n" + "Modern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits " + "details or how to set up the VPN for remote work, Danswer can help your team help themselves. This frees up your team to " + "do the real impactful work of landing star candidates or improving your internal processes.\n\n" + "AI Aided Onboarding\n" + "One of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens " + "of new tools, Danswer gives them a single place where they can ask about anything in natural language. Whether it's how to " + "set up their work environment or what their onboarding goals are, Danswer can walk them through every step with the help " + "of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to " + "focus on moving the needle." +) + +# For simplicity, we're not adding any metadata suffix here. 
Generally there is none for the Web connector anyway +overview_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/overview", + title=overview_title, + content=overview, + title_embedding=model.encode(f"search_document: {overview_title}"), + content_embedding=model.encode(f"search_document: {overview_title}\n{overview}"), +) + +enterprise_search_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/enterprise_search", + title=enterprise_search_title, + content=enterprise_search_1, + title_embedding=model.encode(f"search_document: {enterprise_search_title}"), + content_embedding=model.encode( + f"search_document: {enterprise_search_title}\n{enterprise_search_1}" + ), +) + +enterprise_search_doc_2 = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/enterprise_search", + title=enterprise_search_title, + content=enterprise_search_2, + title_embedding=model.encode(f"search_document: {enterprise_search_title}"), + content_embedding=model.encode( + f"search_document: {enterprise_search_title}\n{enterprise_search_2}" + ), + chunk_ind=1, +) + +ai_platform_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/ai_platform", + title=ai_platform_title, + content=ai_platform, + title_embedding=model.encode(f"search_document: {ai_platform_title}"), + content_embedding=model.encode( + f"search_document: {ai_platform_title}\n{ai_platform}" + ), +) + +customer_support_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/customer_support", + title=customer_support_title, + content=customer_support, + title_embedding=model.encode(f"search_document: {customer_support_title}"), + content_embedding=model.encode( + f"search_document: {customer_support_title}\n{customer_support}" + ), +) + +sales_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/sales", + title=sales_title, + content=sales, + title_embedding=model.encode(f"search_document: {sales_title}"), + content_embedding=model.encode(f"search_document: {sales_title}\n{sales}"), +) + +operations_doc = SeedPresaveDocument( + url="https://docs.danswer.dev/more/use_cases/operations", + title=operations_title, + content=operations, + title_embedding=model.encode(f"search_document: {operations_title}"), + content_embedding=model.encode( + f"search_document: {operations_title}\n{operations}" + ), +) + +documents = [ + overview_doc, + enterprise_search_doc, + enterprise_search_doc_2, + ai_platform_doc, + customer_support_doc, + sales_doc, + operations_doc, +] + +documents_dict = [doc.model_dump() for doc in documents] + +with open("./backend/danswer/seeding/initial_docs.json", "w") as json_file: + json.dump(documents_dict, json_file, indent=4) diff --git a/backend/scripts/force_delete_connector_by_id.py b/backend/scripts/force_delete_connector_by_id.py index 0a9857304c8..241242f4a23 100755 --- a/backend/scripts/force_delete_connector_by_id.py +++ b/backend/scripts/force_delete_connector_by_id.py @@ -206,6 +206,8 @@ def _delete_connector(cc_pair_id: int, db_session: Session) -> None: logger.notice(f"Deleting file {file_name}") file_store.delete_file(file_name) + db_session.commit() + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Delete a connector by its ID") diff --git a/backend/scripts/query_time_check/seed_dummy_docs.py b/backend/scripts/query_time_check/seed_dummy_docs.py new file mode 100644 index 00000000000..e7aa65fba76 --- /dev/null +++ b/backend/scripts/query_time_check/seed_dummy_docs.py @@ -0,0 +1,168 @@ 
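# --- Editor's aside (not part of the patch): the force_delete_connector_by_id.py hunk above adds an
# explicit db_session.commit(). A minimal, self-contained sketch of why that matters with SQLAlchemy:
# work done through a Session lives in an open transaction and is discarded if the session closes
# without a commit. The table, engine URL, and data below are made up purely for illustration.
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(text("CREATE TABLE files (name TEXT)"))
    conn.execute(text("INSERT INTO files VALUES ('a'), ('b')"))

with Session(engine) as db_session:
    db_session.execute(text("DELETE FROM files WHERE name = 'a'"))
    # Without this commit, closing the session rolls the DELETE back and the
    # row would still be present -- the same failure mode the hunk above fixes.
    db_session.commit()

with engine.connect() as conn:
    assert conn.execute(text("SELECT count(*) FROM files")).scalar_one() == 1
# --- end editor's aside ---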
+""" +launch: +- api server +- postgres +- vespa +- model server (this is only needed so the api server can startup, no embedding is done) + +Run this script to seed the database with dummy documents. +Then run test_query_times.py to test query times. +""" +import random +from datetime import datetime + +from danswer.access.models import DocumentAccess +from danswer.configs.constants import DocumentSource +from danswer.connectors.models import Document +from danswer.db.engine import get_session_context_manager +from danswer.db.search_settings import get_current_search_settings +from danswer.document_index.vespa.index import VespaIndex +from danswer.indexing.models import ChunkEmbedding +from danswer.indexing.models import DocMetadataAwareIndexChunk +from danswer.indexing.models import IndexChunk +from danswer.utils.timing import log_function_time +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA +from shared_configs.model_server_models import Embedding + + +TOTAL_DOC_SETS = 8 +TOTAL_ACL_ENTRIES_PER_CATEGORY = 80 + + +def generate_random_embedding(dim: int) -> Embedding: + return [random.uniform(-1, 1) for _ in range(dim)] + + +def generate_random_identifier() -> str: + return f"dummy_doc_{random.randint(1, 1000)}" + + +def generate_dummy_chunk( + doc_id: str, + chunk_id: int, + embedding_dim: int, + number_of_acl_entries: int, + number_of_document_sets: int, +) -> DocMetadataAwareIndexChunk: + document = Document( + id=doc_id, + source=DocumentSource.GOOGLE_DRIVE, + sections=[], + metadata={}, + semantic_identifier=generate_random_identifier(), + ) + + chunk = IndexChunk( + chunk_id=chunk_id, + blurb=f"Blurb for chunk {chunk_id} of document {doc_id}.", + content=f"Content for chunk {chunk_id} of document {doc_id}. This is dummy text for testing purposes.", + source_links={}, + section_continuation=False, + source_document=document, + title_prefix=f"Title prefix for doc {doc_id}", + metadata_suffix_semantic="", + metadata_suffix_keyword="", + mini_chunk_texts=None, + embeddings=ChunkEmbedding( + full_embedding=generate_random_embedding(embedding_dim), + mini_chunk_embeddings=[], + ), + title_embedding=generate_random_embedding(embedding_dim), + ) + + document_set_names = [] + for i in range(number_of_document_sets): + document_set_names.append(f"Document Set {i}") + + user_emails: set[str | None] = set() + user_groups: set[str] = set() + external_user_emails: set[str] = set() + external_user_group_ids: set[str] = set() + for i in range(number_of_acl_entries): + user_emails.add(f"user_{i}@example.com") + user_groups.add(f"group_{i}") + external_user_emails.add(f"external_user_{i}@example.com") + external_user_group_ids.add(f"external_group_{i}") + + return DocMetadataAwareIndexChunk.from_index_chunk( + index_chunk=chunk, + access=DocumentAccess( + user_emails=user_emails, + user_groups=user_groups, + external_user_emails=external_user_emails, + external_user_group_ids=external_user_group_ids, + is_public=random.choice([True, False]), + ), + document_sets={document_set for document_set in document_set_names}, + boost=random.randint(-1, 1), + tenant_id=POSTGRES_DEFAULT_SCHEMA, + ) + + +@log_function_time() +def do_insertion( + vespa_index: VespaIndex, all_chunks: list[DocMetadataAwareIndexChunk] +) -> None: + insertion_records = vespa_index.index(all_chunks) + print(f"Indexed {len(insertion_records)} documents.") + print( + f"New documents: {sum(1 for record in insertion_records if not record.already_existed)}" + ) + print( + f"Existing documents updated: {sum(1 for record in 
insertion_records if record.already_existed)}" + ) + + +@log_function_time() +def seed_dummy_docs( + number_of_document_sets: int, + number_of_acl_entries: int, + num_docs: int = 1000, + chunks_per_doc: int = 5, + batch_size: int = 100, +) -> None: + with get_session_context_manager() as db_session: + search_settings = get_current_search_settings(db_session) + index_name = search_settings.index_name + embedding_dim = search_settings.model_dim + + vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None) + print(index_name) + + all_chunks = [] + chunk_count = 0 + for doc_num in range(num_docs): + doc_id = f"dummy_doc_{doc_num}_{datetime.now().isoformat()}" + for chunk_num in range(chunks_per_doc): + chunk = generate_dummy_chunk( + doc_id=doc_id, + chunk_id=chunk_num, + embedding_dim=embedding_dim, + number_of_acl_entries=number_of_acl_entries, + number_of_document_sets=number_of_document_sets, + ) + all_chunks.append(chunk) + chunk_count += 1 + + if len(all_chunks) >= chunks_per_doc * batch_size: + do_insertion(vespa_index, all_chunks) + print( + f"Indexed {chunk_count} chunks out of {num_docs * chunks_per_doc}." + ) + print( + f"percentage: {chunk_count / (num_docs * chunks_per_doc) * 100:.2f}% \n" + ) + all_chunks = [] + + if all_chunks: + do_insertion(vespa_index, all_chunks) + + +if __name__ == "__main__": + seed_dummy_docs( + number_of_document_sets=TOTAL_DOC_SETS, + number_of_acl_entries=TOTAL_ACL_ENTRIES_PER_CATEGORY, + num_docs=100000, + chunks_per_doc=5, + batch_size=1000, + ) diff --git a/backend/scripts/query_time_check/test_query_times.py b/backend/scripts/query_time_check/test_query_times.py new file mode 100644 index 00000000000..144b2354f93 --- /dev/null +++ b/backend/scripts/query_time_check/test_query_times.py @@ -0,0 +1,122 @@ +""" +RUN THIS AFTER SEED_DUMMY_DOCS.PY +""" +import random +import time + +from danswer.configs.constants import DocumentSource +from danswer.configs.model_configs import DOC_EMBEDDING_DIM +from danswer.context.search.models import IndexFilters +from danswer.db.engine import get_session_context_manager +from danswer.db.search_settings import get_current_search_settings +from danswer.document_index.vespa.index import VespaIndex +from scripts.query_time_check.seed_dummy_docs import TOTAL_ACL_ENTRIES_PER_CATEGORY +from scripts.query_time_check.seed_dummy_docs import TOTAL_DOC_SETS +from shared_configs.model_server_models import Embedding + +# make sure these are smaller than TOTAL_ACL_ENTRIES_PER_CATEGORY and TOTAL_DOC_SETS, respectively +NUMBER_OF_ACL_ENTRIES_PER_QUERY = 6 +NUMBER_OF_DOC_SETS_PER_QUERY = 2 + + +def get_slowest_99th_percentile(results: list[float]) -> float: + return sorted(results)[int(0.99 * len(results))] + + +# Generate random filters +def _random_filters() -> IndexFilters: + """ + Generate random filters for the query containing: + - 1 random user email + - NUMBER_OF_ACL_ENTRIES_PER_QUERY groups + - NUMBER_OF_ACL_ENTRIES_PER_QUERY external groups + - NUMBER_OF_DOC_SETS_PER_QUERY document sets + """ + access_control_list = [ + f"user_email:user_{random.randint(0, TOTAL_ACL_ENTRIES_PER_CATEGORY - 1)}@example.com", + ] + acl_indices = random.sample( + range(TOTAL_ACL_ENTRIES_PER_CATEGORY), NUMBER_OF_ACL_ENTRIES_PER_QUERY + ) + for i in acl_indices: + access_control_list.append(f"group:group_{i}") + access_control_list.append(f"external_group:external_group_{i}") + + doc_sets = [] + doc_set_indices = random.sample( + range(TOTAL_DOC_SETS),
NUMBER_OF_DOC_SETS_PER_QUERY + ) + for i in doc_set_indices: + doc_sets.append(f"document_set:Document Set {i}") + + return IndexFilters( + source_type=[DocumentSource.GOOGLE_DRIVE], + document_set=doc_sets, + tags=[], + access_control_list=access_control_list, + ) + + +def test_hybrid_retrieval_times( + number_of_queries: int, +) -> None: + with get_session_context_manager() as db_session: + search_settings = get_current_search_settings(db_session) + index_name = search_settings.index_name + + vespa_index = VespaIndex(index_name=index_name, secondary_index_name=None) + + # Generate random queries + queries = [f"Random Query {i}" for i in range(number_of_queries)] + + # Generate random embeddings + embeddings = [ + Embedding([random.random() for _ in range(DOC_EMBEDDING_DIM)]) + for _ in range(number_of_queries) + ] + + total_time = 0.0 + results = [] + for i in range(number_of_queries): + start_time = time.time() + + vespa_index.hybrid_retrieval( + query=queries[i], + query_embedding=embeddings[i], + final_keywords=None, + filters=_random_filters(), + hybrid_alpha=0.5, + time_decay_multiplier=1.0, + num_to_retrieve=50, + offset=0, + title_content_ratio=0.5, + ) + + end_time = time.time() + query_time = end_time - start_time + total_time += query_time + results.append(query_time) + + print(f"Query {i+1}: {query_time:.4f} seconds") + + avg_time = total_time / number_of_queries + fast_time = min(results) + slow_time = max(results) + ninety_ninth_percentile = get_slowest_99th_percentile(results) + # Write results to a file + _OUTPUT_PATH = "query_times_results_large_more.txt" + with open(_OUTPUT_PATH, "w") as f: + f.write(f"Average query time: {avg_time:.4f} seconds\n") + f.write(f"Fastest query: {fast_time:.4f} seconds\n") + f.write(f"Slowest query: {slow_time:.4f} seconds\n") + f.write(f"99th percentile: {ninety_ninth_percentile:.4f} seconds\n") + print(f"Results written to {_OUTPUT_PATH}") + + print(f"\nAverage query time: {avg_time:.4f} seconds") + print(f"Fastest query: {fast_time:.4f} seconds") + print(f"Slowest query: {slow_time:.4f} seconds") + print(f"99th percentile: {ninety_ninth_percentile:.4f} seconds") + + +if __name__ == "__main__": + test_hybrid_retrieval_times(number_of_queries=1000) diff --git a/backend/scripts/reset_indexes.py b/backend/scripts/reset_indexes.py index 4ec8d9bf312..1411a082499 100644 --- a/backend/scripts/reset_indexes.py +++ b/backend/scripts/reset_indexes.py @@ -1,8 +1,10 @@ # This file is purely for development use, not included in any builds import os import sys +from time import sleep import requests +from requests.exceptions import RequestException # makes it so `PYTHONPATH=.` is not required when running this script parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -15,22 +17,58 @@ logger = setup_logger() -def wipe_vespa_index() -> None: +def wipe_vespa_index() -> bool: + """ + Wipes the Vespa index by deleting all documents.
+ """ + continuation = None + should_continue = True + RETRIES = 3 + + while should_continue: + params = {"selection": "true", "cluster": DOCUMENT_INDEX_NAME} + if continuation: - params = {**params, "continuation": continuation} - response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params) - response.raise_for_status() + params["continuation"] = continuation + + for attempt in range(RETRIES): + try: + response = requests.delete(DOCUMENT_ID_ENDPOINT, params=params) + response.raise_for_status() + + response_json = response.json() + logger.info(f"Response: {response_json}") + + continuation = response_json.get("continuation") + should_continue = bool(continuation) + break  # Exit the retry loop if the request is successful + + except RequestException: + logger.exception("Request failed") + sleep(2**attempt)  # Exponential backoff + else: + logger.error(f"Max retries ({RETRIES}) exceeded. Exiting.") + return False + + return True + + +def main() -> int: + """ + Main function to execute the script. + """ + try: + succeeded = wipe_vespa_index() + except Exception: + logger.exception("wipe_vespa_index exceptioned.") + return 1 - response_json = response.json() - print(response_json) + if not succeeded: + logger.info("Vespa index wipe failed.") + return 1 - continuation = response_json.get("continuation") - should_continue = bool(continuation) + logger.info("Vespa index wiped successfully.") + return 0 if __name__ == "__main__": - wipe_vespa_index() + sys.exit(main()) diff --git a/backend/scripts/restart_containers.sh b/backend/scripts/restart_containers.sh index 838df5b5c79..06ed606c746 100755 --- a/backend/scripts/restart_containers.sh +++ b/backend/scripts/restart_containers.sh @@ -15,9 +15,9 @@ docker rm danswer_postgres danswer_vespa danswer_redis # Start the PostgreSQL container with optional volume echo "Starting PostgreSQL container..." if [[ -n "$POSTGRES_VOLUME" ]]; then - docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres + docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres -c max_connections=250 else - docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres + docker run -p 5432:5432 --name danswer_postgres -e POSTGRES_PASSWORD=password -d postgres -c max_connections=250 fi # Start the Vespa container with optional volume diff --git a/backend/shared_configs/configs.py b/backend/shared_configs/configs.py index ea37b031c7a..2f558629def 100644 --- a/backend/shared_configs/configs.py +++ b/backend/shared_configs/configs.py @@ -1,6 +1,10 @@ import os +from typing import Any +from typing import List from urllib.parse import urlparse +from shared_configs.model_server_models import SupportedEmbeddingModel + # Used for logging SLACK_CHANNEL_ID = "channel_id" @@ -59,6 +63,24 @@ # notset, debug, info, notice, warning, error, or critical LOG_LEVEL = os.environ.get("LOG_LEVEL", "notice") +# Timeout for API-based embedding models +# NOTE: does not apply for Google VertexAI, since the python client doesn't +# allow us to specify a custom timeout +API_BASED_EMBEDDING_TIMEOUT = int(os.environ.get("API_BASED_EMBEDDING_TIMEOUT", "600")) + +# Only used for OpenAI +OPENAI_EMBEDDING_TIMEOUT = int( + os.environ.get("OPENAI_EMBEDDING_TIMEOUT", API_BASED_EMBEDDING_TIMEOUT) +) + +# Whether or not to strictly enforce token limit for chunking.
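# --- Editor's aside (not part of the patch): the retry loop in wipe_vespa_index() above relies on
# Python's for/else -- the else branch runs only if the loop finished without hitting `break`,
# i.e. only when every attempt failed. A minimal sketch of that control flow:
def fetch_with_retries(attempts: list[bool]) -> bool:
    """Each element of `attempts` simulates whether that attempt succeeds."""
    for ok in attempts:
        if ok:
            break  # success: skips the else branch, like the break after raise_for_status() above
    else:
        return False  # reached only when no break happened, i.e. all attempts failed
    return True

assert fetch_with_retries([False, False, True]) is True
assert fetch_with_retries([False, False, False]) is False
# --- end editor's aside ---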
+STRICT_CHUNK_TOKEN_LIMIT = ( + os.environ.get("STRICT_CHUNK_TOKEN_LIMIT", "").lower() == "true" +) + +# Set up Sentry integration (for error logging) +SENTRY_DSN = os.environ.get("SENTRY_DSN") + # Fields which should only be set on new search setting PRESERVED_SEARCH_FIELDS = [ @@ -76,16 +98,130 @@ ] -# CORS def validate_cors_origin(origin: str) -> None: parsed = urlparse(origin) if parsed.scheme not in ["http", "https"] or not parsed.netloc: raise ValueError(f"Invalid CORS origin: '{origin}'") -CORS_ALLOWED_ORIGIN = os.environ.get("CORS_ALLOWED_ORIGIN", "*").split(",") or ["*"] +# Examples of valid values for the environment variable: +# - "" (allow all origins) +# - "http://example.com" (single origin) +# - "http://example.com,https://example.org" (multiple origins) +# - "*" (allow all origins) +CORS_ALLOWED_ORIGIN_ENV = os.environ.get("CORS_ALLOWED_ORIGIN", "") + +# Explicitly declare the type of CORS_ALLOWED_ORIGIN +CORS_ALLOWED_ORIGIN: List[str] + +if CORS_ALLOWED_ORIGIN_ENV: + # Split the environment variable into a list of origins + CORS_ALLOWED_ORIGIN = [ + origin.strip() + for origin in CORS_ALLOWED_ORIGIN_ENV.split(",") + if origin.strip() + ] + # Validate each origin in the list + for origin in CORS_ALLOWED_ORIGIN: + validate_cors_origin(origin) +else: + # If the environment variable is empty, allow all origins + CORS_ALLOWED_ORIGIN = ["*"] + + +# Multi-tenancy configuration +MULTI_TENANT = os.environ.get("MULTI_TENANT", "").lower() == "true" -# Validate non-wildcard origins -for origin in CORS_ALLOWED_ORIGIN: - if origin != "*" and (stripped_origin := origin.strip()): - validate_cors_origin(stripped_origin) +POSTGRES_DEFAULT_SCHEMA = os.environ.get("POSTGRES_DEFAULT_SCHEMA") or "public" + + +async def async_return_default_schema(*args: Any, **kwargs: Any) -> str: + return POSTGRES_DEFAULT_SCHEMA + + +# Prefix used for all tenant ids +TENANT_ID_PREFIX = "tenant_" + +DISALLOWED_SLACK_BOT_TENANT_IDS = os.environ.get("DISALLOWED_SLACK_BOT_TENANT_IDS") +DISALLOWED_SLACK_BOT_TENANT_LIST = ( + [tenant.strip() for tenant in DISALLOWED_SLACK_BOT_TENANT_IDS.split(",")] + if DISALLOWED_SLACK_BOT_TENANT_IDS + else None +) + +IGNORED_SYNCING_TENANT_IDS = os.environ.get("IGNORED_SYNCING_TENANT_IDS") +IGNORED_SYNCING_TENANT_LIST = ( + [tenant.strip() for tenant in IGNORED_SYNCING_TENANT_IDS.split(",")] + if IGNORED_SYNCING_TENANT_IDS + else None +) + +SUPPORTED_EMBEDDING_MODELS = [ + # Cloud-based models + SupportedEmbeddingModel( + name="cohere/embed-english-v3.0", + dim=1024, + index_name="danswer_chunk_cohere_embed_english_v3_0", + ), + SupportedEmbeddingModel( + name="cohere/embed-english-light-v3.0", + dim=384, + index_name="danswer_chunk_cohere_embed_english_light_v3_0", + ), + SupportedEmbeddingModel( + name="openai/text-embedding-3-large", + dim=3072, + index_name="danswer_chunk_openai_text_embedding_3_large", + ), + SupportedEmbeddingModel( + name="openai/text-embedding-3-small", + dim=1536, + index_name="danswer_chunk_openai_text_embedding_3_small", + ), + SupportedEmbeddingModel( + name="google/text-embedding-004", + dim=768, + index_name="danswer_chunk_google_text_embedding_004", + ), + SupportedEmbeddingModel( + name="google/textembedding-gecko@003", + dim=768, + index_name="danswer_chunk_google_textembedding_gecko_003", + ), + SupportedEmbeddingModel( + name="voyage/voyage-large-2-instruct", + dim=1024, + index_name="danswer_chunk_voyage_large_2_instruct", + ), + SupportedEmbeddingModel( + name="voyage/voyage-light-2-instruct", + dim=384, + 
index_name="danswer_chunk_voyage_light_2_instruct", + ), + # Self-hosted models + SupportedEmbeddingModel( + name="nomic-ai/nomic-embed-text-v1", + dim=768, + index_name="danswer_chunk_nomic_ai_nomic_embed_text_v1", + ), + SupportedEmbeddingModel( + name="intfloat/e5-base-v2", + dim=768, + index_name="danswer_chunk_intfloat_e5_base_v2", + ), + SupportedEmbeddingModel( + name="intfloat/e5-small-v2", + dim=384, + index_name="danswer_chunk_intfloat_e5_small_v2", + ), + SupportedEmbeddingModel( + name="intfloat/multilingual-e5-base", + dim=768, + index_name="danswer_chunk_intfloat_multilingual_e5_base", + ), + SupportedEmbeddingModel( + name="intfloat/multilingual-e5-small", + dim=384, + index_name="danswer_chunk_intfloat_multilingual_e5_small", + ), +] diff --git a/backend/shared_configs/contextvars.py b/backend/shared_configs/contextvars.py new file mode 100644 index 00000000000..df66b141c6e --- /dev/null +++ b/backend/shared_configs/contextvars.py @@ -0,0 +1,8 @@ +import contextvars + +from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA + +# Context variable for the current tenant id +CURRENT_TENANT_ID_CONTEXTVAR = contextvars.ContextVar( + "current_tenant_id", default=POSTGRES_DEFAULT_SCHEMA +) diff --git a/backend/shared_configs/enums.py b/backend/shared_configs/enums.py index b58ac0a8928..3fe1cd0bd01 100644 --- a/backend/shared_configs/enums.py +++ b/backend/shared_configs/enums.py @@ -7,6 +7,7 @@ class EmbeddingProvider(str, Enum): VOYAGE = "voyage" GOOGLE = "google" LITELLM = "litellm" + AZURE = "azure" class RerankerProvider(str, Enum): diff --git a/backend/shared_configs/model_server_models.py b/backend/shared_configs/model_server_models.py index dd846ed6bad..9f7e853d26a 100644 --- a/backend/shared_configs/model_server_models.py +++ b/backend/shared_configs/model_server_models.py @@ -20,6 +20,7 @@ class EmbedRequest(BaseModel): texts: list[str] # Can be none for cloud embedding model requests, error handling logic exists for other cases model_name: str | None = None + deployment_name: str | None = None max_context_length: int normalize_embeddings: bool api_key: str | None = None @@ -28,7 +29,7 @@ class EmbedRequest(BaseModel): manual_query_prefix: str | None = None manual_passage_prefix: str | None = None api_url: str | None = None - + api_version: str | None = None # This disables the "model_" protected namespace for pydantic model_config = {"protected_namespaces": ()} @@ -64,3 +65,9 @@ class IntentRequest(BaseModel): class IntentResponse(BaseModel): is_keyword: bool keywords: list[str] + + +class SupportedEmbeddingModel(BaseModel): + name: str + dim: int + index_name: str diff --git a/backend/supervisord.conf b/backend/supervisord.conf index ff055a78f93..c4a431b1e3a 100644 --- a/backend/supervisord.conf +++ b/backend/supervisord.conf @@ -3,16 +3,6 @@ nodaemon=true user=root logfile=/var/log/supervisord.log -# Indexing is the heaviest job, also requires some CPU intensive steps -# Cannot place this in Celery for now because Celery must run as a single process (see note below) -# Indexing uses multi-processing to speed things up -[program:document_indexing] -environment=CURRENT_PROCESS_IS_AN_INDEXING_JOB=true,LOG_FILE_NAME=document_indexing -command=python danswer/background/update.py -redirect_stderr=true -autorestart=true - - # Background jobs that must be run async due to long time to completion # NOTE: due to an issue with Celery + SQLAlchemy # (https://github.com/celery/celery/issues/7007#issuecomment-1740139367) @@ -24,23 +14,65 @@ autorestart=true # on a system, but 
this should be okay for now since all our celery tasks are # relatively compute-light (e.g. they tend to just make a bunch of requests to # Vespa / Postgres) -[program:celery_worker] -command=celery -A danswer.background.celery.celery_run:celery_app worker - --pool=threads - --concurrency=6 +[program:celery_worker_primary] +command=celery -A danswer.background.celery.versioned_apps.primary worker + --loglevel=INFO + --hostname=primary@%%n + -Q celery +stdout_logfile=/var/log/celery_worker_primary.log +stdout_logfile_maxbytes=16MB +redirect_stderr=true +autorestart=true +startsecs=10 +stopasgroup=true + +# NOTE: only allowing configuration here and not in the other celery workers, +# since this is often the bottleneck for "sync" jobs (e.g. document set syncing, +# user group syncing, deletion, etc.) +[program:celery_worker_light] +command=celery -A danswer.background.celery.versioned_apps.light worker + --loglevel=INFO + --hostname=light@%%n + -Q vespa_metadata_sync,connector_deletion,doc_permissions_upsert +stdout_logfile=/var/log/celery_worker_light.log +stdout_logfile_maxbytes=16MB +redirect_stderr=true +autorestart=true +startsecs=10 +stopasgroup=true + +[program:celery_worker_heavy] +command=celery -A danswer.background.celery.versioned_apps.heavy worker --loglevel=INFO - --logfile=/var/log/celery_worker_supervisor.log - -Q celery,vespa_metadata_sync,connector_deletion -environment=LOG_FILE_NAME=celery_worker + --hostname=heavy@%%n + -Q connector_pruning,connector_doc_permissions_sync,connector_external_group_sync +stdout_logfile=/var/log/celery_worker_heavy.log +stdout_logfile_maxbytes=16MB +redirect_stderr=true +autorestart=true +startsecs=10 +stopasgroup=true + +[program:celery_worker_indexing] +command=celery -A danswer.background.celery.versioned_apps.indexing worker + --loglevel=INFO + --hostname=indexing@%%n + -Q connector_indexing +stdout_logfile=/var/log/celery_worker_indexing.log +stdout_logfile_maxbytes=16MB redirect_stderr=true autorestart=true +startsecs=10 +stopasgroup=true # Job scheduler for periodic tasks [program:celery_beat] -command=celery -A danswer.background.celery.celery_run:celery_app beat - --logfile=/var/log/celery_beat_supervisor.log -environment=LOG_FILE_NAME=celery_beat +command=celery -A danswer.background.celery.versioned_apps.beat beat +stdout_logfile=/var/log/celery_beat.log +stdout_logfile_maxbytes=16MB redirect_stderr=true +startsecs=10 +stopasgroup=true # Listens for Slack messages and responds with answers # for all channels that the DanswerBot has been added to. 
@@ -48,7 +80,8 @@ redirect_stderr=true # More details on setup here: https://docs.danswer.dev/slack_bot_setup [program:slack_bot] command=python danswer/danswerbot/slack/listener.py -environment=LOG_FILE_NAME=slack_bot +stdout_logfile=/var/log/slack_bot.log +stdout_logfile_maxbytes=16MB redirect_stderr=true autorestart=true startretries=5 @@ -58,13 +91,12 @@ startsecs=60 # No log rotation here, since it's stdout it's handled by the Docker container logging [program:log-redirect-handler] command=tail -qF - /var/log/document_indexing_info.log - /var/log/celery_beat_supervisor.log - /var/log/celery_worker_supervisor.log - /var/log/celery_beat_debug.log - /var/log/celery_worker_debug.log - /var/log/slack_bot_debug.log + /var/log/celery_beat.log + /var/log/celery_worker_primary.log + /var/log/celery_worker_light.log + /var/log/celery_worker_heavy.log + /var/log/celery_worker_indexing.log + /var/log/slack_bot.log stdout_logfile=/dev/stdout -stdout_logfile_maxbytes=0 -redirect_stderr=true -autorestart=true \ No newline at end of file +stdout_logfile_maxbytes = 0 # must be set to 0 when stdout_logfile=/dev/stdout +autorestart=true diff --git a/backend/tests/daily/conftest.py b/backend/tests/daily/conftest.py new file mode 100644 index 00000000000..88a74c7b4ce --- /dev/null +++ b/backend/tests/daily/conftest.py @@ -0,0 +1,24 @@ +import os +from collections.abc import Generator +from typing import Any + +import pytest +from fastapi.testclient import TestClient + +from danswer.main import fetch_versioned_implementation +from danswer.utils.logger import setup_logger + +logger = setup_logger() + + +@pytest.fixture(scope="function") +def client() -> Generator[TestClient, Any, None]: + # Set environment variables + os.environ["ENABLE_PAID_ENTERPRISE_EDITION_FEATURES"] = "True" + + # Initialize TestClient with the FastAPI app + app = fetch_versioned_implementation( + module="danswer.main", attribute="get_application" + )() + client = TestClient(app) + yield client diff --git a/backend/tests/daily/connectors/confluence/test_confluence_basic.py b/backend/tests/daily/connectors/confluence/test_confluence_basic.py index 4eb25207814..4f928b91ee5 100644 --- a/backend/tests/daily/connectors/confluence/test_confluence_basic.py +++ b/backend/tests/daily/connectors/confluence/test_confluence_basic.py @@ -1,5 +1,7 @@ import os import time +from unittest.mock import MagicMock +from unittest.mock import patch import pytest @@ -24,25 +26,61 @@ def confluence_connector() -> ConfluenceConnector: return connector -def test_confluence_connector_basic(confluence_connector: ConfluenceConnector) -> None: +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_confluence_connector_basic( + mock_get_api_key: MagicMock, confluence_connector: ConfluenceConnector +) -> None: doc_batch_generator = confluence_connector.poll_source(0, time.time()) doc_batch = next(doc_batch_generator) with pytest.raises(StopIteration): next(doc_batch_generator) - assert len(doc_batch) == 1 + assert len(doc_batch) == 3 - doc = doc_batch[0] - assert doc.semantic_identifier == "DailyConnectorTestSpace Home" - assert doc.metadata["labels"] == ["testlabel"] - assert doc.primary_owners - assert doc.primary_owners[0].email == "chris@danswer.ai" - assert len(doc.sections) == 1 + for doc in doc_batch: + if doc.semantic_identifier == "DailyConnectorTestSpace Home": + page_doc = doc + elif ".txt" in doc.semantic_identifier: + txt_doc = doc + elif doc.semantic_identifier == "Page Within A Page": + 
page_within_a_page_doc = doc - section = doc.sections[0] - assert section.text == "test123small" + assert page_within_a_page_doc.semantic_identifier == "Page Within A Page" + assert page_within_a_page_doc.primary_owners + assert page_within_a_page_doc.primary_owners[0].email == "hagen@danswer.ai" + assert len(page_within_a_page_doc.sections) == 1 + + page_within_a_page_section = page_within_a_page_doc.sections[0] + page_within_a_page_text = "@Chris Weaver loves cherry pie" + assert page_within_a_page_section.text == page_within_a_page_text + assert ( + page_within_a_page_section.link + == "https://danswerai.atlassian.net/wiki/spaces/DailyConne/pages/200769540/Page+Within+A+Page" + ) + + assert page_doc.semantic_identifier == "DailyConnectorTestSpace Home" + assert page_doc.metadata["labels"] == ["testlabel"] + assert page_doc.primary_owners + assert page_doc.primary_owners[0].email == "hagen@danswer.ai" + assert len(page_doc.sections) == 1 + + page_section = page_doc.sections[0] + assert page_section.text == "test123 " + page_within_a_page_text assert ( - section.link + page_section.link == "https://danswerai.atlassian.net/wiki/spaces/DailyConne/overview" ) + + assert txt_doc.semantic_identifier == "small-file.txt" + assert len(txt_doc.sections) == 1 + assert txt_doc.sections[0].text == "small" + assert txt_doc.primary_owners + assert txt_doc.primary_owners[0].email == "chris@danswer.ai" + assert ( + txt_doc.sections[0].link + == "https://danswerai.atlassian.net/wiki/pages/viewpageattachments.action?pageId=52494430&preview=%2F52494430%2F52527123%2Fsmall-file.txt" + ) diff --git a/backend/tests/daily/connectors/gmail/conftest.py b/backend/tests/daily/connectors/gmail/conftest.py new file mode 100644 index 00000000000..5010d0b5133 --- /dev/null +++ b/backend/tests/daily/connectors/gmail/conftest.py @@ -0,0 +1,89 @@ +import json +import os +from collections.abc import Callable + +import pytest + +from danswer.connectors.gmail.connector import GmailConnector +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_TOKEN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, +) +from tests.load_env_vars import load_env_vars + + +# Load environment variables at the module level +load_env_vars() + + +def parse_credentials(env_str: str) -> dict: + """ + Parse a double-escaped JSON string from environment variables into a Python dictionary. 
+ + Args: + env_str (str): The double-escaped JSON string from environment variables + + Returns: + dict: Parsed OAuth credentials + """ + # first try normally + try: + return json.loads(env_str) + except Exception: + # First, try remove extra escaping backslashes + unescaped = env_str.replace('\\"', '"') + + # remove leading / trailing quotes + unescaped = unescaped.strip('"') + + # Now parse the JSON + return json.loads(unescaped) + + +@pytest.fixture +def google_gmail_oauth_connector_factory() -> Callable[..., GmailConnector]: + def _connector_factory( + primary_admin_email: str = "admin@onyx-test.com", + ) -> GmailConnector: + print("Creating GmailConnector with OAuth credentials") + connector = GmailConnector() + + json_string = os.environ["GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR"] + refried_json_string = json.dumps(parse_credentials(json_string)) + + credentials_json = { + DB_CREDENTIALS_DICT_TOKEN_KEY: refried_json_string, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email, + } + connector.load_credentials(credentials_json) + return connector + + return _connector_factory + + +@pytest.fixture +def google_gmail_service_acct_connector_factory() -> Callable[..., GmailConnector]: + def _connector_factory( + primary_admin_email: str = "admin@onyx-test.com", + ) -> GmailConnector: + print("Creating GmailConnector with service account credentials") + connector = GmailConnector() + + json_string = os.environ["GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR"] + refried_json_string = json.dumps(parse_credentials(json_string)) + + # Load Service Account Credentials + connector.load_credentials( + { + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: refried_json_string, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email, + } + ) + return connector + + return _connector_factory diff --git a/backend/tests/daily/connectors/gmail/test_gmail_connector.py b/backend/tests/daily/connectors/gmail/test_gmail_connector.py new file mode 100644 index 00000000000..73cab1136c7 --- /dev/null +++ b/backend/tests/daily/connectors/gmail/test_gmail_connector.py @@ -0,0 +1,125 @@ +from collections.abc import Callable +from typing import Any +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.connectors.gmail.connector import GmailConnector +from danswer.connectors.models import Document +from danswer.connectors.models import SlimDocument + + +_THREAD_1_START_TIME = 1730568700 +_THREAD_1_END_TIME = 1730569000 + +""" +This thread was 4 emails long: + admin@onyx-test.com -> test-group-1@onyx-test.com (conaining test_user_1 and test_user_2) + test_user_1@onyx-test.com -> admin@onyx-test.com + admin@onyx-test.com -> test_user_2@onyx-test.com + BCC: test_user_3@onyx-test.com + test_user_3@onyx-test.com -> admin@onyx-test.com +""" +_THREAD_1_BY_ID: dict[str, dict[str, Any]] = { + "192edefb315737c3": { + "email": "admin@onyx-test.com", + "sections_count": 4, + "primary_owners": set( + [ + "admin@onyx-test.com", + "test_user_1@onyx-test.com", + "test_user_3@onyx-test.com", + ] + ), + "secondary_owners": set( + [ + "test-group-1@onyx-test.com", + "admin@onyx-test.com", + "test_user_2@onyx-test.com", + "test_user_3@onyx-test.com", + ] + ), + }, + "192edf020d2f5def": { + "email": "test_user_1@onyx-test.com", + "sections_count": 2, + "primary_owners": set(["admin@onyx-test.com", "test_user_1@onyx-test.com"]), + "secondary_owners": set(["test-group-1@onyx-test.com", "admin@onyx-test.com"]), + }, + "192edf020ae90aab": { + "email": "test_user_2@onyx-test.com", + "sections_count": 2, + "primary_owners": 
set(["admin@onyx-test.com"]), + "secondary_owners": set( + ["test-group-1@onyx-test.com", "test_user_2@onyx-test.com"] + ), + }, + "192edf18316015fa": { + "email": "test_user_3@onyx-test.com", + "sections_count": 2, + "primary_owners": set(["admin@onyx-test.com", "test_user_3@onyx-test.com"]), + "secondary_owners": set( + [ + "admin@onyx-test.com", + "test_user_2@onyx-test.com", + "test_user_3@onyx-test.com", + ] + ), + }, +} + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_slim_docs_retrieval( + mock_get_api_key: MagicMock, + google_gmail_service_acct_connector_factory: Callable[..., GmailConnector], +) -> None: + print("\n\nRunning test_slim_docs_retrieval") + connector = google_gmail_service_acct_connector_factory() + retrieved_slim_docs: list[SlimDocument] = [] + for doc_batch in connector.retrieve_all_slim_documents( + _THREAD_1_START_TIME, _THREAD_1_END_TIME + ): + retrieved_slim_docs.extend(doc_batch) + + assert len(retrieved_slim_docs) == 4 + + for doc in retrieved_slim_docs: + permission_info = doc.perm_sync_data + assert isinstance(permission_info, dict) + user_email = permission_info["user_email"] + assert _THREAD_1_BY_ID[doc.id]["email"] == user_email + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_docs_retrieval( + mock_get_api_key: MagicMock, + google_gmail_service_acct_connector_factory: Callable[..., GmailConnector], +) -> None: + print("\n\nRunning test_docs_retrieval") + connector = google_gmail_service_acct_connector_factory() + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(_THREAD_1_START_TIME, _THREAD_1_END_TIME): + retrieved_docs.extend(doc_batch) + + assert len(retrieved_docs) == 4 + + for doc in retrieved_docs: + id = doc.id + if doc.primary_owners: + retrieved_primary_owner_emails = set( + [owner.email for owner in doc.primary_owners] + ) + if doc.secondary_owners: + retrieved_secondary_owner_emails = set( + [owner.email for owner in doc.secondary_owners] + ) + assert _THREAD_1_BY_ID[id]["sections_count"] == len(doc.sections) + assert _THREAD_1_BY_ID[id]["primary_owners"] == retrieved_primary_owner_emails + assert ( + _THREAD_1_BY_ID[id]["secondary_owners"] == retrieved_secondary_owner_emails + ) diff --git a/backend/tests/daily/connectors/google_drive/conftest.py b/backend/tests/daily/connectors/google_drive/conftest.py new file mode 100644 index 00000000000..4f525b24592 --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/conftest.py @@ -0,0 +1,129 @@ +import json +import os +from collections.abc import Callable + +import pytest + +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_DICT_TOKEN_KEY, +) +from danswer.connectors.google_utils.shared_constants import ( + DB_CREDENTIALS_PRIMARY_ADMIN_KEY, +) +from tests.load_env_vars import load_env_vars + + +# Load environment variables at the module level +load_env_vars() + + +_USER_TO_OAUTH_CREDENTIALS_MAP = { + "admin@onyx-test.com": "GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR", + "test_user_1@onyx-test.com": "GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1", +} + +_USER_TO_SERVICE_ACCOUNT_CREDENTIALS_MAP = { + "admin@onyx-test.com": "GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR", +} + + +def parse_credentials(env_str: str) -> 
dict: + """ + Parse a double-escaped JSON string from environment variables into a Python dictionary. + + Args: + env_str (str): The double-escaped JSON string from environment variables + + Returns: + dict: Parsed OAuth credentials + """ + # first try normally + try: + return json.loads(env_str) + except Exception: + # First, try remove extra escaping backslashes + unescaped = env_str.replace('\\"', '"') + + # remove leading / trailing quotes + unescaped = unescaped.strip('"') + + # Now parse the JSON + return json.loads(unescaped) + + +@pytest.fixture +def google_drive_oauth_connector_factory() -> Callable[..., GoogleDriveConnector]: + def _connector_factory( + primary_admin_email: str, + include_shared_drives: bool, + shared_drive_urls: str | None, + include_my_drives: bool, + my_drive_emails: str | None, + shared_folder_urls: str | None, + include_files_shared_with_me: bool, + ) -> GoogleDriveConnector: + print("Creating GoogleDriveConnector with OAuth credentials") + connector = GoogleDriveConnector( + include_shared_drives=include_shared_drives, + shared_drive_urls=shared_drive_urls, + include_my_drives=include_my_drives, + include_files_shared_with_me=include_files_shared_with_me, + my_drive_emails=my_drive_emails, + shared_folder_urls=shared_folder_urls, + ) + + json_string = os.environ[_USER_TO_OAUTH_CREDENTIALS_MAP[primary_admin_email]] + refried_json_string = json.dumps(parse_credentials(json_string)) + + credentials_json = { + DB_CREDENTIALS_DICT_TOKEN_KEY: refried_json_string, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email, + } + connector.load_credentials(credentials_json) + return connector + + return _connector_factory + + +@pytest.fixture +def google_drive_service_acct_connector_factory() -> ( + Callable[..., GoogleDriveConnector] +): + def _connector_factory( + primary_admin_email: str, + include_shared_drives: bool, + shared_drive_urls: str | None, + include_my_drives: bool, + my_drive_emails: str | None, + shared_folder_urls: str | None, + include_files_shared_with_me: bool, + ) -> GoogleDriveConnector: + print("Creating GoogleDriveConnector with service account credentials") + connector = GoogleDriveConnector( + include_shared_drives=include_shared_drives, + shared_drive_urls=shared_drive_urls, + include_my_drives=include_my_drives, + my_drive_emails=my_drive_emails, + shared_folder_urls=shared_folder_urls, + include_files_shared_with_me=include_files_shared_with_me, + ) + + json_string = os.environ[ + _USER_TO_SERVICE_ACCOUNT_CREDENTIALS_MAP[primary_admin_email] + ] + refried_json_string = json.dumps(parse_credentials(json_string)) + + # Load Service Account Credentials + connector.load_credentials( + { + DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: refried_json_string, + DB_CREDENTIALS_PRIMARY_ADMIN_KEY: primary_admin_email, + } + ) + return connector + + return _connector_factory diff --git a/backend/tests/daily/connectors/google_drive/consts_and_utils.py b/backend/tests/daily/connectors/google_drive/consts_and_utils.py new file mode 100644 index 00000000000..1df59a58101 --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/consts_and_utils.py @@ -0,0 +1,166 @@ +from collections.abc import Sequence + +from danswer.connectors.models import Document + +ALL_FILES = list(range(0, 60)) +SHARED_DRIVE_FILES = list(range(20, 25)) + + +ADMIN_FILE_IDS = list(range(0, 5)) +ADMIN_FOLDER_3_FILE_IDS = list(range(65, 70)) # This folder is shared with test_user_1 +TEST_USER_1_FILE_IDS = list(range(5, 10)) +TEST_USER_2_FILE_IDS = list(range(10, 15)) 
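# --- Editor's aside (not part of the patch): a concrete illustration of the "double-escaped"
# JSON strings that parse_credentials() in the conftest above handles. The credential value
# below is made up purely for the example; real values come from the CI environment.
import json

env_str = '{\\"client_id\\": \\"abc\\", \\"client_secret\\": \\"xyz\\"}'
try:
    creds = json.loads(env_str)  # fails: the inner quotes are still escaped
except json.JSONDecodeError:
    unescaped = env_str.replace('\\"', '"').strip('"')
    creds = json.loads(unescaped)
assert creds == {"client_id": "abc", "client_secret": "xyz"}
# --- end editor's aside ---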
+TEST_USER_3_FILE_IDS = list(range(15, 20)) +SHARED_DRIVE_1_FILE_IDS = list(range(20, 25)) +FOLDER_1_FILE_IDS = list(range(25, 30)) +FOLDER_1_1_FILE_IDS = list(range(30, 35)) +FOLDER_1_2_FILE_IDS = list(range(35, 40)) # This folder is public +SHARED_DRIVE_2_FILE_IDS = list(range(40, 45)) +FOLDER_2_FILE_IDS = list(range(45, 50)) +FOLDER_2_1_FILE_IDS = list(range(50, 55)) +FOLDER_2_2_FILE_IDS = list(range(55, 60)) +SECTIONS_FILE_IDS = [61] + +PUBLIC_FOLDER_RANGE = FOLDER_1_2_FILE_IDS +PUBLIC_FILE_IDS = list(range(55, 57)) +PUBLIC_RANGE = PUBLIC_FOLDER_RANGE + PUBLIC_FILE_IDS + +SHARED_DRIVE_1_URL = "https://drive.google.com/drive/folders/0AC_OJ4BkMd4kUk9PVA" +# Group 1 is given access to this folder +FOLDER_1_URL = ( + "https://drive.google.com/drive/folders/1d3I7U3vUZMDziF1OQqYRkB8Jp2s_GWUn" +) +FOLDER_1_1_URL = ( + "https://drive.google.com/drive/folders/1aR33-zwzl_mnRAwH55GgtWTE-4A4yWWI" +) +FOLDER_1_2_URL = ( + "https://drive.google.com/drive/folders/1IO0X55VhvLXf4mdxzHxuKf4wxrDBB6jq" +) +SHARED_DRIVE_2_URL = "https://drive.google.com/drive/folders/0ABKspIh7P4f4Uk9PVA" +FOLDER_2_URL = ( + "https://drive.google.com/drive/folders/1lNpCJ1teu8Se0louwL0oOHK9nEalskof" +) +FOLDER_2_1_URL = ( + "https://drive.google.com/drive/folders/1XeDOMWwxTDiVr9Ig2gKum3Zq_Wivv6zY" +) +FOLDER_2_2_URL = ( + "https://drive.google.com/drive/folders/1RKlsexA8h7NHvBAWRbU27MJotic7KXe3" +) +FOLDER_3_URL = ( + "https://drive.google.com/drive/folders/1LHibIEXfpUmqZ-XjBea44SocA91Nkveu" +) +SECTIONS_FOLDER_URL = ( + "https://drive.google.com/drive/u/5/folders/1loe6XJ-pJxu9YYPv7cF3Hmz296VNzA33" +) + +ADMIN_EMAIL = "admin@onyx-test.com" +TEST_USER_1_EMAIL = "test_user_1@onyx-test.com" +TEST_USER_2_EMAIL = "test_user_2@onyx-test.com" +TEST_USER_3_EMAIL = "test_user_3@onyx-test.com" + +# Dictionary for access permissions +# All users have access to their own My Drive as well as public files +ACCESS_MAPPING: dict[str, list[int]] = { + # Admin has access to everything in shared + ADMIN_EMAIL: ( + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS + ), + TEST_USER_1_EMAIL: ( + TEST_USER_1_FILE_IDS + # This user has access to drive 1 + + SHARED_DRIVE_1_FILE_IDS + # This user has redundant access to folder 1 because of group access + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + # This user has been given shared access to folder 3 in Admin's My Drive + + ADMIN_FOLDER_3_FILE_IDS + # This user has been given shared access to files 0 and 1 in Admin's My Drive + + list(range(0, 2)) + ), + TEST_USER_2_EMAIL: ( + TEST_USER_2_FILE_IDS + # Group 1 includes this user, giving access to folder 1 + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + # This folder is public + + FOLDER_1_2_FILE_IDS + # Folder 2-1 is shared with this user + + FOLDER_2_1_FILE_IDS + # This user has been given shared access to files 45 and 46 in folder 2 + + list(range(45, 47)) + ), + # This user can only see his own files and public files + TEST_USER_3_EMAIL: TEST_USER_3_FILE_IDS, +} + +SPECIAL_FILE_ID_TO_CONTENT_MAP: dict[int, str] = { + 61: ( + "Title\n\n" + "This is a Google Doc with sections - " + "Section 1\n\n" + "Section 1 content - " + "Sub-Section 1-1\n\n" + "Sub-Section 1-1 content - " + "Sub-Section 1-2\n\n" + "Sub-Section 1-2 content - " + "Section 2\n\n" + "Section 2 content" + ), +} + + +file_name_template = "file_{}.txt" 
+file_text_template = "This is file {}" + + +def print_discrepencies(expected: set[str], retrieved: set[str]) -> None: + if expected != retrieved: + print(expected) + print(retrieved) + print("Extra:") + print(retrieved - expected) + print("Missing:") + print(expected - retrieved) + + +def get_file_content(file_id: int) -> str: + if file_id in SPECIAL_FILE_ID_TO_CONTENT_MAP: + return SPECIAL_FILE_ID_TO_CONTENT_MAP[file_id] + + return file_text_template.format(file_id) + + +def assert_retrieved_docs_match_expected( + retrieved_docs: list[Document], expected_file_ids: Sequence[int] +) -> None: + expected_file_names = { + file_name_template.format(file_id) for file_id in expected_file_ids + } + expected_file_texts = {get_file_content(file_id) for file_id in expected_file_ids} + + retrieved_file_names = set([doc.semantic_identifier for doc in retrieved_docs]) + retrieved_texts = set( + [ + " - ".join([section.text for section in doc.sections]) + for doc in retrieved_docs + ] + ) + + # Check file names + print_discrepencies(expected_file_names, retrieved_file_names) + assert expected_file_names == retrieved_file_names + + # Check file texts + print_discrepencies(expected_file_texts, retrieved_texts) + assert expected_file_texts == retrieved_texts diff --git a/backend/tests/daily/connectors/google_drive/test_admin_oauth.py b/backend/tests/daily/connectors/google_drive/test_admin_oauth.py new file mode 100644 index 00000000000..74625ed0b2d --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/test_admin_oauth.py @@ -0,0 +1,292 @@ +import time +from collections.abc import Callable +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.models import Document +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ( + assert_retrieved_docs_match_expected, +) +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_URL +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS + + +@patch( + 
"danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_all( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_all") + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=True, + include_files_shared_with_me=False, + shared_folder_urls=None, + my_drive_emails=None, + shared_drive_urls=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should get everything in shared and admin's My Drive with oauth + expected_file_ids = ( + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_shared_drives_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_shared_drives_only") + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=None, + my_drive_emails=None, + shared_drive_urls=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should only get shared drives + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_my_drives_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_my_drives_only") + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=True, + include_files_shared_with_me=False, + shared_folder_urls=None, + my_drive_emails=None, + shared_drive_urls=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should only get primary_admins My Drive because we are impersonating them + expected_file_ids = ADMIN_FILE_IDS + ADMIN_FOLDER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_drive_one_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_drive_one_only") + drive_urls = 
[SHARED_DRIVE_1_URL] + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=None, + my_drive_emails=None, + shared_drive_urls=",".join([str(url) for url in drive_urls]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_folder_and_shared_drive( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_folder_and_shared_drive") + drive_urls = [SHARED_DRIVE_1_URL] + folder_urls = [FOLDER_2_URL] + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + my_drive_emails=None, + shared_drive_urls=",".join([str(url) for url in drive_urls]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_folders_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_folders_only") + folder_urls = [ + FOLDER_1_2_URL, + FOLDER_2_1_URL, + FOLDER_2_2_URL, + FOLDER_3_URL, + ] + # This should get converted to a drive request and spit out a warning in the logs + shared_drive_urls = [ + FOLDER_1_1_URL, + ] + connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + my_drive_emails=None, + shared_drive_urls=",".join([str(url) for url in shared_drive_urls]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_personal_folders_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_personal_folders_only") + folder_urls = [ + FOLDER_3_URL, + ] + connector = google_drive_oauth_connector_factory( + 
primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + my_drive_emails=None, + shared_drive_urls=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ADMIN_FOLDER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) diff --git a/backend/tests/daily/connectors/google_drive/test_sections.py b/backend/tests/daily/connectors/google_drive/test_sections.py new file mode 100644 index 00000000000..989bf9e9e7b --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/test_sections.py @@ -0,0 +1,76 @@ +import time +from collections.abc import Callable +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.models import Document +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FOLDER_URL + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_google_drive_sections( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + oauth_connector = google_drive_oauth_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=SECTIONS_FOLDER_URL, + shared_drive_urls=None, + my_drive_emails=None, + ) + service_acct_connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=SECTIONS_FOLDER_URL, + shared_drive_urls=None, + my_drive_emails=None, + ) + for connector in [oauth_connector, service_acct_connector]: + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Verify we got the 1 doc with sections + assert len(retrieved_docs) == 1 + + # Verify each section has the expected structure + doc = retrieved_docs[0] + assert len(doc.sections) == 5 + + header_section = doc.sections[0] + assert header_section.text == "Title\n\nThis is a Google Doc with sections" + assert header_section.link is not None + assert header_section.link.endswith( + "?tab=t.0#heading=h.hfjc17k6qwzt" + ) + + section_1 = doc.sections[1] + assert section_1.text == "Section 1\n\nSection 1 content" + assert section_1.link is not None + assert section_1.link.endswith("?tab=t.0#heading=h.8slfx752a3g5") + + section_2 = doc.sections[2] + assert section_2.text == "Sub-Section 1-1\n\nSub-Section 1-1 content" + assert section_2.link is not None + assert section_2.link.endswith("?tab=t.0#heading=h.4kj3ayade1bp") + + section_3 = doc.sections[3] + assert section_3.text == "Sub-Section 1-2\n\nSub-Section 1-2 content" + assert section_3.link is not None + assert section_3.link.endswith("?tab=t.0#heading=h.pm6wrpzgk69l") + + section_4 = doc.sections[4] + assert section_4.text == "Section 2\n\nSection 2 
content" + assert section_4.link is not None + assert section_4.link.endswith("?tab=t.0#heading=h.2m0s9youe2k9") diff --git a/backend/tests/daily/connectors/google_drive/test_service_acct.py b/backend/tests/daily/connectors/google_drive/test_service_acct.py new file mode 100644 index 00000000000..602849a41dc --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/test_service_acct.py @@ -0,0 +1,341 @@ +import time +from collections.abc import Callable +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.models import Document +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ( + assert_retrieved_docs_match_expected, +) +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_URL +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_all( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_all") + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=True, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should get everything + expected_file_ids = ( + 
ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + TEST_USER_1_FILE_IDS + + TEST_USER_2_FILE_IDS + + TEST_USER_3_FILE_IDS + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_shared_drives_only( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_shared_drives_only") + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should only get shared drives + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + SHARED_DRIVE_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + SECTIONS_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_include_my_drives_only( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_include_my_drives_only") + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=True, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should only get everyone's My Drives + expected_file_ids = ( + ADMIN_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + + TEST_USER_1_FILE_IDS + + TEST_USER_2_FILE_IDS + + TEST_USER_3_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_drive_one_only( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_drive_one_only") + urls = [SHARED_DRIVE_1_URL] + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=",".join([str(url) for url in urls]), + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Explicit shared_drive_urls are still honored even when include_shared_drives is False + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + 
FOLDER_1_2_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_folder_and_shared_drive( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_folder_and_shared_drive") + drive_urls = [SHARED_DRIVE_1_URL] + folder_urls = [FOLDER_2_URL] + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_drive_urls=",".join([str(url) for url in drive_urls]), + shared_folder_urls=",".join([str(url) for url in folder_urls]), + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # Should get everything except for the top level files in drive 2 + expected_file_ids = ( + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_folders_only( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_folders_only") + folder_urls = [ + FOLDER_1_2_URL, + FOLDER_2_1_URL, + FOLDER_2_2_URL, + FOLDER_3_URL, + ] + # This should get converted to a drive request and spit out a warning in the logs + shared_drive_urls = [ + FOLDER_1_1_URL, + ] + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_drive_urls=",".join([str(url) for url in shared_drive_urls]), + shared_folder_urls=",".join([str(url) for url in folder_urls]), + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + + FOLDER_2_1_FILE_IDS + + FOLDER_2_2_FILE_IDS + + ADMIN_FOLDER_3_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_specific_emails( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_specific_emails") + my_drive_emails = [ + TEST_USER_1_EMAIL, + TEST_USER_3_EMAIL, + ] + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=",".join([str(email) for email in my_drive_emails]), + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = TEST_USER_1_FILE_IDS + 
TEST_USER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def get_specific_folders_in_my_drive( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning get_specific_folders_in_my_drive") + folder_urls = [ + FOLDER_3_URL, + ] + connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=False, + include_my_drives=False, + include_files_shared_with_me=False, + shared_folder_urls=",".join([str(url) for url in folder_urls]), + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ADMIN_FOLDER_3_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) diff --git a/backend/tests/daily/connectors/google_drive/test_slim_docs.py b/backend/tests/daily/connectors/google_drive/test_slim_docs.py new file mode 100644 index 00000000000..7a836421317 --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/test_slim_docs.py @@ -0,0 +1,206 @@ +import time +from collections.abc import Callable +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.access.models import ExternalAccess +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval +from danswer.connectors.google_utils.resources import get_admin_service +from ee.danswer.external_permissions.google_drive.doc_sync import ( + _get_permissions_from_slim_doc, +) +from tests.daily.connectors.google_drive.consts_and_utils import ACCESS_MAPPING +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import file_name_template +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import print_discrepencies +from tests.daily.connectors.google_drive.consts_and_utils import PUBLIC_RANGE +from tests.daily.connectors.google_drive.consts_and_utils import SECTIONS_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_EMAIL +from 
tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_EMAIL +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_3_FILE_IDS + + +def get_keys_available_to_user_from_access_map( + user_email: str, + group_map: dict[str, list[str]], + access_map: dict[str, ExternalAccess], +) -> list[str]: + """ + Extracts the names of the files available to the user from the access map + through their own email or group memberships or public access + """ + group_emails_for_user = [] + for group_email, user_in_group_email_list in group_map.items(): + if user_email in user_in_group_email_list: + group_emails_for_user.append(group_email) + + accessible_file_names_for_user = [] + for file_name, external_access in access_map.items(): + if external_access.is_public: + accessible_file_names_for_user.append(file_name) + elif user_email in external_access.external_user_emails: + accessible_file_names_for_user.append(file_name) + elif any( + group_email in external_access.external_user_group_ids + for group_email in group_emails_for_user + ): + accessible_file_names_for_user.append(file_name) + return accessible_file_names_for_user + + +def assert_correct_access_for_user( + user_email: str, + expected_access_ids: list[int], + group_map: dict[str, list[str]], + retrieved_access_map: dict[str, ExternalAccess], +) -> None: + """ + compares the expected access range of the user to the keys available to the user + retrieved from the source + """ + retrieved_keys_available_to_user = get_keys_available_to_user_from_access_map( + user_email, group_map, retrieved_access_map + ) + retrieved_file_names = set(retrieved_keys_available_to_user) + + # Combine public and user-specific access IDs + all_accessible_ids = expected_access_ids + PUBLIC_RANGE + expected_file_names = {file_name_template.format(i) for i in all_accessible_ids} + + print_discrepencies(expected_file_names, retrieved_file_names) + + assert expected_file_names == retrieved_file_names + + +# This function is supposed to map to the group_sync.py file for the google drive connector +# TODO: Call it directly +def get_group_map(google_drive_connector: GoogleDriveConnector) -> dict[str, list[str]]: + admin_service = get_admin_service( + creds=google_drive_connector.creds, + user_email=google_drive_connector.primary_admin_email, + ) + + group_map: dict[str, list[str]] = {} + for group in execute_paginated_retrieval( + admin_service.groups().list, + list_key="groups", + domain=google_drive_connector.google_domain, + fields="groups(email)", + ): + # The id is the group email + group_email = group["email"] + + # Gather group member emails + group_member_emails: list[str] = [] + for member in execute_paginated_retrieval( + admin_service.members().list, + list_key="members", + groupKey=group_email, + fields="members(email)", + ): + group_member_emails.append(member["email"]) + group_map[group_email] = group_member_emails + return group_map + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_all_permissions( + mock_get_api_key: MagicMock, + google_drive_service_acct_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + google_drive_connector = google_drive_service_acct_connector_factory( + primary_admin_email=ADMIN_EMAIL, + include_shared_drives=True, + include_my_drives=True, + include_files_shared_with_me=False, + shared_folder_urls=None, + shared_drive_urls=None, + 
my_drive_emails=None, + ) + + access_map: dict[str, ExternalAccess] = {} + found_file_names = set() + for slim_doc_batch in google_drive_connector.retrieve_all_slim_documents( + 0, time.time() + ): + for slim_doc in slim_doc_batch: + name = (slim_doc.perm_sync_data or {})["name"] + access_map[name] = _get_permissions_from_slim_doc( + google_drive_connector=google_drive_connector, + slim_doc=slim_doc, + ) + found_file_names.add(name) + + for file_name, external_access in access_map.items(): + print(file_name, external_access) + + expected_file_range = ( + ADMIN_FILE_IDS # Admin's My Drive + + ADMIN_FOLDER_3_FILE_IDS # Admin's Folder 3 + + TEST_USER_1_FILE_IDS # TEST_USER_1's My Drive + + TEST_USER_2_FILE_IDS # TEST_USER_2's My Drive + + TEST_USER_3_FILE_IDS # TEST_USER_3's My Drive + + SHARED_DRIVE_1_FILE_IDS # Shared Drive 1 + + FOLDER_1_FILE_IDS # Folder 1 + + FOLDER_1_1_FILE_IDS # Folder 1_1 + + FOLDER_1_2_FILE_IDS # Folder 1_2 + + SHARED_DRIVE_2_FILE_IDS # Shared Drive 2 + + FOLDER_2_FILE_IDS # Folder 2 + + FOLDER_2_1_FILE_IDS # Folder 2_1 + + FOLDER_2_2_FILE_IDS # Folder 2_2 + + SECTIONS_FILE_IDS # Sections + ) + expected_file_names = { + file_name_template.format(file_id) for file_id in expected_file_range + } + + # Should get everything + print_discrepencies(expected_file_names, found_file_names) + assert expected_file_names == found_file_names + + group_map = get_group_map(google_drive_connector) + + print("groups:\n", group_map) + + assert_correct_access_for_user( + user_email=ADMIN_EMAIL, + expected_access_ids=ACCESS_MAPPING[ADMIN_EMAIL], + group_map=group_map, + retrieved_access_map=access_map, + ) + assert_correct_access_for_user( + user_email=TEST_USER_1_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_1_EMAIL], + group_map=group_map, + retrieved_access_map=access_map, + ) + + assert_correct_access_for_user( + user_email=TEST_USER_2_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_2_EMAIL], + group_map=group_map, + retrieved_access_map=access_map, + ) + assert_correct_access_for_user( + user_email=TEST_USER_3_EMAIL, + expected_access_ids=ACCESS_MAPPING[TEST_USER_3_EMAIL], + group_map=group_map, + retrieved_access_map=access_map, + ) diff --git a/backend/tests/daily/connectors/google_drive/test_user_1_oauth.py b/backend/tests/daily/connectors/google_drive/test_user_1_oauth.py new file mode 100644 index 00000000000..9f173536269 --- /dev/null +++ b/backend/tests/daily/connectors/google_drive/test_user_1_oauth.py @@ -0,0 +1,218 @@ +import time +from collections.abc import Callable +from unittest.mock import MagicMock +from unittest.mock import patch + +from danswer.connectors.google_drive.connector import GoogleDriveConnector +from danswer.connectors.models import Document +from tests.daily.connectors.google_drive.consts_and_utils import ADMIN_FOLDER_3_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import ( + assert_retrieved_docs_match_expected, +) +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_2_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_1_URL +from tests.daily.connectors.google_drive.consts_and_utils import FOLDER_3_URL +from tests.daily.connectors.google_drive.consts_and_utils import SHARED_DRIVE_1_FILE_IDS +from tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_EMAIL +from 
tests.daily.connectors.google_drive.consts_and_utils import TEST_USER_1_FILE_IDS + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_all( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_all") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + include_files_shared_with_me=True, + include_shared_drives=True, + include_my_drives=True, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + # These are the files from my drive + TEST_USER_1_FILE_IDS + # These are the files from shared drives + + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + # These are the files shared with me from admin + + ADMIN_FOLDER_3_FILE_IDS + + list(range(0, 2)) + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_shared_drives_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_shared_drives_only") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + include_files_shared_with_me=False, + include_shared_drives=True, + include_my_drives=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + # These are the files from shared drives + SHARED_DRIVE_1_FILE_IDS + + FOLDER_1_FILE_IDS + + FOLDER_1_1_FILE_IDS + + FOLDER_1_2_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_shared_with_me_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_shared_with_me_only") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + include_files_shared_with_me=True, + include_shared_drives=False, + include_my_drives=False, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + # These are the files shared with me from admin + ADMIN_FOLDER_3_FILE_IDS + + list(range(0, 2)) + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_my_drive_only( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_my_drive_only") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + 
include_files_shared_with_me=False, + include_shared_drives=False, + include_my_drives=True, + shared_folder_urls=None, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + # These are the files from my drive + expected_file_ids = TEST_USER_1_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_shared_my_drive_folder( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_shared_my_drive_folder") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + include_files_shared_with_me=False, + include_shared_drives=False, + include_my_drives=True, + shared_folder_urls=FOLDER_3_URL, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = ( + # this is a folder from admin's drive that is shared with me + ADMIN_FOLDER_3_FILE_IDS + ) + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) + + +@patch( + "danswer.file_processing.extract_file_text.get_unstructured_api_key", + return_value=None, +) +def test_shared_drive_folder( + mock_get_api_key: MagicMock, + google_drive_oauth_connector_factory: Callable[..., GoogleDriveConnector], +) -> None: + print("\n\nRunning test_shared_drive_folder") + connector = google_drive_oauth_connector_factory( + primary_admin_email=TEST_USER_1_EMAIL, + include_files_shared_with_me=False, + include_shared_drives=False, + include_my_drives=True, + shared_folder_urls=FOLDER_1_URL, + shared_drive_urls=None, + my_drive_emails=None, + ) + retrieved_docs: list[Document] = [] + for doc_batch in connector.poll_source(0, time.time()): + retrieved_docs.extend(doc_batch) + + expected_file_ids = FOLDER_1_FILE_IDS + FOLDER_1_1_FILE_IDS + FOLDER_1_2_FILE_IDS + assert_retrieved_docs_match_expected( + retrieved_docs=retrieved_docs, + expected_file_ids=expected_file_ids, + ) diff --git a/backend/tests/daily/connectors/jira/test_jira_basic.py b/backend/tests/daily/connectors/jira/test_jira_basic.py new file mode 100644 index 00000000000..19d69dfadcf --- /dev/null +++ b/backend/tests/daily/connectors/jira/test_jira_basic.py @@ -0,0 +1,48 @@ +import os +import time + +import pytest + +from danswer.configs.constants import DocumentSource +from danswer.connectors.danswer_jira.connector import JiraConnector + + +@pytest.fixture +def jira_connector() -> JiraConnector: + connector = JiraConnector( + "https://danswerai.atlassian.net/jira/software/c/projects/AS/boards/6", + comment_email_blacklist=[], + ) + connector.load_credentials( + { + "jira_user_email": os.environ["JIRA_USER_EMAIL"], + "jira_api_token": os.environ["JIRA_API_TOKEN"], + } + ) + return connector + + +def test_jira_connector_basic(jira_connector: JiraConnector) -> None: + doc_batch_generator = jira_connector.poll_source(0, time.time()) + + doc_batch = next(doc_batch_generator) + with pytest.raises(StopIteration): + next(doc_batch_generator) + + assert len(doc_batch) == 1 + + doc = doc_batch[0] + + assert doc.id == "https://danswerai.atlassian.net/browse/AS-2" + assert 
doc.semantic_identifier == "test123small" + assert doc.source == DocumentSource.JIRA + assert doc.metadata == {"priority": "Medium", "status": "Backlog"} + assert doc.secondary_owners is None + assert doc.title is None + assert doc.from_ingestion_api is False + assert doc.additional_info is None + + assert len(doc.sections) == 1 + section = doc.sections[0] + assert section.text == "example_text\n" + assert section.link == "https://danswerai.atlassian.net/browse/AS-2" diff --git a/backend/tests/daily/embedding/test_embeddings.py b/backend/tests/daily/embedding/test_embeddings.py index b736f374741..7182510214f 100644 --- a/backend/tests/daily/embedding/test_embeddings.py +++ b/backend/tests/daily/embedding/test_embeddings.py @@ -7,6 +7,7 @@ from shared_configs.model_server_models import EmbeddingProvider VALID_SAMPLE = ["hi", "hello my name is bob", "woah there!!!. 😃"] +VALID_LONG_SAMPLE = ["hi " * 999] # openai limit is 2048, cohere is supposed to be 96 but in practice that doesn't # seem to be true TOO_LONG_SAMPLE = ["a"] * 2500 @@ -61,6 +62,26 @@ def test_cohere_embedding(cohere_embedding_model: EmbeddingModel) -> None: _run_embeddings(TOO_LONG_SAMPLE, cohere_embedding_model, 384) +@pytest.fixture +def litellm_embedding_model() -> EmbeddingModel: + return EmbeddingModel( + server_host="localhost", + server_port=9000, + model_name="text-embedding-3-small", + normalize=True, + query_prefix=None, + passage_prefix=None, + api_key=os.getenv("LITE_LLM_API_KEY"), + provider_type=EmbeddingProvider.LITELLM, + api_url=os.getenv("LITE_LLM_API_URL"), + ) + + +def test_litellm_embedding(litellm_embedding_model: EmbeddingModel) -> None: + _run_embeddings(VALID_SAMPLE, litellm_embedding_model, 1536) + _run_embeddings(TOO_LONG_SAMPLE, litellm_embedding_model, 1536) + + @pytest.fixture def local_nomic_embedding_model() -> EmbeddingModel: return EmbeddingModel( @@ -79,3 +100,42 @@ def local_nomic_embedding_model() -> EmbeddingModel: def test_local_nomic_embedding(local_nomic_embedding_model: EmbeddingModel) -> None: _run_embeddings(VALID_SAMPLE, local_nomic_embedding_model, 768) _run_embeddings(TOO_LONG_SAMPLE, local_nomic_embedding_model, 768) + + +@pytest.fixture +def azure_embedding_model() -> EmbeddingModel: + return EmbeddingModel( + server_host="localhost", + server_port=9000, + model_name="text-embedding-3-large", + normalize=True, + query_prefix=None, + passage_prefix=None, + api_key=os.getenv("AZURE_API_KEY"), + provider_type=EmbeddingProvider.AZURE, + api_url=os.getenv("AZURE_API_URL"), + ) + + +# NOTE (chris): this test doesn't work, and I do not know why +# def test_azure_embedding_model_rate_limit(azure_embedding_model: EmbeddingModel): +# """NOTE: this test relies on a very low rate limit for the Azure API + +# this test only being run once in a 1 minute window""" +# # VALID_LONG_SAMPLE is 999 tokens, so the second call should run into rate +# # limits assuming the limit is 1000 tokens per minute +# result = azure_embedding_model.encode(VALID_LONG_SAMPLE, EmbedTextType.QUERY) +# assert len(result) == 1 +# assert len(result[0]) == 1536 + +# # this should fail +# with pytest.raises(ModelServerRateLimitError): +# azure_embedding_model.encode(VALID_LONG_SAMPLE, EmbedTextType.QUERY) +# azure_embedding_model.encode(VALID_LONG_SAMPLE, EmbedTextType.QUERY) +# azure_embedding_model.encode(VALID_LONG_SAMPLE, EmbedTextType.QUERY) + +# # this should succeed, since passage requests retry up to 10 times +# start = time.time() +# result = azure_embedding_model.encode(VALID_LONG_SAMPLE, 
EmbedTextType.PASSAGE) +# assert len(result) == 1 +# assert len(result[0]) == 1536 +# assert time.time() - start > 30 # make sure we waited, even though we hit rate limits diff --git a/backend/tests/daily/llm/test_bedrock.py b/backend/tests/daily/llm/test_bedrock.py new file mode 100644 index 00000000000..1d5022abf99 --- /dev/null +++ b/backend/tests/daily/llm/test_bedrock.py @@ -0,0 +1,81 @@ +import os +from typing import Any + +import pytest +from fastapi.testclient import TestClient + +from danswer.llm.llm_provider_options import BEDROCK_PROVIDER_NAME +from danswer.llm.llm_provider_options import fetch_available_well_known_llms +from danswer.llm.llm_provider_options import WellKnownLLMProviderDescriptor + + +@pytest.fixture +def bedrock_provider() -> WellKnownLLMProviderDescriptor: + provider = next( + ( + provider + for provider in fetch_available_well_known_llms() + if provider.name == BEDROCK_PROVIDER_NAME + ), + None, + ) + assert provider is not None, "Bedrock provider not found" + return provider + + +def test_bedrock_llm_configuration( + client: TestClient, bedrock_provider: WellKnownLLMProviderDescriptor +) -> None: + # Prepare the test request payload + test_request: dict[str, Any] = { + "provider": BEDROCK_PROVIDER_NAME, + "default_model_name": bedrock_provider.default_model, + "fast_default_model_name": bedrock_provider.default_fast_model, + "api_key": None, + "api_base": None, + "api_version": None, + "custom_config": { + "AWS_REGION_NAME": os.environ.get("AWS_REGION_NAME", "us-east-1"), + "AWS_ACCESS_KEY_ID": os.environ.get("AWS_ACCESS_KEY_ID"), + "AWS_SECRET_ACCESS_KEY": os.environ.get("AWS_SECRET_ACCESS_KEY"), + }, + } + + # Send the test request + response = client.post("/admin/llm/test", json=test_request) + + # Assert the response + assert ( + response.status_code == 200 + ), f"Expected status code 200, but got {response.status_code}. Response: {response.text}" + + +def test_bedrock_llm_configuration_invalid_key( + client: TestClient, bedrock_provider: WellKnownLLMProviderDescriptor +) -> None: + # Prepare the test request payload with invalid credentials + test_request: dict[str, Any] = { + "provider": BEDROCK_PROVIDER_NAME, + "default_model_name": bedrock_provider.default_model, + "fast_default_model_name": bedrock_provider.default_fast_model, + "api_key": None, + "api_base": None, + "api_version": None, + "custom_config": { + "AWS_REGION_NAME": "us-east-1", + "AWS_ACCESS_KEY_ID": "invalid_access_key_id", + "AWS_SECRET_ACCESS_KEY": "invalid_secret_access_key", + }, + } + + # Send the test request + response = client.post("/admin/llm/test", json=test_request) + + # Assert the response + assert ( + response.status_code == 400 + ), f"Expected status code 400, but got {response.status_code}. 
Response: {response.text}" + assert ( + "Invalid credentials" in response.text + or "Invalid Authentication" in response.text + ), f"Expected error message about invalid credentials, but got: {response.text}" diff --git a/backend/tests/integration/Dockerfile b/backend/tests/integration/Dockerfile index ebd5e11b8a7..3eecb0d5683 100644 --- a/backend/tests/integration/Dockerfile +++ b/backend/tests/integration/Dockerfile @@ -72,6 +72,7 @@ COPY ./danswer /app/danswer COPY ./shared_configs /app/shared_configs COPY ./alembic /app/alembic COPY ./alembic.ini /app/alembic.ini +COPY ./pytest.ini /app/pytest.ini COPY supervisord.conf /usr/etc/supervisord.conf # Integration test stuff @@ -80,6 +81,7 @@ RUN pip install --no-cache-dir --upgrade \ -r /tmp/dev-requirements.txt COPY ./tests/integration /app/tests/integration -ENV PYTHONPATH /app +ENV PYTHONPATH=/app -CMD ["pytest", "-s", "/app/tests/integration"] +ENTRYPOINT ["pytest", "-s"] +CMD ["/app/tests/integration", "--ignore=/app/tests/integration/multitenant_tests"] \ No newline at end of file diff --git a/backend/tests/integration/common_utils/constants.py b/backend/tests/integration/common_utils/constants.py index 7d729191cf6..57db1ad9a32 100644 --- a/backend/tests/integration/common_utils/constants.py +++ b/backend/tests/integration/common_utils/constants.py @@ -4,7 +4,7 @@ API_SERVER_HOST = os.getenv("API_SERVER_HOST") or "localhost" API_SERVER_PORT = os.getenv("API_SERVER_PORT") or "8080" API_SERVER_URL = f"{API_SERVER_PROTOCOL}://{API_SERVER_HOST}:{API_SERVER_PORT}" -MAX_DELAY = 30 +MAX_DELAY = 45 GENERAL_HEADERS = {"Content-Type": "application/json"} diff --git a/backend/tests/integration/common_utils/managers/api_key.py b/backend/tests/integration/common_utils/managers/api_key.py index b6d2c29b732..9df6b1c4f2f 100644 --- a/backend/tests/integration/common_utils/managers/api_key.py +++ b/backend/tests/integration/common_utils/managers/api_key.py @@ -3,11 +3,11 @@ import requests from danswer.db.models import UserRole -from ee.danswer.server.api_key.models import APIKeyArgs +from danswer.server.api_key.models import APIKeyArgs from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS -from tests.integration.common_utils.test_models import TestAPIKey -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestUser class APIKeyManager: @@ -15,8 +15,8 @@ class APIKeyManager: def create( name: str | None = None, api_key_role: UserRole = UserRole.ADMIN, - user_performing_action: TestUser | None = None, - ) -> TestAPIKey: + user_performing_action: DATestUser | None = None, + ) -> DATestAPIKey: name = f"{name}-api-key" if name else f"test-api-key-{uuid4()}" api_key_request = APIKeyArgs( name=name, @@ -31,7 +31,7 @@ def create( ) api_key_response.raise_for_status() api_key = api_key_response.json() - result_api_key = TestAPIKey( + result_api_key = DATestAPIKey( api_key_id=api_key["api_key_id"], api_key_display=api_key["api_key_display"], api_key=api_key["api_key"], @@ -45,8 +45,8 @@ def create( @staticmethod def delete( - api_key: TestAPIKey, - user_performing_action: TestUser | None = None, + api_key: DATestAPIKey, + user_performing_action: DATestUser | None = None, ) -> None: api_key_response = requests.delete( f"{API_SERVER_URL}/admin/api-key/{api_key.api_key_id}", @@ -58,8 +58,8 @@ def delete( @staticmethod def get_all( - 
user_performing_action: TestUser | None = None, - ) -> list[TestAPIKey]: + user_performing_action: DATestUser | None = None, + ) -> list[DATestAPIKey]: api_key_response = requests.get( f"{API_SERVER_URL}/admin/api-key", headers=user_performing_action.headers @@ -67,13 +67,13 @@ def get_all( else GENERAL_HEADERS, ) api_key_response.raise_for_status() - return [TestAPIKey(**api_key) for api_key in api_key_response.json()] + return [DATestAPIKey(**api_key) for api_key in api_key_response.json()] @staticmethod def verify( - api_key: TestAPIKey, + api_key: DATestAPIKey, verify_deleted: bool = False, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: retrieved_keys = APIKeyManager.get_all( user_performing_action=user_performing_action diff --git a/backend/tests/integration/common_utils/managers/cc_pair.py b/backend/tests/integration/common_utils/managers/cc_pair.py index 6498252bbe8..b37822d3496 100644 --- a/backend/tests/integration/common_utils/managers/cc_pair.py +++ b/backend/tests/integration/common_utils/managers/cc_pair.py @@ -1,36 +1,40 @@ import time +from datetime import datetime from typing import Any from uuid import uuid4 import requests from danswer.connectors.models import InputType +from danswer.db.enums import AccessType from danswer.db.enums import ConnectorCredentialPairStatus +from danswer.server.documents.models import CCPairFullInfo from danswer.server.documents.models import ConnectorCredentialPairIdentifier from danswer.server.documents.models import ConnectorIndexingStatus from danswer.server.documents.models import DocumentSource +from danswer.server.documents.models import DocumentSyncStatus from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS from tests.integration.common_utils.constants import MAX_DELAY from tests.integration.common_utils.managers.connector import ConnectorManager from tests.integration.common_utils.managers.credential import CredentialManager -from tests.integration.common_utils.test_models import TestCCPair -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestUser def _cc_pair_creator( connector_id: int, credential_id: int, name: str | None = None, - is_public: bool = True, + access_type: AccessType = AccessType.PUBLIC, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, -) -> TestCCPair: + user_performing_action: DATestUser | None = None, +) -> DATestCCPair: name = f"{name}-cc-pair" if name else f"test-cc-pair-{uuid4()}" request = { "name": name, - "is_public": is_public, + "access_type": access_type, "groups": groups or [], } @@ -42,12 +46,12 @@ def _cc_pair_creator( else GENERAL_HEADERS, ) response.raise_for_status() - return TestCCPair( + return DATestCCPair( id=response.json()["data"], name=name, connector_id=connector_id, credential_id=credential_id, - is_public=is_public, + access_type=access_type, groups=groups or [], ) @@ -56,20 +60,20 @@ class CCPairManager: @staticmethod def create_from_scratch( name: str | None = None, - is_public: bool = True, + access_type: AccessType = AccessType.PUBLIC, groups: list[int] | None = None, source: DocumentSource = DocumentSource.FILE, input_type: InputType = InputType.LOAD_STATE, connector_specific_config: dict[str, Any] | None = None, credential_json: dict[str, Any] | None = None, - 
user_performing_action: TestUser | None = None, - ) -> TestCCPair: + user_performing_action: DATestUser | None = None, + ) -> DATestCCPair: connector = ConnectorManager.create( name=name, source=source, input_type=input_type, connector_specific_config=connector_specific_config, - is_public=is_public, + access_type=access_type, groups=groups, user_performing_action=user_performing_action, ) @@ -77,41 +81,43 @@ def create_from_scratch( credential_json=credential_json, name=name, source=source, - curator_public=is_public, + curator_public=(access_type == AccessType.PUBLIC), groups=groups, user_performing_action=user_performing_action, ) - return _cc_pair_creator( + cc_pair = _cc_pair_creator( connector_id=connector.id, credential_id=credential.id, name=name, - is_public=is_public, + access_type=access_type, groups=groups, user_performing_action=user_performing_action, ) + return cc_pair @staticmethod def create( connector_id: int, credential_id: int, name: str | None = None, - is_public: bool = True, + access_type: AccessType = AccessType.PUBLIC, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestCCPair: - return _cc_pair_creator( + user_performing_action: DATestUser | None = None, + ) -> DATestCCPair: + cc_pair = _cc_pair_creator( connector_id=connector_id, credential_id=credential_id, name=name, - is_public=is_public, + access_type=access_type, groups=groups, user_performing_action=user_performing_action, ) + return cc_pair @staticmethod def pause_cc_pair( - cc_pair: TestCCPair, - user_performing_action: TestUser | None = None, + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, ) -> None: result = requests.put( url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/status", @@ -124,8 +130,8 @@ def pause_cc_pair( @staticmethod def delete( - cc_pair: TestCCPair, - user_performing_action: TestUser | None = None, + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, ) -> None: cc_pair_identifier = ConnectorCredentialPairIdentifier( connector_id=cc_pair.connector_id, @@ -141,8 +147,42 @@ def delete( result.raise_for_status() @staticmethod - def get_all( - user_performing_action: TestUser | None = None, + def get_single( + cc_pair_id: int, + user_performing_action: DATestUser | None = None, + ) -> CCPairFullInfo | None: + response = requests.get( + f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair_id}", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + cc_pair_json = response.json() + return CCPairFullInfo(**cc_pair_json) + + @staticmethod + def get_indexing_status_by_id( + cc_pair_id: int, + user_performing_action: DATestUser | None = None, + ) -> ConnectorIndexingStatus | None: + response = requests.get( + f"{API_SERVER_URL}/manage/admin/connector/indexing-status", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + for cc_pair_json in response.json(): + cc_pair = ConnectorIndexingStatus(**cc_pair_json) + if cc_pair.cc_pair_id == cc_pair_id: + return cc_pair + + return None + + @staticmethod + def get_indexing_statuses( + user_performing_action: DATestUser | None = None, ) -> list[ConnectorIndexingStatus]: response = requests.get( f"{API_SERVER_URL}/manage/admin/connector/indexing-status", @@ -155,11 +195,11 @@ def get_all( @staticmethod def verify( - cc_pair: TestCCPair, + cc_pair: DATestCCPair, verify_deleted: bool = False, - 
user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: - all_cc_pairs = CCPairManager.get_all(user_performing_action) + all_cc_pairs = CCPairManager.get_indexing_statuses(user_performing_action) for retrieved_cc_pair in all_cc_pairs: if retrieved_cc_pair.cc_pair_id == cc_pair.id: if verify_deleted: @@ -172,7 +212,7 @@ def verify( retrieved_cc_pair.name == cc_pair.name and retrieved_cc_pair.connector.id == cc_pair.connector_id and retrieved_cc_pair.credential.id == cc_pair.credential_id - and retrieved_cc_pair.public_doc == cc_pair.is_public + and retrieved_cc_pair.access_type == cc_pair.access_type and set(retrieved_cc_pair.groups) == set(cc_pair.groups) ): return @@ -180,20 +220,295 @@ def verify( if not verify_deleted: raise ValueError(f"CC pair {cc_pair.id} not found") + @staticmethod + def run_once( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> None: + body = { + "connector_id": cc_pair.connector_id, + "credential_ids": [cc_pair.credential_id], + "from_beginning": True, + } + result = requests.post( + url=f"{API_SERVER_URL}/manage/admin/connector/run-once", + json=body, + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + result.raise_for_status() + + @staticmethod + def wait_for_indexing( + cc_pair: DATestCCPair, + after: datetime, + timeout: float = MAX_DELAY, + user_performing_action: DATestUser | None = None, + ) -> None: + """after: Wait for an indexing success time after this time""" + start = time.monotonic() + while True: + fetched_cc_pairs = CCPairManager.get_indexing_statuses( + user_performing_action + ) + for fetched_cc_pair in fetched_cc_pairs: + if fetched_cc_pair.cc_pair_id != cc_pair.id: + continue + + if fetched_cc_pair.in_progress: + continue + + if ( + fetched_cc_pair.last_success + and fetched_cc_pair.last_success > after + ): + print(f"Indexing complete: cc_pair={cc_pair.id}") + return + + elapsed = time.monotonic() - start + if elapsed > timeout: + raise TimeoutError( + f"Indexing wait timed out: cc_pair={cc_pair.id} timeout={timeout}s" + ) + + print( + f"Indexing wait for completion: cc_pair={cc_pair.id} elapsed={elapsed:.2f} timeout={timeout}s" + ) + time.sleep(5) + + @staticmethod + def prune( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> None: + result = requests.post( + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/prune", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + result.raise_for_status() + + @staticmethod + def last_pruned( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> datetime | None: + response = requests.get( + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/last_pruned", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + response_str = response.json() + + # If the response itself is a datetime string, parse it + if not isinstance(response_str, str): + return None + + try: + return datetime.fromisoformat(response_str) + except ValueError: + return None + + @staticmethod + def wait_for_prune( + cc_pair: DATestCCPair, + after: datetime, + timeout: float = MAX_DELAY, + user_performing_action: DATestUser | None = None, + ) -> None: + """after: The task register time must be after this time.""" + start = time.monotonic() + while True: + last_pruned = CCPairManager.last_pruned(cc_pair, 
user_performing_action) + if last_pruned and last_pruned > after: + print(f"Pruning complete: cc_pair={cc_pair.id}") + break + + elapsed = time.monotonic() - start + if elapsed > timeout: + raise TimeoutError( + f"CC pair pruning was not completed within {timeout} seconds" + ) + + print( + f"Waiting for CC pruning to complete. elapsed={elapsed:.2f} timeout={timeout}" + ) + time.sleep(5) + + @staticmethod + def sync( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> None: + """This function triggers a permission sync. + Naming / intent of this function probably could use improvement, but currently it's letting + 409 Conflict pass through since if it's running that's what we were trying to do anyway. + """ + result = requests.post( + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/sync-permissions", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + # + if result.status_code != 409: + result.raise_for_status() + + @staticmethod + def get_sync_task( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> datetime | None: + response = requests.get( + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/sync-permissions", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + response_str = response.json() + + # If the response itself is a datetime string, parse it + if not isinstance(response_str, str): + return None + + try: + return datetime.fromisoformat(response_str) + except ValueError: + return None + + @staticmethod + def get_doc_sync_statuses( + cc_pair: DATestCCPair, + user_performing_action: DATestUser | None = None, + ) -> list[DocumentSyncStatus]: + response = requests.get( + url=f"{API_SERVER_URL}/manage/admin/cc-pair/{cc_pair.id}/get-docs-sync-status", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + doc_sync_statuses: list[DocumentSyncStatus] = [] + for doc_sync_status in response.json(): + last_synced = doc_sync_status.get("last_synced") + if last_synced: + last_synced = datetime.fromisoformat(last_synced) + + last_modified = doc_sync_status.get("last_modified") + if last_modified: + last_modified = datetime.fromisoformat(last_modified) + + doc_sync_statuses.append( + DocumentSyncStatus( + doc_id=doc_sync_status["doc_id"], + last_synced=last_synced, + last_modified=last_modified, + ) + ) + + return doc_sync_statuses + + @staticmethod + def wait_for_sync( + cc_pair: DATestCCPair, + after: datetime, + timeout: float = MAX_DELAY, + number_of_updated_docs: int = 0, + user_performing_action: DATestUser | None = None, + ) -> None: + """after: The task register time must be after this time.""" + start = time.monotonic() + while True: + last_synced = CCPairManager.get_sync_task(cc_pair, user_performing_action) + if last_synced and last_synced > after: + print(f"last_synced: {last_synced}") + print(f"sync command start time: {after}") + print(f"permission sync complete: cc_pair={cc_pair.id}") + break + + elapsed = time.monotonic() - start + if elapsed > timeout: + raise TimeoutError( + f"Permission sync was not completed within {timeout} seconds" + ) + + print( + f"Waiting for CC sync to complete. 
elapsed={elapsed:.2f} timeout={timeout}" + ) + time.sleep(5) + + # TODO: remove this sleep, + # this shouldnt be necessary but something is off with the timing for the sync jobs + time.sleep(5) + + print("waiting for vespa sync") + # wait for the vespa sync to complete once the permission sync is complete + start = time.monotonic() + while True: + doc_sync_statuses = CCPairManager.get_doc_sync_statuses( + cc_pair=cc_pair, + user_performing_action=user_performing_action, + ) + synced_docs = 0 + for doc_sync_status in doc_sync_statuses: + if ( + doc_sync_status.last_synced is not None + and doc_sync_status.last_modified is not None + and doc_sync_status.last_synced >= doc_sync_status.last_modified + and doc_sync_status.last_synced >= after + and doc_sync_status.last_modified >= after + ): + synced_docs += 1 + + if synced_docs >= number_of_updated_docs: + print(f"all docs synced: cc_pair={cc_pair.id}") + break + + elapsed = time.monotonic() - start + if elapsed > timeout: + raise TimeoutError( + f"Vespa sync was not completed within {timeout} seconds" + ) + + print( + f"Waiting for vespa sync to complete. elapsed={elapsed:.2f} timeout={timeout}" + ) + time.sleep(5) + @staticmethod def wait_for_deletion_completion( - user_performing_action: TestUser | None = None, + cc_pair_id: int | None = None, + user_performing_action: DATestUser | None = None, ) -> None: - start = time.time() + """if cc_pair_id is not specified, just waits until no connectors are in the deleting state. + if cc_pair_id is specified, checks to ensure the specific cc_pair_id is gone. + We had a bug where the connector was paused in the middle of deleting, so specifying the + cc_pair_id is good to do.""" + start = time.monotonic() while True: - cc_pairs = CCPairManager.get_all(user_performing_action) - if all( - cc_pair.cc_pair_status != ConnectorCredentialPairStatus.DELETING - for cc_pair in cc_pairs - ): - return - - if time.time() - start > MAX_DELAY: + cc_pairs = CCPairManager.get_indexing_statuses(user_performing_action) + if cc_pair_id: + found = False + for cc_pair in cc_pairs: + if cc_pair.cc_pair_id == cc_pair_id: + found = True + break + + if not found: + return + else: + if all( + cc_pair.cc_pair_status != ConnectorCredentialPairStatus.DELETING + for cc_pair in cc_pairs + ): + return + + if time.monotonic() - start > MAX_DELAY: raise TimeoutError( f"CC pairs deletion was not completed within the {MAX_DELAY} seconds" ) diff --git a/backend/tests/integration/common_utils/managers/chat.py b/backend/tests/integration/common_utils/managers/chat.py index 3d62817641d..106aa26a791 100644 --- a/backend/tests/integration/common_utils/managers/chat.py +++ b/backend/tests/integration/common_utils/managers/chat.py @@ -1,31 +1,32 @@ import json +from uuid import UUID import requests from requests.models import Response +from danswer.context.search.models import RetrievalDetails from danswer.file_store.models import FileDescriptor from danswer.llm.override_models import LLMOverride from danswer.llm.override_models import PromptOverride from danswer.one_shot_answer.models import DirectQARequest from danswer.one_shot_answer.models import ThreadMessage -from danswer.search.models import RetrievalDetails from danswer.server.query_and_chat.models import ChatSessionCreationRequest from danswer.server.query_and_chat.models import CreateChatMessageRequest from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models 
import DATestChatMessage +from tests.integration.common_utils.test_models import DATestChatSession +from tests.integration.common_utils.test_models import DATestUser from tests.integration.common_utils.test_models import StreamedResponse -from tests.integration.common_utils.test_models import TestChatMessage -from tests.integration.common_utils.test_models import TestChatSession -from tests.integration.common_utils.test_models import TestUser class ChatSessionManager: @staticmethod def create( - persona_id: int = -1, + persona_id: int = 0, description: str = "Test chat session", - user_performing_action: TestUser | None = None, - ) -> TestChatSession: + user_performing_action: DATestUser | None = None, + ) -> DATestChatSession: chat_session_creation_req = ChatSessionCreationRequest( persona_id=persona_id, description=description ) @@ -38,16 +39,16 @@ def create( ) response.raise_for_status() chat_session_id = response.json()["chat_session_id"] - return TestChatSession( + return DATestChatSession( id=chat_session_id, persona_id=persona_id, description=description ) @staticmethod def send_message( - chat_session_id: int, + chat_session_id: UUID, message: str, parent_message_id: int | None = None, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, file_descriptors: list[FileDescriptor] = [], prompt_id: int | None = None, search_doc_ids: list[int] | None = None, @@ -90,7 +91,7 @@ def send_message( def get_answer_with_quote( persona_id: int, message: str, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> StreamedResponse: direct_qa_request = DirectQARequest( messages=[ThreadMessage(message=message)], @@ -137,11 +138,11 @@ def analyze_response(response: Response) -> StreamedResponse: @staticmethod def get_chat_history( - chat_session: TestChatSession, - user_performing_action: TestUser | None = None, - ) -> list[TestChatMessage]: + chat_session: DATestChatSession, + user_performing_action: DATestUser | None = None, + ) -> list[DATestChatMessage]: response = requests.get( - f"{API_SERVER_URL}/chat/history/{chat_session.id}", + f"{API_SERVER_URL}/chat/get-chat-session/{chat_session.id}", headers=user_performing_action.headers if user_performing_action else GENERAL_HEADERS, @@ -149,12 +150,11 @@ def get_chat_history( response.raise_for_status() return [ - TestChatMessage( - id=msg["id"], + DATestChatMessage( + id=msg["message_id"], chat_session_id=chat_session.id, - parent_message_id=msg.get("parent_message_id"), + parent_message_id=msg.get("parent_message"), message=msg["message"], - response=msg.get("response", ""), ) - for msg in response.json() + for msg in response.json()["messages"] ] diff --git a/backend/tests/integration/common_utils/managers/connector.py b/backend/tests/integration/common_utils/managers/connector.py index f72d079683b..5c090c387dd 100644 --- a/backend/tests/integration/common_utils/managers/connector.py +++ b/backend/tests/integration/common_utils/managers/connector.py @@ -4,12 +4,13 @@ import requests from danswer.connectors.models import InputType +from danswer.db.enums import AccessType from danswer.server.documents.models import ConnectorUpdateRequest from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS -from tests.integration.common_utils.test_models import TestConnector -from tests.integration.common_utils.test_models 
import TestUser +from tests.integration.common_utils.test_models import DATestConnector +from tests.integration.common_utils.test_models import DATestUser class ConnectorManager: @@ -19,10 +20,10 @@ def create( source: DocumentSource = DocumentSource.FILE, input_type: InputType = InputType.LOAD_STATE, connector_specific_config: dict[str, Any] | None = None, - is_public: bool = True, + access_type: AccessType = AccessType.PUBLIC, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestConnector: + user_performing_action: DATestUser | None = None, + ) -> DATestConnector: name = f"{name}-connector" if name else f"test-connector-{uuid4()}" connector_update_request = ConnectorUpdateRequest( @@ -30,7 +31,7 @@ def create( source=source, input_type=input_type, connector_specific_config=connector_specific_config or {}, - is_public=is_public, + access_type=access_type, groups=groups or [], ) @@ -44,20 +45,20 @@ def create( response.raise_for_status() response_data = response.json() - return TestConnector( + return DATestConnector( id=response_data.get("id"), name=name, source=source, input_type=input_type, connector_specific_config=connector_specific_config or {}, groups=groups, - is_public=is_public, + access_type=access_type, ) @staticmethod def edit( - connector: TestConnector, - user_performing_action: TestUser | None = None, + connector: DATestConnector, + user_performing_action: DATestUser | None = None, ) -> None: response = requests.patch( url=f"{API_SERVER_URL}/manage/admin/connector/{connector.id}", @@ -70,8 +71,8 @@ def edit( @staticmethod def delete( - connector: TestConnector, - user_performing_action: TestUser | None = None, + connector: DATestConnector, + user_performing_action: DATestUser | None = None, ) -> None: response = requests.delete( url=f"{API_SERVER_URL}/manage/admin/connector/{connector.id}", @@ -83,8 +84,8 @@ def delete( @staticmethod def get_all( - user_performing_action: TestUser | None = None, - ) -> list[TestConnector]: + user_performing_action: DATestUser | None = None, + ) -> list[DATestConnector]: response = requests.get( url=f"{API_SERVER_URL}/manage/connector", headers=user_performing_action.headers @@ -93,7 +94,7 @@ def get_all( ) response.raise_for_status() return [ - TestConnector( + DATestConnector( id=conn.get("id"), name=conn.get("name", ""), source=conn.get("source", DocumentSource.FILE), @@ -105,8 +106,8 @@ def get_all( @staticmethod def get( - connector_id: int, user_performing_action: TestUser | None = None - ) -> TestConnector: + connector_id: int, user_performing_action: DATestUser | None = None + ) -> DATestConnector: response = requests.get( url=f"{API_SERVER_URL}/manage/connector/{connector_id}", headers=user_performing_action.headers @@ -115,7 +116,7 @@ def get( ) response.raise_for_status() conn = response.json() - return TestConnector( + return DATestConnector( id=conn.get("id"), name=conn.get("name", ""), source=conn.get("source", DocumentSource.FILE), diff --git a/backend/tests/integration/common_utils/managers/credential.py b/backend/tests/integration/common_utils/managers/credential.py index c05cd1b5a3e..8c8a59d4856 100644 --- a/backend/tests/integration/common_utils/managers/credential.py +++ b/backend/tests/integration/common_utils/managers/credential.py @@ -7,8 +7,8 @@ from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS -from 
tests.integration.common_utils.test_models import TestCredential -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestCredential +from tests.integration.common_utils.test_models import DATestUser class CredentialManager: @@ -20,8 +20,8 @@ def create( source: DocumentSource = DocumentSource.FILE, curator_public: bool = True, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestCredential: + user_performing_action: DATestUser | None = None, + ) -> DATestCredential: name = f"{name}-credential" if name else f"test-credential-{uuid4()}" credential_request = { @@ -32,6 +32,7 @@ def create( "curator_public": curator_public, "groups": groups or [], } + response = requests.post( url=f"{API_SERVER_URL}/manage/credential", json=credential_request, @@ -41,7 +42,7 @@ def create( ) response.raise_for_status() - return TestCredential( + return DATestCredential( id=response.json()["id"], name=name, credential_json=credential_json or {}, @@ -53,8 +54,8 @@ def create( @staticmethod def edit( - credential: TestCredential, - user_performing_action: TestUser | None = None, + credential: DATestCredential, + user_performing_action: DATestUser | None = None, ) -> None: request = credential.model_dump(include={"name", "credential_json"}) response = requests.put( @@ -68,8 +69,8 @@ def edit( @staticmethod def delete( - credential: TestCredential, - user_performing_action: TestUser | None = None, + credential: DATestCredential, + user_performing_action: DATestUser | None = None, ) -> None: response = requests.delete( url=f"{API_SERVER_URL}/manage/credential/{credential.id}", @@ -81,7 +82,7 @@ def delete( @staticmethod def get( - credential_id: int, user_performing_action: TestUser | None = None + credential_id: int, user_performing_action: DATestUser | None = None ) -> CredentialSnapshot: response = requests.get( url=f"{API_SERVER_URL}/manage/credential/{credential_id}", @@ -94,7 +95,7 @@ def get( @staticmethod def get_all( - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> list[CredentialSnapshot]: response = requests.get( f"{API_SERVER_URL}/manage/credential", @@ -107,9 +108,9 @@ def get_all( @staticmethod def verify( - credential: TestCredential, + credential: DATestCredential, verify_deleted: bool = False, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: all_credentials = CredentialManager.get_all(user_performing_action) for fetched_credential in all_credentials: diff --git a/backend/tests/integration/common_utils/managers/document.py b/backend/tests/integration/common_utils/managers/document.py index 3f691eca8f9..eecd75fa157 100644 --- a/backend/tests/integration/common_utils/managers/document.py +++ b/backend/tests/integration/common_utils/managers/document.py @@ -3,26 +3,27 @@ import requests from danswer.configs.constants import DocumentSource +from danswer.db.enums import AccessType from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS from tests.integration.common_utils.constants import NUM_DOCS -from tests.integration.common_utils.managers.api_key import TestAPIKey -from tests.integration.common_utils.managers.cc_pair import TestCCPair +from tests.integration.common_utils.managers.api_key import DATestAPIKey +from tests.integration.common_utils.managers.cc_pair import DATestCCPair +from 
tests.integration.common_utils.test_models import DATestUser from tests.integration.common_utils.test_models import SimpleTestDocument -from tests.integration.common_utils.test_models import TestUser -from tests.integration.common_utils.vespa import TestVespaClient +from tests.integration.common_utils.vespa import vespa_fixture def _verify_document_permissions( retrieved_doc: dict, - cc_pair: TestCCPair, + cc_pair: DATestCCPair, doc_set_names: list[str] | None = None, group_names: list[str] | None = None, - doc_creating_user: TestUser | None = None, + doc_creating_user: DATestUser | None = None, ) -> None: acl_keys = set(retrieved_doc["access_control_list"].keys()) print(f"ACL keys: {acl_keys}") - if cc_pair.is_public: + if cc_pair.access_type == AccessType.PUBLIC: if "PUBLIC" not in acl_keys: raise ValueError( f"Document {retrieved_doc['document_id']} is public but" @@ -30,10 +31,10 @@ def _verify_document_permissions( ) if doc_creating_user is not None: - if f"user_id:{doc_creating_user.id}" not in acl_keys: + if f"user_email:{doc_creating_user.email}" not in acl_keys: raise ValueError( f"Document {retrieved_doc['document_id']} was created by user" - f" {doc_creating_user.id} but does not have the user_id:{doc_creating_user.id} ACL key" + f" {doc_creating_user.email} but does not have the user_email:{doc_creating_user.email} ACL key" ) if group_names is not None: @@ -54,13 +55,18 @@ def _verify_document_permissions( ) -def _generate_dummy_document(document_id: str, cc_pair_id: int) -> dict: +def _generate_dummy_document( + document_id: str, + cc_pair_id: int, + content: str | None = None, +) -> dict: + text = content if content else f"This is test document {document_id}" return { "document": { "id": document_id, "sections": [ { - "text": f"This is test document {document_id}", + "text": text, "link": f"{document_id}", } ], @@ -76,12 +82,12 @@ def _generate_dummy_document(document_id: str, cc_pair_id: int) -> dict: class DocumentManager: @staticmethod - def seed_and_attach_docs( - cc_pair: TestCCPair, + def seed_dummy_docs( + cc_pair: DATestCCPair, num_docs: int = NUM_DOCS, document_ids: list[str] | None = None, - api_key: TestAPIKey | None = None, - ) -> TestCCPair: + api_key: DATestAPIKey | None = None, + ) -> list[SimpleTestDocument]: # Use provided document_ids if available, otherwise generate random UUIDs if document_ids is None: document_ids = [f"test-doc-{uuid4()}" for _ in range(num_docs)] @@ -100,24 +106,49 @@ def seed_and_attach_docs( response.raise_for_status() print("Seeding completed successfully.") - cc_pair.documents = [ + return [ SimpleTestDocument( id=document["document"]["id"], content=document["document"]["sections"][0]["text"], ) for document in documents ] - return cc_pair + + @staticmethod + def seed_doc_with_content( + cc_pair: DATestCCPair, + content: str, + document_id: str | None = None, + api_key: DATestAPIKey | None = None, + ) -> SimpleTestDocument: + # Use provided document_ids if available, otherwise generate random UUIDs + if document_id is None: + document_id = f"test-doc-{uuid4()}" + # Create and ingest some documents + document: dict = _generate_dummy_document(document_id, cc_pair.id, content) + response = requests.post( + f"{API_SERVER_URL}/danswer-api/ingestion", + json=document, + headers=api_key.headers if api_key else GENERAL_HEADERS, + ) + response.raise_for_status() + + print("Seeding completed successfully.") + + return SimpleTestDocument( + id=document["document"]["id"], + content=document["document"]["sections"][0]["text"], + ) @staticmethod 
def verify( - vespa_client: TestVespaClient, - cc_pair: TestCCPair, + vespa_client: vespa_fixture, + cc_pair: DATestCCPair, # If None, will not check doc sets or groups # If empty list, will check for empty doc sets or groups doc_set_names: list[str] | None = None, group_names: list[str] | None = None, - doc_creating_user: TestUser | None = None, + doc_creating_user: DATestUser | None = None, verify_deleted: bool = False, ) -> None: doc_ids = [document.id for document in cc_pair.documents] diff --git a/backend/tests/integration/common_utils/managers/document_search.py b/backend/tests/integration/common_utils/managers/document_search.py new file mode 100644 index 00000000000..82fdaf27db1 --- /dev/null +++ b/backend/tests/integration/common_utils/managers/document_search.py @@ -0,0 +1,39 @@ +import requests + +from danswer.context.search.enums import LLMEvaluationType +from danswer.context.search.enums import SearchType +from danswer.context.search.models import RetrievalDetails +from danswer.context.search.models import SavedSearchDocWithContent +from ee.danswer.server.query_and_chat.models import DocumentSearchRequest +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestUser + + +class DocumentSearchManager: + @staticmethod + def search_documents( + query: str, + search_type: SearchType = SearchType.KEYWORD, + user_performing_action: DATestUser | None = None, + ) -> list[str]: + search_request = DocumentSearchRequest( + message=query, + search_type=search_type, + retrieval_options=RetrievalDetails(), + evaluation_type=LLMEvaluationType.SKIP, + ) + result = requests.post( + url=f"{API_SERVER_URL}/query/document-search", + json=search_request.model_dump(), + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + result.raise_for_status() + result_json = result.json() + top_documents: list[SavedSearchDocWithContent] = [ + SavedSearchDocWithContent(**doc) for doc in result_json["top_documents"] + ] + document_content_list: list[str] = [doc.content for doc in top_documents] + return document_content_list diff --git a/backend/tests/integration/common_utils/managers/document_set.py b/backend/tests/integration/common_utils/managers/document_set.py index 8133ccc8712..7670f42fa3c 100644 --- a/backend/tests/integration/common_utils/managers/document_set.py +++ b/backend/tests/integration/common_utils/managers/document_set.py @@ -6,8 +6,8 @@ from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS from tests.integration.common_utils.constants import MAX_DELAY -from tests.integration.common_utils.test_models import TestDocumentSet -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestDocumentSet +from tests.integration.common_utils.test_models import DATestUser class DocumentSetManager: @@ -19,8 +19,8 @@ def create( is_public: bool = True, users: list[str] | None = None, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestDocumentSet: + user_performing_action: DATestUser | None = None, + ) -> DATestDocumentSet: if name is None: name = f"test_doc_set_{str(uuid4())}" @@ -42,7 +42,7 @@ def create( ) response.raise_for_status() - return TestDocumentSet( + return DATestDocumentSet( id=int(response.json()), name=name, 
description=description or name, @@ -55,8 +55,8 @@ def create( @staticmethod def edit( - document_set: TestDocumentSet, - user_performing_action: TestUser | None = None, + document_set: DATestDocumentSet, + user_performing_action: DATestUser | None = None, ) -> bool: doc_set_update_request = { "id": document_set.id, @@ -78,8 +78,8 @@ def edit( @staticmethod def delete( - document_set: TestDocumentSet, - user_performing_action: TestUser | None = None, + document_set: DATestDocumentSet, + user_performing_action: DATestUser | None = None, ) -> bool: response = requests.delete( f"{API_SERVER_URL}/manage/admin/document-set/{document_set.id}", @@ -92,8 +92,8 @@ def delete( @staticmethod def get_all( - user_performing_action: TestUser | None = None, - ) -> list[TestDocumentSet]: + user_performing_action: DATestUser | None = None, + ) -> list[DATestDocumentSet]: response = requests.get( f"{API_SERVER_URL}/manage/document-set", headers=user_performing_action.headers @@ -102,7 +102,7 @@ def get_all( ) response.raise_for_status() return [ - TestDocumentSet( + DATestDocumentSet( id=doc_set["id"], name=doc_set["name"], description=doc_set["description"], @@ -119,8 +119,8 @@ def get_all( @staticmethod def wait_for_sync( - document_sets_to_check: list[TestDocumentSet] | None = None, - user_performing_action: TestUser | None = None, + document_sets_to_check: list[DATestDocumentSet] | None = None, + user_performing_action: DATestUser | None = None, ) -> None: # wait for document sets to be synced start = time.time() @@ -135,6 +135,7 @@ def wait_for_sync( all_up_to_date = all(doc_set.is_up_to_date for doc_set in doc_sets) if all_up_to_date: + print("Document sets synced successfully.") break if time.time() - start > MAX_DELAY: @@ -148,9 +149,9 @@ def wait_for_sync( @staticmethod def verify( - document_set: TestDocumentSet, + document_set: DATestDocumentSet, verify_deleted: bool = False, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: doc_sets = DocumentSetManager.get_all(user_performing_action) for doc_set in doc_sets: diff --git a/backend/tests/integration/common_utils/llm.py b/backend/tests/integration/common_utils/managers/llm_provider.py similarity index 56% rename from backend/tests/integration/common_utils/llm.py rename to backend/tests/integration/common_utils/managers/llm_provider.py index f74b40073c9..6ac4693496d 100644 --- a/backend/tests/integration/common_utils/llm.py +++ b/backend/tests/integration/common_utils/managers/llm_provider.py @@ -3,11 +3,12 @@ import requests +from danswer.server.manage.llm.models import FullLLMProvider from danswer.server.manage.llm.models import LLMProviderUpsertRequest from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS -from tests.integration.common_utils.test_models import TestLLMProvider -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestLLMProvider +from tests.integration.common_utils.test_models import DATestUser class LLMProviderManager: @@ -21,8 +22,8 @@ def create( api_version: str | None = None, groups: list[int] | None = None, is_public: bool | None = None, - user_performing_action: TestUser | None = None, - ) -> TestLLMProvider: + user_performing_action: DATestUser | None = None, + ) -> DATestLLMProvider: print("Seeding LLM Providers...") llm_provider = LLMProviderUpsertRequest( @@ -49,7 +50,8 @@ def create( ) 
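For orientation, a minimal sketch of how a test could drive these LLM provider helpers end to end, using only signatures visible in this patch (the test name and use of the reset fixture are illustrative, not part of the patch):

    from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
    from tests.integration.common_utils.managers.user import UserManager

    def test_llm_provider_crud(reset: None) -> None:  # hypothetical test name
        # the first user created is automatically an admin
        admin_user = UserManager.create(name="admin_user")
        # create a provider, confirm it is visible, then delete it and confirm it is gone
        provider = LLMProviderManager.create(user_performing_action=admin_user)
        LLMProviderManager.verify(provider, user_performing_action=admin_user)
        LLMProviderManager.delete(provider, user_performing_action=admin_user)
        LLMProviderManager.verify(
            provider, verify_deleted=True, user_performing_action=admin_user
        )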
         llm_response.raise_for_status()
         response_data = llm_response.json()
-        result_llm = TestLLMProvider(
+
+        result_llm = DATestLLMProvider(
             id=response_data["id"],
             name=response_data["name"],
             provider=response_data["provider"],
@@ -73,11 +75,9 @@ def create(
     @staticmethod
     def delete(
-        llm_provider: TestLLMProvider,
-        user_performing_action: TestUser | None = None,
+        llm_provider: DATestLLMProvider,
+        user_performing_action: DATestUser | None = None,
     ) -> bool:
-        if not llm_provider.id:
-            raise ValueError("LLM Provider ID is required to delete a provider")
         response = requests.delete(
             f"{API_SERVER_URL}/admin/llm/provider/{llm_provider.id}",
             headers=user_performing_action.headers
@@ -86,3 +86,43 @@ def delete(
         )
         response.raise_for_status()
         return True
+
+    @staticmethod
+    def get_all(
+        user_performing_action: DATestUser | None = None,
+    ) -> list[FullLLMProvider]:
+        response = requests.get(
+            f"{API_SERVER_URL}/admin/llm/provider",
+            headers=user_performing_action.headers
+            if user_performing_action
+            else GENERAL_HEADERS,
+        )
+        response.raise_for_status()
+        return [FullLLMProvider(**ug) for ug in response.json()]
+
+    @staticmethod
+    def verify(
+        llm_provider: DATestLLMProvider,
+        verify_deleted: bool = False,
+        user_performing_action: DATestUser | None = None,
+    ) -> None:
+        all_llm_providers = LLMProviderManager.get_all(user_performing_action)
+        for fetched_llm_provider in all_llm_providers:
+            if llm_provider.id == fetched_llm_provider.id:
+                if verify_deleted:
+                    raise ValueError(
+                        f"LLM provider {llm_provider.id} found but should be deleted"
+                    )
+                fetched_llm_groups = set(fetched_llm_provider.groups)
+                llm_provider_groups = set(llm_provider.groups)
+                if (
+                    fetched_llm_groups == llm_provider_groups
+                    and llm_provider.provider == fetched_llm_provider.provider
+                    and llm_provider.api_key == fetched_llm_provider.api_key
+                    and llm_provider.default_model_name
+                    == fetched_llm_provider.default_model_name
+                    and llm_provider.is_public == fetched_llm_provider.is_public
+                ):
+                    return
+        if not verify_deleted:
+            raise ValueError(f"LLM provider {llm_provider.id} not found")
diff --git a/backend/tests/integration/common_utils/managers/persona.py b/backend/tests/integration/common_utils/managers/persona.py
index 41ff43edb6f..de2d9db25c1 100644
--- a/backend/tests/integration/common_utils/managers/persona.py
+++ b/backend/tests/integration/common_utils/managers/persona.py
@@ -2,12 +2,13 @@
 import requests
-from danswer.search.enums import RecencyBiasSetting
+from danswer.context.search.enums import RecencyBiasSetting
 from danswer.server.features.persona.models import PersonaSnapshot
 from tests.integration.common_utils.constants import API_SERVER_URL
 from tests.integration.common_utils.constants import GENERAL_HEADERS
-from tests.integration.common_utils.test_models import TestPersona
-from tests.integration.common_utils.test_models import TestUser
+from tests.integration.common_utils.test_models import DATestPersona
+from tests.integration.common_utils.test_models import DATestPersonaCategory
+from tests.integration.common_utils.test_models import DATestUser
 class PersonaManager:
     @staticmethod
     def create(
@@ -27,8 +28,9 @@ def create(
         llm_model_version_override: str | None = None,
         users: list[str] | None = None,
         groups: list[int] | None = None,
-        user_performing_action: TestUser | None = None,
-    ) -> TestPersona:
+        category_id: int | None = None,
+        user_performing_action: DATestUser | None = None,
+    ) -> DATestPersona:
         name = name or f"test-persona-{uuid4()}"
         description = description or f"Description for {name}"
@@ -59,7 +61,7 @@ def
create( response.raise_for_status() persona_data = response.json() - return TestPersona( + return DATestPersona( id=persona_data["id"], name=name, description=description, @@ -79,7 +81,7 @@ def create( @staticmethod def edit( - persona: TestPersona, + persona: DATestPersona, name: str | None = None, description: str | None = None, num_chunks: float | None = None, @@ -94,8 +96,8 @@ def edit( llm_model_version_override: str | None = None, users: list[str] | None = None, groups: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestPersona: + user_performing_action: DATestUser | None = None, + ) -> DATestPersona: persona_update_request = { "name": name or persona.name, "description": description or persona.description, @@ -127,7 +129,7 @@ def edit( response.raise_for_status() updated_persona_data = response.json() - return TestPersona( + return DATestPersona( id=updated_persona_data["id"], name=updated_persona_data["name"], description=updated_persona_data["description"], @@ -151,7 +153,7 @@ def edit( @staticmethod def get_all( - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> list[PersonaSnapshot]: response = requests.get( f"{API_SERVER_URL}/admin/persona", @@ -164,38 +166,46 @@ def get_all( @staticmethod def verify( - test_persona: TestPersona, - user_performing_action: TestUser | None = None, + persona: DATestPersona, + user_performing_action: DATestUser | None = None, ) -> bool: all_personas = PersonaManager.get_all(user_performing_action) - for persona in all_personas: - if persona.id == test_persona.id: + for fetched_persona in all_personas: + if fetched_persona.id == persona.id: return ( - persona.name == test_persona.name - and persona.description == test_persona.description - and persona.num_chunks == test_persona.num_chunks - and persona.llm_relevance_filter - == test_persona.llm_relevance_filter - and persona.is_public == test_persona.is_public - and persona.llm_filter_extraction - == test_persona.llm_filter_extraction - and persona.llm_model_provider_override - == test_persona.llm_model_provider_override - and persona.llm_model_version_override - == test_persona.llm_model_version_override - and set(persona.prompts) == set(test_persona.prompt_ids) - and set(persona.document_sets) == set(test_persona.document_set_ids) - and set(persona.tools) == set(test_persona.tool_ids) - and set(user.email for user in persona.users) - == set(test_persona.users) - and set(persona.groups) == set(test_persona.groups) + fetched_persona.name == persona.name + and fetched_persona.description == persona.description + and fetched_persona.num_chunks == persona.num_chunks + and fetched_persona.llm_relevance_filter + == persona.llm_relevance_filter + and fetched_persona.is_public == persona.is_public + and fetched_persona.llm_filter_extraction + == persona.llm_filter_extraction + and fetched_persona.llm_model_provider_override + == persona.llm_model_provider_override + and fetched_persona.llm_model_version_override + == persona.llm_model_version_override + and set([prompt.id for prompt in fetched_persona.prompts]) + == set(persona.prompt_ids) + and set( + [ + document_set.id + for document_set in fetched_persona.document_sets + ] + ) + == set(persona.document_set_ids) + and set([tool.id for tool in fetched_persona.tools]) + == set(persona.tool_ids) + and set(user.email for user in fetched_persona.users) + == set(persona.users) + and set(fetched_persona.groups) == set(persona.groups) ) return False @staticmethod def 
delete( - persona: TestPersona, - user_performing_action: TestUser | None = None, + persona: DATestPersona, + user_performing_action: DATestUser | None = None, ) -> bool: response = requests.delete( f"{API_SERVER_URL}/persona/{persona.id}", @@ -204,3 +214,83 @@ def delete( else GENERAL_HEADERS, ) return response.ok + + +class PersonaCategoryManager: + @staticmethod + def create( + category: DATestPersonaCategory, + user_performing_action: DATestUser | None = None, + ) -> DATestPersonaCategory: + response = requests.post( + f"{API_SERVER_URL}/admin/persona/categories", + json={ + "name": category.name, + "description": category.description, + }, + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + response_data = response.json() + category.id = response_data["id"] + return category + + @staticmethod + def get_all( + user_performing_action: DATestUser | None = None, + ) -> list[DATestPersonaCategory]: + response = requests.get( + f"{API_SERVER_URL}/persona/categories", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + return [DATestPersonaCategory(**category) for category in response.json()] + + @staticmethod + def update( + category: DATestPersonaCategory, + user_performing_action: DATestUser | None = None, + ) -> DATestPersonaCategory: + response = requests.patch( + f"{API_SERVER_URL}/admin/persona/category/{category.id}", + json={ + "category_name": category.name, + "category_description": category.description, + }, + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + return category + + @staticmethod + def delete( + category: DATestPersonaCategory, + user_performing_action: DATestUser | None = None, + ) -> bool: + response = requests.delete( + f"{API_SERVER_URL}/admin/persona/category/{category.id}", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + return response.ok + + @staticmethod + def verify( + category: DATestPersonaCategory, + user_performing_action: DATestUser | None = None, + ) -> bool: + all_categories = PersonaCategoryManager.get_all(user_performing_action) + for fetched_category in all_categories: + if fetched_category.id == category.id: + return ( + fetched_category.name == category.name + and fetched_category.description == category.description + ) + return False diff --git a/backend/tests/integration/common_utils/managers/tenant.py b/backend/tests/integration/common_utils/managers/tenant.py new file mode 100644 index 00000000000..fc411018df7 --- /dev/null +++ b/backend/tests/integration/common_utils/managers/tenant.py @@ -0,0 +1,84 @@ +from datetime import datetime +from datetime import timedelta + +import jwt +import requests + +from danswer.server.manage.models import AllUsersResponse +from danswer.server.models import FullUserSnapshot +from danswer.server.models import InvitedUserSnapshot +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestUser + + +def generate_auth_token() -> str: + payload = { + "iss": "control_plane", + "exp": datetime.utcnow() + timedelta(minutes=5), + "iat": datetime.utcnow(), + "scope": "tenant:create", + } + token = jwt.encode(payload, "", algorithm="HS256") + return token + + +class TenantManager: + @staticmethod + def 
create( + tenant_id: str | None = None, + initial_admin_email: str | None = None, + referral_source: str | None = None, + ) -> dict[str, str]: + body = { + "tenant_id": tenant_id, + "initial_admin_email": initial_admin_email, + "referral_source": referral_source, + } + + token = generate_auth_token() + headers = { + "Authorization": f"Bearer {token}", + "X-API-KEY": "", + "Content-Type": "application/json", + } + + response = requests.post( + url=f"{API_SERVER_URL}/tenants/create", + json=body, + headers=headers, + ) + + response.raise_for_status() + + return response.json() + + @staticmethod + def get_all_users( + user_performing_action: DATestUser | None = None, + ) -> AllUsersResponse: + response = requests.get( + url=f"{API_SERVER_URL}/manage/users", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + + data = response.json() + return AllUsersResponse( + accepted=[FullUserSnapshot(**user) for user in data["accepted"]], + invited=[InvitedUserSnapshot(**user) for user in data["invited"]], + accepted_pages=data["accepted_pages"], + invited_pages=data["invited_pages"], + ) + + @staticmethod + def verify_user_in_tenant( + user: DATestUser, user_performing_action: DATestUser | None = None + ) -> None: + all_users = TenantManager.get_all_users(user_performing_action) + for accepted_user in all_users.accepted: + if accepted_user.email == user.email and accepted_user.id == user.id: + return + raise ValueError(f"User {user.email} not found in tenant") diff --git a/backend/tests/integration/common_utils/managers/user.py b/backend/tests/integration/common_utils/managers/user.py index 0946b8b1fca..43286c6a716 100644 --- a/backend/tests/integration/common_utils/managers/user.py +++ b/backend/tests/integration/common_utils/managers/user.py @@ -10,19 +10,30 @@ from danswer.server.models import InvitedUserSnapshot from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestUser + + +DOMAIN = "test.com" +DEFAULT_PASSWORD = "test" + + +def build_email(name: str) -> str: + return f"{name}@test.com" class UserManager: @staticmethod def create( name: str | None = None, - ) -> TestUser: + email: str | None = None, + ) -> DATestUser: if name is None: name = f"test{str(uuid4())}" - email = f"{name}@test.com" - password = "test" + if email is None: + email = build_email(name) + + password = DEFAULT_PASSWORD body = { "email": email, @@ -36,7 +47,7 @@ def create( ) response.raise_for_status() - test_user = TestUser( + test_user = DATestUser( id=response.json()["id"], email=email, password=password, @@ -44,12 +55,10 @@ def create( ) print(f"Created user {test_user.email}") - test_user.headers["Cookie"] = UserManager.login_as_user(test_user) - - return test_user + return UserManager.login_as_user(test_user) @staticmethod - def login_as_user(test_user: TestUser) -> str: + def login_as_user(test_user: DATestUser) -> DATestUser: data = urlencode( { "username": test_user.email, @@ -64,17 +73,23 @@ def login_as_user(test_user: TestUser) -> str: data=data, headers=headers, ) + response.raise_for_status() - result_cookie = next(iter(response.cookies), None) - if not result_cookie: + cookies = response.cookies.get_dict() + session_cookie = cookies.get("fastapiusersauth") + + if not session_cookie: raise Exception("Failed to login") print(f"Logged 
in as {test_user.email}") - return f"{result_cookie.name}={result_cookie.value}" + + # Set cookies in the headers + test_user.headers["Cookie"] = f"fastapiusersauth={session_cookie}; " + return test_user @staticmethod - def verify_role(user_to_verify: TestUser, target_role: UserRole) -> bool: + def verify_role(user_to_verify: DATestUser, target_role: UserRole) -> bool: response = requests.get( url=f"{API_SERVER_URL}/me", headers=user_to_verify.headers, @@ -84,9 +99,9 @@ def verify_role(user_to_verify: TestUser, target_role: UserRole) -> bool: @staticmethod def set_role( - user_to_set: TestUser, + user_to_set: DATestUser, target_role: UserRole, - user_to_perform_action: TestUser | None = None, + user_to_perform_action: DATestUser | None = None, ) -> None: if user_to_perform_action is None: user_to_perform_action = user_to_set @@ -98,7 +113,9 @@ def set_role( response.raise_for_status() @staticmethod - def verify(user: TestUser, user_to_perform_action: TestUser | None = None) -> None: + def verify( + user: DATestUser, user_to_perform_action: DATestUser | None = None + ) -> None: if user_to_perform_action is None: user_to_perform_action = user response = requests.get( diff --git a/backend/tests/integration/common_utils/managers/user_group.py b/backend/tests/integration/common_utils/managers/user_group.py index 5f5ac6b0e30..e8a26fa34a7 100644 --- a/backend/tests/integration/common_utils/managers/user_group.py +++ b/backend/tests/integration/common_utils/managers/user_group.py @@ -7,8 +7,8 @@ from tests.integration.common_utils.constants import API_SERVER_URL from tests.integration.common_utils.constants import GENERAL_HEADERS from tests.integration.common_utils.constants import MAX_DELAY -from tests.integration.common_utils.test_models import TestUser -from tests.integration.common_utils.test_models import TestUserGroup +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.test_models import DATestUserGroup class UserGroupManager: @@ -17,8 +17,8 @@ def create( name: str | None = None, user_ids: list[str] | None = None, cc_pair_ids: list[int] | None = None, - user_performing_action: TestUser | None = None, - ) -> TestUserGroup: + user_performing_action: DATestUser | None = None, + ) -> DATestUserGroup: name = f"{name}-user-group" if name else f"test-user-group-{uuid4()}" request = { @@ -34,7 +34,7 @@ def create( else GENERAL_HEADERS, ) response.raise_for_status() - test_user_group = TestUserGroup( + test_user_group = DATestUserGroup( id=response.json()["id"], name=response.json()["name"], user_ids=[user["id"] for user in response.json()["users"]], @@ -44,11 +44,9 @@ def create( @staticmethod def edit( - user_group: TestUserGroup, - user_performing_action: TestUser | None = None, + user_group: DATestUserGroup, + user_performing_action: DATestUser | None = None, ) -> None: - if not user_group.id: - raise ValueError("User group has no ID") response = requests.patch( f"{API_SERVER_URL}/manage/admin/user-group/{user_group.id}", json=user_group.model_dump(), @@ -58,15 +56,26 @@ def edit( ) response.raise_for_status() + @staticmethod + def delete( + user_group: DATestUserGroup, + user_performing_action: DATestUser | None = None, + ) -> None: + response = requests.delete( + f"{API_SERVER_URL}/manage/admin/user-group/{user_group.id}", + headers=user_performing_action.headers + if user_performing_action + else GENERAL_HEADERS, + ) + response.raise_for_status() + @staticmethod def set_curator_status( - test_user_group: TestUserGroup, - 
user_to_set_as_curator: TestUser, + test_user_group: DATestUserGroup, + user_to_set_as_curator: DATestUser, is_curator: bool = True, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: - if not user_to_set_as_curator.id: - raise ValueError("User has no ID") set_curator_request = { "user_id": user_to_set_as_curator.id, "is_curator": is_curator, @@ -82,7 +91,7 @@ def set_curator_status( @staticmethod def get_all( - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> list[UserGroup]: response = requests.get( f"{API_SERVER_URL}/manage/admin/user-group", @@ -95,9 +104,9 @@ def get_all( @staticmethod def verify( - user_group: TestUserGroup, + user_group: DATestUserGroup, verify_deleted: bool = False, - user_performing_action: TestUser | None = None, + user_performing_action: DATestUser | None = None, ) -> None: all_user_groups = UserGroupManager.get_all(user_performing_action) for fetched_user_group in all_user_groups: @@ -120,8 +129,8 @@ def verify( @staticmethod def wait_for_sync( - user_groups_to_check: list[TestUserGroup] | None = None, - user_performing_action: TestUser | None = None, + user_groups_to_check: list[DATestUserGroup] | None = None, + user_performing_action: DATestUser | None = None, ) -> None: start = time.time() while True: @@ -130,13 +139,14 @@ def wait_for_sync( check_ids = {user_group.id for user_group in user_groups_to_check} user_group_ids = {user_group.id for user_group in user_groups} if not check_ids.issubset(user_group_ids): - raise RuntimeError("Document set not found") + raise RuntimeError("User group not found") user_groups = [ user_group for user_group in user_groups if user_group.id in check_ids ] if all(ug.is_up_to_date for ug in user_groups): + print("User groups synced successfully.") return if time.time() - start > MAX_DELAY: @@ -146,3 +156,26 @@ def wait_for_sync( else: print("User groups were not synced yet, waiting...") time.sleep(2) + + @staticmethod + def wait_for_deletion_completion( + user_groups_to_check: list[DATestUserGroup], + user_performing_action: DATestUser | None = None, + ) -> None: + start = time.time() + user_group_ids_to_check = {user_group.id for user_group in user_groups_to_check} + while True: + fetched_user_groups = UserGroupManager.get_all(user_performing_action) + fetched_user_group_ids = { + user_group.id for user_group in fetched_user_groups + } + if not user_group_ids_to_check.intersection(fetched_user_group_ids): + return + + if time.time() - start > MAX_DELAY: + raise TimeoutError( + f"User groups deletion was not completed within the {MAX_DELAY} seconds" + ) + else: + print("Some user groups are still being deleted, waiting...") + time.sleep(2) diff --git a/backend/tests/integration/common_utils/reset.py b/backend/tests/integration/common_utils/reset.py index a13ec184b45..1792af9dbf9 100644 --- a/backend/tests/integration/common_utils/reset.py +++ b/backend/tests/integration/common_utils/reset.py @@ -1,5 +1,6 @@ import logging import time +from types import SimpleNamespace import psycopg2 import requests @@ -11,19 +12,28 @@ from danswer.configs.app_configs import POSTGRES_PORT from danswer.configs.app_configs import POSTGRES_USER from danswer.db.engine import build_connection_string +from danswer.db.engine import get_all_tenant_ids from danswer.db.engine import get_session_context_manager +from danswer.db.engine import get_session_with_tenant from danswer.db.engine import SYNC_DB_API from danswer.db.search_settings 
import get_current_search_settings from danswer.db.swap_index import check_index_swap from danswer.document_index.vespa.index import DOCUMENT_ID_ENDPOINT from danswer.document_index.vespa.index import VespaIndex from danswer.indexing.models import IndexingSetting -from danswer.main import setup_postgres -from danswer.main import setup_vespa +from danswer.setup import setup_postgres +from danswer.setup import setup_vespa +from danswer.utils.logger import setup_logger + +logger = setup_logger() def _run_migrations( - database_url: str, direction: str = "upgrade", revision: str = "head" + database_url: str, + config_name: str, + direction: str = "upgrade", + revision: str = "head", + schema: str = "public", ) -> None: # hide info logs emitted during migration logging.getLogger("alembic").setLevel(logging.CRITICAL) @@ -32,6 +42,10 @@ def _run_migrations( alembic_cfg = Config("alembic.ini") alembic_cfg.set_section_option("logger_alembic", "level", "WARN") alembic_cfg.attributes["configure_logger"] = False + alembic_cfg.config_ini_section = config_name + + alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore + alembic_cfg.cmd_opts.x = [f"schema={schema}"] # type: ignore # Set the SQLAlchemy URL in the Alembic configuration alembic_cfg.set_main_option("sqlalchemy.url", database_url) @@ -49,7 +63,9 @@ def _run_migrations( logging.getLogger("alembic").setLevel(logging.INFO) -def reset_postgres(database: str = "postgres") -> None: +def reset_postgres( + database: str = "postgres", config_name: str = "alembic", setup_danswer: bool = True +) -> None: """Reset the Postgres database.""" # NOTE: need to delete all rows to allow migrations to be rolled back @@ -108,14 +124,18 @@ def reset_postgres(database: str = "postgres") -> None: ) _run_migrations( conn_str, + config_name, direction="downgrade", revision="base", ) _run_migrations( conn_str, + config_name, direction="upgrade", revision="head", ) + if not setup_danswer: + return # do the same thing as we do on API server startup with get_session_context_manager() as db_session: @@ -124,6 +144,7 @@ def reset_postgres(database: str = "postgres") -> None: def reset_vespa() -> None: """Wipe all data from the Vespa index.""" + with get_session_context_manager() as db_session: # swap to the correct default model check_index_swap(db_session) @@ -163,10 +184,98 @@ def reset_vespa() -> None: time.sleep(5) +def reset_postgres_multitenant() -> None: + """Reset the Postgres database for all tenants in a multitenant setup.""" + + conn = psycopg2.connect( + dbname="postgres", + user=POSTGRES_USER, + password=POSTGRES_PASSWORD, + host=POSTGRES_HOST, + port=POSTGRES_PORT, + ) + conn.autocommit = True + cur = conn.cursor() + + # Get all tenant schemas + cur.execute( + """ + SELECT schema_name + FROM information_schema.schemata + WHERE schema_name LIKE 'tenant_%' + """ + ) + tenant_schemas = cur.fetchall() + + # Drop all tenant schemas + for schema in tenant_schemas: + schema_name = schema[0] + cur.execute(f'DROP SCHEMA "{schema_name}" CASCADE') + + cur.close() + conn.close() + + reset_postgres(config_name="schema_private", setup_danswer=False) + + +def reset_vespa_multitenant() -> None: + """Wipe all data from the Vespa index for all tenants.""" + + for tenant_id in get_all_tenant_ids(): + with get_session_with_tenant(tenant_id=tenant_id) as db_session: + # swap to the correct default model for each tenant + check_index_swap(db_session) + + search_settings = get_current_search_settings(db_session) + index_name = search_settings.index_name + + success = setup_vespa( + 
document_index=VespaIndex(index_name=index_name, secondary_index_name=None), + index_setting=IndexingSetting.from_db_model(search_settings), + secondary_index_setting=None, + ) + + if not success: + raise RuntimeError( + f"Could not connect to Vespa for tenant {tenant_id} within the specified timeout." + ) + + for _ in range(5): + try: + continuation = None + should_continue = True + while should_continue: + params = {"selection": "true", "cluster": "danswer_index"} + if continuation: + params = {**params, "continuation": continuation} + response = requests.delete( + DOCUMENT_ID_ENDPOINT.format(index_name=index_name), + params=params, + ) + response.raise_for_status() + + response_json = response.json() + + continuation = response_json.get("continuation") + should_continue = bool(continuation) + + break + except Exception as e: + print(f"Error deleting documents for tenant {tenant_id}: {e}") + time.sleep(5) + + def reset_all() -> None: - """Reset both Postgres and Vespa.""" - print("Resetting Postgres...") + logger.info("Resetting Postgres...") reset_postgres() - print("Resetting Vespa...") + logger.info("Resetting Vespa...") reset_vespa() - print("Finished resetting all.") + + +def reset_all_multitenant() -> None: + """Reset both Postgres and Vespa for all tenants.""" + logger.info("Resetting Postgres for all tenants...") + reset_postgres_multitenant() + logger.info("Resetting Vespa for all tenants...") + reset_vespa_multitenant() + logger.info("Finished resetting all.") diff --git a/backend/tests/integration/common_utils/test_models.py b/backend/tests/integration/common_utils/test_models.py index 2d8744327df..65a90259d8b 100644 --- a/backend/tests/integration/common_utils/test_models.py +++ b/backend/tests/integration/common_utils/test_models.py @@ -5,7 +5,8 @@ from pydantic import Field from danswer.auth.schemas import UserRole -from danswer.search.enums import RecencyBiasSetting +from danswer.context.search.enums import RecencyBiasSetting +from danswer.db.enums import AccessType from danswer.server.documents.models import DocumentSource from danswer.server.documents.models import InputType @@ -19,7 +20,7 @@ """ -class TestAPIKey(BaseModel): +class DATestAPIKey(BaseModel): api_key_id: int api_key_display: str api_key: str | None = None # only present on initial creation @@ -30,14 +31,20 @@ class TestAPIKey(BaseModel): headers: dict -class TestUser(BaseModel): +class DATestUser(BaseModel): id: str email: str password: str headers: dict -class TestCredential(BaseModel): +class DATestPersonaCategory(BaseModel): + id: int | None = None + name: str + description: str | None + + +class DATestCredential(BaseModel): id: int name: str credential_json: dict[str, Any] @@ -47,14 +54,14 @@ class TestCredential(BaseModel): groups: list[int] -class TestConnector(BaseModel): +class DATestConnector(BaseModel): id: int name: str source: DocumentSource input_type: InputType connector_specific_config: dict[str, Any] groups: list[int] | None = None - is_public: bool | None = None + access_type: AccessType | None = None class SimpleTestDocument(BaseModel): @@ -62,36 +69,36 @@ class SimpleTestDocument(BaseModel): content: str -class TestCCPair(BaseModel): +class DATestCCPair(BaseModel): id: int name: str connector_id: int credential_id: int - is_public: bool + access_type: AccessType groups: list[int] documents: list[SimpleTestDocument] = Field(default_factory=list) -class TestUserGroup(BaseModel): +class DATestUserGroup(BaseModel): id: int name: str user_ids: list[str] cc_pair_ids: list[int] -class 
TestLLMProvider(BaseModel): +class DATestLLMProvider(BaseModel): id: int name: str provider: str api_key: str default_model_name: str is_public: bool - groups: list[TestUserGroup] + groups: list[int] api_base: str | None = None api_version: str | None = None -class TestDocumentSet(BaseModel): +class DATestDocumentSet(BaseModel): id: int name: str description: str @@ -102,7 +109,7 @@ class TestDocumentSet(BaseModel): groups: list[int] = Field(default_factory=list) -class TestPersona(BaseModel): +class DATestPersona(BaseModel): id: int name: str description: str @@ -118,21 +125,21 @@ class TestPersona(BaseModel): llm_model_version_override: str | None users: list[str] groups: list[int] + category_id: int | None = None # -class TestChatSession(BaseModel): - id: int +class DATestChatSession(BaseModel): + id: UUID persona_id: int description: str -class TestChatMessage(BaseModel): - id: str | None = None - chat_session_id: int - parent_message_id: str | None +class DATestChatMessage(BaseModel): + id: int + chat_session_id: UUID + parent_message_id: int | None message: str - response: str class StreamedResponse(BaseModel): diff --git a/backend/tests/integration/common_utils/vespa.py b/backend/tests/integration/common_utils/vespa.py index aff7ef5eca6..1bd0060d89b 100644 --- a/backend/tests/integration/common_utils/vespa.py +++ b/backend/tests/integration/common_utils/vespa.py @@ -3,7 +3,7 @@ from danswer.document_index.vespa.index import DOCUMENT_ID_ENDPOINT -class TestVespaClient: +class vespa_fixture: def __init__(self, index_name: str): self.index_name = index_name self.vespa_document_url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name) diff --git a/backend/tests/integration/conftest.py b/backend/tests/integration/conftest.py index 314b78ad36f..91e61966643 100644 --- a/backend/tests/integration/conftest.py +++ b/backend/tests/integration/conftest.py @@ -6,8 +6,11 @@ from danswer.db.engine import get_session_context_manager from danswer.db.search_settings import get_current_search_settings +from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.reset import reset_all -from tests.integration.common_utils.vespa import TestVespaClient +from tests.integration.common_utils.reset import reset_all_multitenant +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture def load_env_vars(env_file: str = ".env") -> None: @@ -36,11 +39,24 @@ def db_session() -> Generator[Session, None, None]: @pytest.fixture -def vespa_client(db_session: Session) -> TestVespaClient: +def vespa_client(db_session: Session) -> vespa_fixture: search_settings = get_current_search_settings(db_session) - return TestVespaClient(index_name=search_settings.index_name) + return vespa_fixture(index_name=search_settings.index_name) @pytest.fixture def reset() -> None: reset_all() + + +@pytest.fixture +def new_admin_user(reset: None) -> DATestUser | None: + try: + return UserManager.create(name="admin_user") + except Exception: + return None + + +@pytest.fixture +def reset_multitenant() -> None: + reset_all_multitenant() diff --git a/backend/tests/integration/connector_job_tests/slack/conftest.py b/backend/tests/integration/connector_job_tests/slack/conftest.py new file mode 100644 index 00000000000..38b851de809 --- /dev/null +++ b/backend/tests/integration/connector_job_tests/slack/conftest.py @@ -0,0 +1,32 @@ +import os +from collections.abc import Generator +from typing import Any + +import pytest + +from 
tests.integration.connector_job_tests.slack.slack_api_utils import SlackManager + +# from tests.load_env_vars import load_env_vars + +# load_env_vars() + + +@pytest.fixture() +def slack_test_setup() -> Generator[tuple[dict[str, Any], dict[str, Any]], None, None]: + slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"]) + admin_user_id = SlackManager.build_slack_user_email_id_map(slack_client)[ + "admin@onyx-test.com" + ] + + ( + public_channel, + private_channel, + run_id, + ) = SlackManager.get_and_provision_available_slack_channels( + slack_client=slack_client, admin_user_id=admin_user_id + ) + + yield public_channel, private_channel + + # This part will always run after the test, even if it fails + SlackManager.cleanup_after_test(slack_client=slack_client, test_id=run_id) diff --git a/backend/tests/integration/connector_job_tests/slack/slack_api_utils.py b/backend/tests/integration/connector_job_tests/slack/slack_api_utils.py new file mode 100644 index 00000000000..f17c4211066 --- /dev/null +++ b/backend/tests/integration/connector_job_tests/slack/slack_api_utils.py @@ -0,0 +1,291 @@ +""" +Assumptions: +- The test users have already been created +- General is empty of messages +- In addition to the normal slack oauth permissions, the following scopes are needed: + - channels:manage + - groups:write + - chat:write + - chat:write.public +""" +from typing import Any +from uuid import uuid4 + +from slack_sdk import WebClient +from slack_sdk.errors import SlackApiError + +from danswer.connectors.slack.connector import default_msg_filter +from danswer.connectors.slack.connector import get_channel_messages +from danswer.connectors.slack.utils import make_paginated_slack_api_call_w_retries +from danswer.connectors.slack.utils import make_slack_api_call_w_retries + + +def _get_slack_channel_id(channel: dict[str, Any]) -> str: + if not (channel_id := channel.get("id")): + raise ValueError("Channel ID is missing") + return channel_id + + +def _get_non_general_channels( + slack_client: WebClient, + get_private: bool, + get_public: bool, + only_get_done: bool = False, +) -> list[dict[str, Any]]: + channel_types = [] + if get_private: + channel_types.append("private_channel") + if get_public: + channel_types.append("public_channel") + + conversations: list[dict[str, Any]] = [] + for result in make_paginated_slack_api_call_w_retries( + slack_client.conversations_list, + exclude_archived=False, + types=channel_types, + ): + conversations.extend(result["channels"]) + + filtered_conversations = [] + for conversation in conversations: + if conversation.get("is_general", False): + continue + if only_get_done and "done" not in conversation.get("name", ""): + continue + filtered_conversations.append(conversation) + return filtered_conversations + + +def _clear_slack_conversation_members( + slack_client: WebClient, + admin_user_id: str, + channel: dict[str, Any], +) -> None: + channel_id = _get_slack_channel_id(channel) + member_ids: list[str] = [] + for result in make_paginated_slack_api_call_w_retries( + slack_client.conversations_members, + channel=channel_id, + ): + member_ids.extend(result["members"]) + + for member_id in member_ids: + if member_id == admin_user_id: + continue + try: + slack_client.conversations_kick(channel=channel_id, user=member_id) + print(f"Kicked member: {member_id}") + except Exception as e: + if "cant_kick_self" in str(e): + continue + print(f"Error kicking member: {e}") + print(member_id) + try: + slack_client.conversations_unarchive(channel=channel_id) + 
channel["is_archived"] = False + except Exception: + # Channel is already unarchived + pass + + +def _add_slack_conversation_members( + slack_client: WebClient, channel: dict[str, Any], member_ids: list[str] +) -> None: + channel_id = _get_slack_channel_id(channel) + for user_id in member_ids: + try: + slack_client.conversations_invite(channel=channel_id, users=user_id) + except Exception as e: + if "already_in_channel" in str(e): + continue + print(f"Error inviting member: {e}") + print(user_id) + + +def _delete_slack_conversation_messages( + slack_client: WebClient, + channel: dict[str, Any], + message_to_delete: str | None = None, +) -> None: + """deletes all messages from a channel if message_to_delete is None""" + channel_id = _get_slack_channel_id(channel) + for message_batch in get_channel_messages(slack_client, channel): + for message in message_batch: + if default_msg_filter(message): + continue + + if message_to_delete and message.get("text") != message_to_delete: + continue + print(" removing message: ", message.get("text")) + + try: + if not (ts := message.get("ts")): + raise ValueError("Message timestamp is missing") + slack_client.chat_delete(channel=channel_id, ts=ts) + except Exception as e: + print(f"Error deleting message: {e}") + print(message) + + +def _build_slack_channel_from_name( + slack_client: WebClient, + admin_user_id: str, + suffix: str, + is_private: bool, + channel: dict[str, Any] | None, +) -> dict[str, Any]: + base = "public_channel" if not is_private else "private_channel" + channel_name = f"{base}-{suffix}" + if channel: + # If channel is provided, we rename it + channel_id = _get_slack_channel_id(channel) + channel_response = make_slack_api_call_w_retries( + slack_client.conversations_rename, + channel=channel_id, + name=channel_name, + ) + else: + # Otherwise, we create a new channel + channel_response = make_slack_api_call_w_retries( + slack_client.conversations_create, + name=channel_name, + is_private=is_private, + ) + + try: + slack_client.conversations_unarchive(channel=channel_response["channel"]["id"]) + except Exception: + # Channel is already unarchived + pass + try: + slack_client.conversations_invite( + channel=channel_response["channel"]["id"], + users=[admin_user_id], + ) + except Exception: + pass + + final_channel = channel_response["channel"] if channel_response else {} + return final_channel + + +class SlackManager: + @staticmethod + def get_slack_client(token: str) -> WebClient: + return WebClient(token=token) + + @staticmethod + def get_and_provision_available_slack_channels( + slack_client: WebClient, admin_user_id: str + ) -> tuple[dict[str, Any], dict[str, Any], str]: + run_id = str(uuid4()) + public_channels = _get_non_general_channels( + slack_client, get_private=False, get_public=True, only_get_done=True + ) + + first_available_channel = ( + None if len(public_channels) < 1 else public_channels[0] + ) + public_channel = _build_slack_channel_from_name( + slack_client=slack_client, + admin_user_id=admin_user_id, + suffix=run_id, + is_private=False, + channel=first_available_channel, + ) + _delete_slack_conversation_messages( + slack_client=slack_client, channel=public_channel + ) + + private_channels = _get_non_general_channels( + slack_client, get_private=True, get_public=False, only_get_done=True + ) + second_available_channel = ( + None if len(private_channels) < 1 else private_channels[0] + ) + private_channel = _build_slack_channel_from_name( + slack_client=slack_client, + admin_user_id=admin_user_id, + suffix=run_id, + 
is_private=True, + channel=second_available_channel, + ) + _delete_slack_conversation_messages( + slack_client=slack_client, channel=private_channel + ) + + return public_channel, private_channel, run_id + + @staticmethod + def build_slack_user_email_id_map(slack_client: WebClient) -> dict[str, str]: + users_results = make_slack_api_call_w_retries( + slack_client.users_list, + ) + users: list[dict[str, Any]] = users_results.get("members", []) + user_email_id_map = {} + for user in users: + if not (email := user.get("profile", {}).get("email")): + continue + if not (user_id := user.get("id")): + raise ValueError("User ID is missing") + user_email_id_map[email] = user_id + return user_email_id_map + + @staticmethod + def set_channel_members( + slack_client: WebClient, + admin_user_id: str, + channel: dict[str, Any], + user_ids: list[str], + ) -> None: + _clear_slack_conversation_members( + slack_client=slack_client, + channel=channel, + admin_user_id=admin_user_id, + ) + _add_slack_conversation_members( + slack_client=slack_client, channel=channel, member_ids=user_ids + ) + + @staticmethod + def add_message_to_channel( + slack_client: WebClient, channel: dict[str, Any], message: str + ) -> None: + channel_id = _get_slack_channel_id(channel) + make_slack_api_call_w_retries( + slack_client.chat_postMessage, + channel=channel_id, + text=message, + ) + + @staticmethod + def remove_message_from_channel( + slack_client: WebClient, channel: dict[str, Any], message: str + ) -> None: + _delete_slack_conversation_messages( + slack_client=slack_client, channel=channel, message_to_delete=message + ) + + @staticmethod + def cleanup_after_test( + slack_client: WebClient, + test_id: str, + ) -> None: + channel_types = ["private_channel", "public_channel"] + channels: list[dict[str, Any]] = [] + for result in make_paginated_slack_api_call_w_retries( + slack_client.conversations_list, + exclude_archived=False, + types=channel_types, + ): + channels.extend(result["channels"]) + + for channel in channels: + if test_id not in channel.get("name", ""): + continue + # "done" in the channel name indicates that this channel is free to be used for a new test + new_name = f"done_{str(uuid4())}" + try: + slack_client.conversations_rename(channel=channel["id"], name=new_name) + except SlackApiError as e: + print(f"Error renaming channel {channel['id']}: {e}") diff --git a/backend/tests/integration/connector_job_tests/slack/test_permission_sync.py b/backend/tests/integration/connector_job_tests/slack/test_permission_sync.py new file mode 100644 index 00000000000..3c37332547d --- /dev/null +++ b/backend/tests/integration/connector_job_tests/slack/test_permission_sync.py @@ -0,0 +1,339 @@ +import os +from datetime import datetime +from datetime import timezone +from typing import Any + +from danswer.connectors.models import InputType +from danswer.db.enums import AccessType +from danswer.server.documents.models import DocumentSource +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.connector import ConnectorManager +from tests.integration.common_utils.managers.credential import CredentialManager +from tests.integration.common_utils.managers.document_search import ( + DocumentSearchManager, +) +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.managers.user_group import UserGroupManager +from 
tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestConnector +from tests.integration.common_utils.test_models import DATestCredential +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture +from tests.integration.connector_job_tests.slack.slack_api_utils import SlackManager + + +# @pytest.mark.xfail(reason="flaky - see DAN-789 for example", strict=False) +def test_slack_permission_sync( + reset: None, + vespa_client: vespa_fixture, + slack_test_setup: tuple[dict[str, Any], dict[str, Any]], +) -> None: + public_channel, private_channel = slack_test_setup + + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create( + email="admin@onyx-test.com", + ) + + # Creating a non-admin user + test_user_1: DATestUser = UserManager.create( + email="test_user_1@onyx-test.com", + ) + + # Creating a non-admin user + test_user_2: DATestUser = UserManager.create( + email="test_user_2@onyx-test.com", + ) + + slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"]) + email_id_map = SlackManager.build_slack_user_email_id_map(slack_client) + admin_user_id = email_id_map[admin_user.email] + + LLMProviderManager.create(user_performing_action=admin_user) + + before = datetime.now(timezone.utc) + credential: DATestCredential = CredentialManager.create( + source=DocumentSource.SLACK, + credential_json={ + "slack_bot_token": os.environ["SLACK_BOT_TOKEN"], + }, + user_performing_action=admin_user, + ) + connector: DATestConnector = ConnectorManager.create( + name="Slack", + input_type=InputType.POLL, + source=DocumentSource.SLACK, + connector_specific_config={ + "workspace": "onyx-test-workspace", + "channels": [public_channel["name"], private_channel["name"]], + }, + access_type=AccessType.SYNC, + groups=[], + user_performing_action=admin_user, + ) + cc_pair: DATestCCPair = CCPairManager.create( + credential_id=credential.id, + connector_id=connector.id, + access_type=AccessType.SYNC, + user_performing_action=admin_user, + ) + CCPairManager.wait_for_indexing( + cc_pair=cc_pair, + after=before, + user_performing_action=admin_user, + ) + + # Add test_user_1 and admin_user to the private channel + desired_channel_members = [admin_user, test_user_1] + SlackManager.set_channel_members( + slack_client=slack_client, + admin_user_id=admin_user_id, + channel=private_channel, + user_ids=[email_id_map[user.email] for user in desired_channel_members], + ) + + public_message = "Steve's favorite number is 809752" + private_message = "Sara's favorite number is 346794" + + # Add messages to channels + print(f"\n Adding public message to channel: {public_message}") + SlackManager.add_message_to_channel( + slack_client=slack_client, + channel=public_channel, + message=public_message, + ) + print(f"\n Adding private message to channel: {private_message}") + SlackManager.add_message_to_channel( + slack_client=slack_client, + channel=private_channel, + message=private_message, + ) + + # Run indexing + before = datetime.now(timezone.utc) + CCPairManager.run_once(cc_pair, admin_user) + CCPairManager.wait_for_indexing( + cc_pair=cc_pair, + after=before, + user_performing_action=admin_user, + ) + + # Run permission sync + CCPairManager.sync( + cc_pair=cc_pair, + user_performing_action=admin_user, + ) + CCPairManager.wait_for_sync( + cc_pair=cc_pair, + after=before, + number_of_updated_docs=2, + 
user_performing_action=admin_user, + ) + + # Search as admin with access to both channels + print("\nSearching as admin user") + danswer_doc_message_strings = DocumentSearchManager.search_documents( + query="favorite number", + user_performing_action=admin_user, + ) + print( + "\n documents retrieved by admin user: ", + danswer_doc_message_strings, + ) + + # Ensure admin user can see messages from both channels + assert public_message in danswer_doc_message_strings + assert private_message in danswer_doc_message_strings + + # Search as test_user_2 with access to only the public channel + print("\n Searching as test_user_2") + danswer_doc_message_strings = DocumentSearchManager.search_documents( + query="favorite number", + user_performing_action=test_user_2, + ) + print( + "\n documents retrieved by test_user_2: ", + danswer_doc_message_strings, + ) + + # Ensure test_user_2 can only see messages from the public channel + assert public_message in danswer_doc_message_strings + assert private_message not in danswer_doc_message_strings + + # Search as test_user_1 with access to both channels + print("\n Searching as test_user_1") + danswer_doc_message_strings = DocumentSearchManager.search_documents( + query="favorite number", + user_performing_action=test_user_1, + ) + print( + "\n documents retrieved by test_user_1 before being removed from private channel: ", + danswer_doc_message_strings, + ) + + # Ensure test_user_1 can see messages from both channels + assert public_message in danswer_doc_message_strings + assert private_message in danswer_doc_message_strings + + # ----------------------MAKE THE CHANGES-------------------------- + print("\n Removing test_user_1 from the private channel") + before = datetime.now(timezone.utc) + # Remove test_user_1 from the private channel + desired_channel_members = [admin_user] + SlackManager.set_channel_members( + slack_client=slack_client, + admin_user_id=admin_user_id, + channel=private_channel, + user_ids=[email_id_map[user.email] for user in desired_channel_members], + ) + + # Run permission sync + CCPairManager.sync( + cc_pair=cc_pair, + user_performing_action=admin_user, + ) + CCPairManager.wait_for_sync( + cc_pair=cc_pair, + after=before, + number_of_updated_docs=1, + user_performing_action=admin_user, + ) + + # ----------------------------VERIFY THE CHANGES--------------------------- + # Ensure test_user_1 can no longer see messages from the private channel + # Search as test_user_1 with access to only the public channel + + danswer_doc_message_strings = DocumentSearchManager.search_documents( + query="favorite number", + user_performing_action=test_user_1, + ) + print( + "\n documents retrieved by test_user_1 after being removed from private channel: ", + danswer_doc_message_strings, + ) + + # Ensure test_user_1 can only see messages from the public channel + assert public_message in danswer_doc_message_strings + assert private_message not in danswer_doc_message_strings + + +def test_slack_group_permission_sync( + reset: None, + vespa_client: vespa_fixture, + slack_test_setup: tuple[dict[str, Any], dict[str, Any]], +) -> None: + """ + This test ensures that permission sync overrides danswer group access. 
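+    Even though test_user_1 is in a danswer user group attached to the connector, only Slack channel membership should grant access to the private channel's documents.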
+ """ + public_channel, private_channel = slack_test_setup + + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create( + email="admin@onyx-test.com", + ) + + # Creating a non-admin user + test_user_1: DATestUser = UserManager.create( + email="test_user_1@onyx-test.com", + ) + + # Create a user group and adding the non-admin user to it + user_group = UserGroupManager.create( + name="test_group", + user_ids=[test_user_1.id], + cc_pair_ids=[], + user_performing_action=admin_user, + ) + UserGroupManager.wait_for_sync( + user_groups_to_check=[user_group], + user_performing_action=admin_user, + ) + + slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"]) + email_id_map = SlackManager.build_slack_user_email_id_map(slack_client) + admin_user_id = email_id_map[admin_user.email] + + LLMProviderManager.create(user_performing_action=admin_user) + + # Add only admin to the private channel + SlackManager.set_channel_members( + slack_client=slack_client, + admin_user_id=admin_user_id, + channel=private_channel, + user_ids=[admin_user_id], + ) + + before = datetime.now(timezone.utc) + credential = CredentialManager.create( + source=DocumentSource.SLACK, + credential_json={ + "slack_bot_token": os.environ["SLACK_BOT_TOKEN"], + }, + user_performing_action=admin_user, + ) + + # Create connector with sync access and assign it to the user group + connector = ConnectorManager.create( + name="Slack", + input_type=InputType.POLL, + source=DocumentSource.SLACK, + connector_specific_config={ + "workspace": "onyx-test-workspace", + "channels": [private_channel["name"]], + }, + access_type=AccessType.SYNC, + groups=[user_group.id], + user_performing_action=admin_user, + ) + + cc_pair = CCPairManager.create( + credential_id=credential.id, + connector_id=connector.id, + access_type=AccessType.SYNC, + user_performing_action=admin_user, + groups=[user_group.id], + ) + + # Add a test message to the private channel + private_message = "This is a secret message: 987654" + SlackManager.add_message_to_channel( + slack_client=slack_client, + channel=private_channel, + message=private_message, + ) + + # Run indexing + CCPairManager.run_once(cc_pair, admin_user) + CCPairManager.wait_for_indexing( + cc_pair=cc_pair, + after=before, + user_performing_action=admin_user, + ) + + # Run permission sync + CCPairManager.sync( + cc_pair=cc_pair, + user_performing_action=admin_user, + ) + CCPairManager.wait_for_sync( + cc_pair=cc_pair, + after=before, + number_of_updated_docs=1, + user_performing_action=admin_user, + ) + + # Verify admin can see the message + admin_docs = DocumentSearchManager.search_documents( + query="secret message", + user_performing_action=admin_user, + ) + assert private_message in admin_docs + + # Verify test_user_1 cannot see the message despite being in the group + # (Slack permissions should take precedence) + user_1_docs = DocumentSearchManager.search_documents( + query="secret message", + user_performing_action=test_user_1, + ) + assert private_message not in user_1_docs diff --git a/backend/tests/integration/connector_job_tests/slack/test_prune.py b/backend/tests/integration/connector_job_tests/slack/test_prune.py new file mode 100644 index 00000000000..2dfc3d0750f --- /dev/null +++ b/backend/tests/integration/connector_job_tests/slack/test_prune.py @@ -0,0 +1,210 @@ +import os +from datetime import datetime +from datetime import timezone +from typing import Any + +import pytest + +from danswer.connectors.models import 
InputType +from danswer.db.enums import AccessType +from danswer.server.documents.models import DocumentSource +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.connector import ConnectorManager +from tests.integration.common_utils.managers.credential import CredentialManager +from tests.integration.common_utils.managers.document_search import ( + DocumentSearchManager, +) +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestConnector +from tests.integration.common_utils.test_models import DATestCredential +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture +from tests.integration.connector_job_tests.slack.slack_api_utils import SlackManager + + +@pytest.mark.xfail(reason="flaky - see DAN-986 for details", strict=False) +def test_slack_prune( + reset: None, + vespa_client: vespa_fixture, + slack_test_setup: tuple[dict[str, Any], dict[str, Any]], +) -> None: + public_channel, private_channel = slack_test_setup + + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create( + email="admin@onyx-test.com", + ) + + # Creating a non-admin user + test_user_1: DATestUser = UserManager.create( + email="test_user_1@onyx-test.com", + ) + + slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"]) + email_id_map = SlackManager.build_slack_user_email_id_map(slack_client) + admin_user_id = email_id_map[admin_user.email] + + LLMProviderManager.create(user_performing_action=admin_user) + + before = datetime.now(timezone.utc) + credential: DATestCredential = CredentialManager.create( + source=DocumentSource.SLACK, + credential_json={ + "slack_bot_token": os.environ["SLACK_BOT_TOKEN"], + }, + user_performing_action=admin_user, + ) + connector: DATestConnector = ConnectorManager.create( + name="Slack", + input_type=InputType.POLL, + source=DocumentSource.SLACK, + connector_specific_config={ + "workspace": "onyx-test-workspace", + "channels": [public_channel["name"], private_channel["name"]], + }, + access_type=AccessType.PUBLIC, + groups=[], + user_performing_action=admin_user, + ) + cc_pair: DATestCCPair = CCPairManager.create( + credential_id=credential.id, + connector_id=connector.id, + access_type=AccessType.SYNC, + user_performing_action=admin_user, + ) + CCPairManager.wait_for_indexing( + cc_pair=cc_pair, + after=before, + user_performing_action=admin_user, + ) + + # ----------------------SETUP INITIAL SLACK STATE-------------------------- + # Add test_user_1 and admin_user to the private channel + desired_channel_members = [admin_user, test_user_1] + SlackManager.set_channel_members( + slack_client=slack_client, + admin_user_id=admin_user_id, + channel=private_channel, + user_ids=[email_id_map[user.email] for user in desired_channel_members], + ) + + public_message = "Steve's favorite number is 809752" + private_message = "Sara's favorite number is 346794" + message_to_delete = "Rebecca's favorite number is 753468" + + SlackManager.add_message_to_channel( + slack_client=slack_client, + channel=public_channel, + message=public_message, + ) + SlackManager.add_message_to_channel( + slack_client=slack_client, + channel=private_channel, + 
        message=private_message,
+    )
+    SlackManager.add_message_to_channel(
+        slack_client=slack_client,
+        channel=private_channel,
+        message=message_to_delete,
+    )
+
+    # Run indexing
+    before = datetime.now(timezone.utc)
+    CCPairManager.run_once(cc_pair, admin_user)
+    CCPairManager.wait_for_indexing(
+        cc_pair=cc_pair,
+        after=before,
+        user_performing_action=admin_user,
+    )
+
+    # Run permission sync
+    before = datetime.now(timezone.utc)
+    CCPairManager.sync(
+        cc_pair=cc_pair,
+        user_performing_action=admin_user,
+    )
+    CCPairManager.wait_for_sync(
+        cc_pair=cc_pair,
+        after=before,
+        user_performing_action=admin_user,
+    )
+
+    # ----------------------TEST THE SETUP--------------------------
+    # Search as admin with access to both channels
+    danswer_doc_message_strings = DocumentSearchManager.search_documents(
+        query="favorite number",
+        user_performing_action=admin_user,
+    )
+    print(
+        "\ntop_documents content before deleting for admin: ",
+        danswer_doc_message_strings,
+    )
+
+    # Ensure admin user can see all messages
+    assert public_message in danswer_doc_message_strings
+    assert private_message in danswer_doc_message_strings
+    assert message_to_delete in danswer_doc_message_strings
+
+    # Search as test_user_1 with access to both channels
+    danswer_doc_message_strings = DocumentSearchManager.search_documents(
+        query="favorite number",
+        user_performing_action=test_user_1,
+    )
+    print(
+        "\ntop_documents content before deleting for test_user_1: ",
+        danswer_doc_message_strings,
+    )
+
+    # Ensure test_user_1 can see all messages
+    assert public_message in danswer_doc_message_strings
+    assert private_message in danswer_doc_message_strings
+    assert message_to_delete in danswer_doc_message_strings
+
+    # ----------------------MAKE THE CHANGES--------------------------
+    # Delete messages
+    print("\nDeleting message: ", message_to_delete)
+    SlackManager.remove_message_from_channel(
+        slack_client=slack_client,
+        channel=private_channel,
+        message=message_to_delete,
+    )
+
+    # Prune the cc_pair
+    now = datetime.now(timezone.utc)
+    CCPairManager.prune(cc_pair, user_performing_action=admin_user)
+    CCPairManager.wait_for_prune(cc_pair, now, user_performing_action=admin_user)
+
+    # ----------------------------VERIFY THE CHANGES---------------------------
+    # Ensure admin user can't see deleted messages
+    # Search as admin, who still has access to both channels
+    danswer_doc_message_strings = DocumentSearchManager.search_documents(
+        query="favorite number",
+        user_performing_action=admin_user,
+    )
+    print(
+        "\ntop_documents content after deleting for admin: ",
+        danswer_doc_message_strings,
+    )
+
+    # Ensure admin can't see deleted messages
+    assert public_message in danswer_doc_message_strings
+    assert private_message in danswer_doc_message_strings
+    assert message_to_delete not in danswer_doc_message_strings
+
+    # Ensure test_user_1 can't see deleted messages
+    # Search as test_user_1, who still has access to both channels
+    danswer_doc_message_strings = DocumentSearchManager.search_documents(
+        query="favorite number",
+        user_performing_action=test_user_1,
+    )
+    print(
+        "\ntop_documents content after prune for test_user_1: ",
+        danswer_doc_message_strings,
+    )
+
+    # Ensure test_user_1 can't see deleted messages
+    assert public_message in danswer_doc_message_strings
+    assert private_message in danswer_doc_message_strings
+    assert message_to_delete not in danswer_doc_message_strings
diff --git a/backend/tests/integration/multitenant_tests/cc_Pair b/backend/tests/integration/multitenant_tests/cc_Pair
new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backend/tests/integration/multitenant_tests/syncing/test_search_permissions.py b/backend/tests/integration/multitenant_tests/syncing/test_search_permissions.py new file mode 100644 index 00000000000..fead77387f6 --- /dev/null +++ b/backend/tests/integration/multitenant_tests/syncing/test_search_permissions.py @@ -0,0 +1,150 @@ +from danswer.db.models import UserRole +from tests.integration.common_utils.managers.api_key import APIKeyManager +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.chat import ChatSessionManager +from tests.integration.common_utils.managers.document import DocumentManager +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager +from tests.integration.common_utils.managers.tenant import TenantManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestChatSession +from tests.integration.common_utils.test_models import DATestUser + + +def test_multi_tenant_access_control(reset_multitenant: None) -> None: + # Create Tenant 1 and its Admin User + TenantManager.create("tenant_dev1", "test1@test.com", "Data Plane Registration") + test_user1: DATestUser = UserManager.create(name="test1", email="test1@test.com") + assert UserManager.verify_role(test_user1, UserRole.ADMIN) + + # Create Tenant 2 and its Admin User + TenantManager.create("tenant_dev2", "test2@test.com", "Data Plane Registration") + test_user2: DATestUser = UserManager.create(name="test2", email="test2@test.com") + assert UserManager.verify_role(test_user2, UserRole.ADMIN) + + # Create connectors for Tenant 1 + cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch( + user_performing_action=test_user1, + ) + api_key_1: DATestAPIKey = APIKeyManager.create( + user_performing_action=test_user1, + ) + api_key_1.headers.update(test_user1.headers) + LLMProviderManager.create(user_performing_action=test_user1) + + # Seed documents for Tenant 1 + cc_pair_1.documents = [] + doc1_tenant1 = DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Tenant 1 Document Content", + api_key=api_key_1, + ) + doc2_tenant1 = DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Tenant 1 Document Content", + api_key=api_key_1, + ) + cc_pair_1.documents.extend([doc1_tenant1, doc2_tenant1]) + + # Create connectors for Tenant 2 + cc_pair_2: DATestCCPair = CCPairManager.create_from_scratch( + user_performing_action=test_user2, + ) + api_key_2: DATestAPIKey = APIKeyManager.create( + user_performing_action=test_user2, + ) + api_key_2.headers.update(test_user2.headers) + LLMProviderManager.create(user_performing_action=test_user2) + + # Seed documents for Tenant 2 + cc_pair_2.documents = [] + doc1_tenant2 = DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_2, + content="Tenant 2 Document Content", + api_key=api_key_2, + ) + doc2_tenant2 = DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_2, + content="Tenant 2 Document Content", + api_key=api_key_2, + ) + cc_pair_2.documents.extend([doc1_tenant2, doc2_tenant2]) + + tenant1_doc_ids = {doc1_tenant1.id, doc2_tenant1.id} + tenant2_doc_ids = {doc1_tenant2.id, doc2_tenant2.id} + + # Create chat sessions for each user + chat_session1: DATestChatSession = 
ChatSessionManager.create( + user_performing_action=test_user1 + ) + chat_session2: DATestChatSession = ChatSessionManager.create( + user_performing_action=test_user2 + ) + + # User 1 sends a message and gets a response + response1 = ChatSessionManager.send_message( + chat_session_id=chat_session1.id, + message="What is in Tenant 1's documents?", + user_performing_action=test_user1, + ) + # Assert that the search tool was used + assert response1.tool_name == "run_search" + + response_doc_ids = {doc["document_id"] for doc in response1.tool_result or []} + assert tenant1_doc_ids.issubset( + response_doc_ids + ), "Not all Tenant 1 document IDs are in the response" + assert not response_doc_ids.intersection( + tenant2_doc_ids + ), "Tenant 2 document IDs should not be in the response" + + # Assert that the contents are correct + for doc in response1.tool_result or []: + assert doc["content"] == "Tenant 1 Document Content" + + # User 2 sends a message and gets a response + response2 = ChatSessionManager.send_message( + chat_session_id=chat_session2.id, + message="What is in Tenant 2's documents?", + user_performing_action=test_user2, + ) + # Assert that the search tool was used + assert response2.tool_name == "run_search" + # Assert that the tool_result contains Tenant 2's documents + response_doc_ids = {doc["document_id"] for doc in response2.tool_result or []} + assert tenant2_doc_ids.issubset( + response_doc_ids + ), "Not all Tenant 2 document IDs are in the response" + assert not response_doc_ids.intersection( + tenant1_doc_ids + ), "Tenant 1 document IDs should not be in the response" + # Assert that the contents are correct + for doc in response2.tool_result or []: + assert doc["content"] == "Tenant 2 Document Content" + + # User 1 tries to access Tenant 2's documents + response_cross = ChatSessionManager.send_message( + chat_session_id=chat_session1.id, + message="What is in Tenant 2's documents?", + user_performing_action=test_user1, + ) + # Assert that the search tool was used + assert response_cross.tool_name == "run_search" + # Assert that the tool_result is empty or does not contain Tenant 2's documents + response_doc_ids = {doc["document_id"] for doc in response_cross.tool_result or []} + # Ensure none of Tenant 2's document IDs are in the response + assert not response_doc_ids.intersection(tenant2_doc_ids) + + # User 2 tries to access Tenant 1's documents + response_cross2 = ChatSessionManager.send_message( + chat_session_id=chat_session2.id, + message="What is in Tenant 1's documents?", + user_performing_action=test_user2, + ) + # Assert that the search tool was used + assert response_cross2.tool_name == "run_search" + # Assert that the tool_result is empty or does not contain Tenant 1's documents + response_doc_ids = {doc["document_id"] for doc in response_cross2.tool_result or []} + # Ensure none of Tenant 1's document IDs are in the response + assert not response_doc_ids.intersection(tenant1_doc_ids) diff --git a/backend/tests/integration/multitenant_tests/tenants/test_tenant_creation.py b/backend/tests/integration/multitenant_tests/tenants/test_tenant_creation.py new file mode 100644 index 00000000000..33110653557 --- /dev/null +++ b/backend/tests/integration/multitenant_tests/tenants/test_tenant_creation.py @@ -0,0 +1,41 @@ +from danswer.configs.constants import DocumentSource +from danswer.db.enums import AccessType +from danswer.db.models import UserRole +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from 
tests.integration.common_utils.managers.connector import ConnectorManager +from tests.integration.common_utils.managers.credential import CredentialManager +from tests.integration.common_utils.managers.tenant import TenantManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestUser + + +# Test flow from creating tenant to registering as a user +def test_tenant_creation(reset_multitenant: None) -> None: + TenantManager.create("tenant_dev", "test@test.com", "Data Plane Registration") + test_user: DATestUser = UserManager.create(name="test", email="test@test.com") + + assert UserManager.verify_role(test_user, UserRole.ADMIN) + + test_credential = CredentialManager.create( + name="admin_test_credential", + source=DocumentSource.FILE, + curator_public=False, + user_performing_action=test_user, + ) + + test_connector = ConnectorManager.create( + name="admin_test_connector", + source=DocumentSource.FILE, + access_type=AccessType.PRIVATE, + user_performing_action=test_user, + ) + + test_cc_pair = CCPairManager.create( + connector_id=test_connector.id, + credential_id=test_credential.id, + name="admin_test_cc_pair", + access_type=AccessType.PRIVATE, + user_performing_action=test_user, + ) + + CCPairManager.verify(cc_pair=test_cc_pair, user_performing_action=test_user) diff --git a/backend/tests/integration/openai_assistants_api/conftest.py b/backend/tests/integration/openai_assistants_api/conftest.py new file mode 100644 index 00000000000..172247dc391 --- /dev/null +++ b/backend/tests/integration/openai_assistants_api/conftest.py @@ -0,0 +1,55 @@ +from typing import Optional +from uuid import UUID + +import pytest +import requests + +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager +from tests.integration.common_utils.managers.user import build_email +from tests.integration.common_utils.managers.user import DEFAULT_PASSWORD +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestLLMProvider +from tests.integration.common_utils.test_models import DATestUser + +BASE_URL = f"{API_SERVER_URL}/openai-assistants" + + +@pytest.fixture +def admin_user() -> DATestUser | None: + try: + return UserManager.create("admin_user") + except Exception: + pass + + try: + return UserManager.login_as_user( + DATestUser( + id="", + email=build_email("admin_user"), + password=DEFAULT_PASSWORD, + headers=GENERAL_HEADERS, + ) + ) + except Exception: + pass + + return None + + +@pytest.fixture +def llm_provider(admin_user: DATestUser | None) -> DATestLLMProvider: + return LLMProviderManager.create(user_performing_action=admin_user) + + +@pytest.fixture +def thread_id(admin_user: Optional[DATestUser]) -> UUID: + # Create a thread to use in the tests + response = requests.post( + f"{BASE_URL}/threads", # Updated endpoint path + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + return UUID(response.json()["id"]) diff --git a/backend/tests/integration/openai_assistants_api/test_assistants.py b/backend/tests/integration/openai_assistants_api/test_assistants.py new file mode 100644 index 00000000000..14f270f1a0e --- /dev/null +++ b/backend/tests/integration/openai_assistants_api/test_assistants.py @@ -0,0 +1,151 @@ +import requests + 
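# Covers the OpenAI-Assistants-compatible assistant endpoints: create, retrieve, modify, delete, list (with pagination), and 404 handling for a missing assistant.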
+from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestUser + +ASSISTANTS_URL = f"{API_SERVER_URL}/openai-assistants/assistants" + + +def test_create_assistant(admin_user: DATestUser | None) -> None: + response = requests.post( + ASSISTANTS_URL, + json={ + "model": "gpt-3.5-turbo", + "name": "Test Assistant", + "description": "A test assistant", + "instructions": "You are a helpful assistant.", + }, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert data["name"] == "Test Assistant" + assert data["description"] == "A test assistant" + assert data["model"] == "gpt-3.5-turbo" + assert data["instructions"] == "You are a helpful assistant." + + +def test_retrieve_assistant(admin_user: DATestUser | None) -> None: + # First, create an assistant + create_response = requests.post( + ASSISTANTS_URL, + json={"model": "gpt-3.5-turbo", "name": "Retrieve Test"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + assistant_id = create_response.json()["id"] + + # Now, retrieve the assistant + response = requests.get( + f"{ASSISTANTS_URL}/{assistant_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert data["id"] == assistant_id + assert data["name"] == "Retrieve Test" + + +def test_modify_assistant(admin_user: DATestUser | None) -> None: + # First, create an assistant + create_response = requests.post( + ASSISTANTS_URL, + json={"model": "gpt-3.5-turbo", "name": "Modify Test"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + assistant_id = create_response.json()["id"] + + # Now, modify the assistant + response = requests.post( + f"{ASSISTANTS_URL}/{assistant_id}", + json={"name": "Modified Assistant", "instructions": "New instructions"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert data["id"] == assistant_id + assert data["name"] == "Modified Assistant" + assert data["instructions"] == "New instructions" + + +def test_delete_assistant(admin_user: DATestUser | None) -> None: + # First, create an assistant + create_response = requests.post( + ASSISTANTS_URL, + json={"model": "gpt-3.5-turbo", "name": "Delete Test"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + assistant_id = create_response.json()["id"] + + # Now, delete the assistant + response = requests.delete( + f"{ASSISTANTS_URL}/{assistant_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert data["id"] == assistant_id + assert data["deleted"] is True + + +def test_list_assistants(admin_user: DATestUser | None) -> None: + # Create multiple assistants + for i in range(3): + requests.post( + ASSISTANTS_URL, + json={"model": "gpt-3.5-turbo", "name": f"List Test {i}"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + + # Now, list the assistants + response = requests.get( + ASSISTANTS_URL, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert 
data["object"] == "list" + assert len(data["data"]) >= 3 # At least the 3 we just created + assert all(assistant["object"] == "assistant" for assistant in data["data"]) + + +def test_list_assistants_pagination(admin_user: DATestUser | None) -> None: + # Create 5 assistants + for i in range(5): + requests.post( + ASSISTANTS_URL, + json={"model": "gpt-3.5-turbo", "name": f"Pagination Test {i}"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + + # List assistants with limit + response = requests.get( + f"{ASSISTANTS_URL}?limit=2", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert len(data["data"]) == 2 + assert data["has_more"] is True + + # Get next page + before = data["last_id"] + response = requests.get( + f"{ASSISTANTS_URL}?limit=2&before={before}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + data = response.json() + assert len(data["data"]) == 2 + + +def test_assistant_not_found(admin_user: DATestUser | None) -> None: + non_existent_id = -99 + response = requests.get( + f"{ASSISTANTS_URL}/{non_existent_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 404 diff --git a/backend/tests/integration/openai_assistants_api/test_messages.py b/backend/tests/integration/openai_assistants_api/test_messages.py new file mode 100644 index 00000000000..cbcf6869435 --- /dev/null +++ b/backend/tests/integration/openai_assistants_api/test_messages.py @@ -0,0 +1,133 @@ +import uuid +from typing import Optional + +import pytest +import requests + +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestUser + +BASE_URL = f"{API_SERVER_URL}/openai-assistants/threads" + + +@pytest.fixture +def thread_id(admin_user: Optional[DATestUser]) -> str: + response = requests.post( + BASE_URL, + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + return response.json()["id"] + + +def test_create_message(admin_user: Optional[DATestUser], thread_id: str) -> None: + response = requests.post( + f"{BASE_URL}/{thread_id}/messages", # URL structure matches API + json={ + "role": "user", + "content": "Hello, world!", + "file_ids": [], + "metadata": {"key": "value"}, + }, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert "id" in response_json + assert response_json["thread_id"] == thread_id + assert response_json["role"] == "user" + assert response_json["content"] == [{"type": "text", "text": "Hello, world!"}] + assert response_json["metadata"] == {"key": "value"} + + +def test_list_messages(admin_user: Optional[DATestUser], thread_id: str) -> None: + # Create a message first + requests.post( + f"{BASE_URL}/{thread_id}/messages", + json={"role": "user", "content": "Test message"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + + # Now, list the messages + response = requests.get( + f"{BASE_URL}/{thread_id}/messages", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert response_json["object"] == "list" + assert isinstance(response_json["data"], list) + assert 
len(response_json["data"]) > 0 + assert "first_id" in response_json + assert "last_id" in response_json + assert "has_more" in response_json + + +def test_retrieve_message(admin_user: Optional[DATestUser], thread_id: str) -> None: + # Create a message first + create_response = requests.post( + f"{BASE_URL}/{thread_id}/messages", + json={"role": "user", "content": "Test message"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + message_id = create_response.json()["id"] + + # Now, retrieve the message + response = requests.get( + f"{BASE_URL}/{thread_id}/messages/{message_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert response_json["id"] == message_id + assert response_json["thread_id"] == thread_id + assert response_json["role"] == "user" + assert response_json["content"] == [{"type": "text", "text": "Test message"}] + + +def test_modify_message(admin_user: Optional[DATestUser], thread_id: str) -> None: + # Create a message first + create_response = requests.post( + f"{BASE_URL}/{thread_id}/messages", + json={"role": "user", "content": "Test message"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + message_id = create_response.json()["id"] + + # Now, modify the message + response = requests.post( + f"{BASE_URL}/{thread_id}/messages/{message_id}", + json={"metadata": {"new_key": "new_value"}}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert response_json["id"] == message_id + assert response_json["thread_id"] == thread_id + assert response_json["metadata"] == {"new_key": "new_value"} + + +def test_error_handling(admin_user: Optional[DATestUser]) -> None: + non_existent_thread_id = str(uuid.uuid4()) + non_existent_message_id = -99 + + # Test with non-existent thread + response = requests.post( + f"{BASE_URL}/{non_existent_thread_id}/messages", + json={"role": "user", "content": "Test message"}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 404 + + # Test with non-existent message + response = requests.get( + f"{BASE_URL}/{non_existent_thread_id}/messages/{non_existent_message_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 404 diff --git a/backend/tests/integration/openai_assistants_api/test_runs.py b/backend/tests/integration/openai_assistants_api/test_runs.py new file mode 100644 index 00000000000..2ee0dbd4ba9 --- /dev/null +++ b/backend/tests/integration/openai_assistants_api/test_runs.py @@ -0,0 +1,137 @@ +from uuid import UUID + +import pytest +import requests + +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestLLMProvider +from tests.integration.common_utils.test_models import DATestUser + +BASE_URL = f"{API_SERVER_URL}/openai-assistants" + + +@pytest.fixture +def run_id(admin_user: DATestUser | None, thread_id: UUID) -> str: + """Create a run and return its ID.""" + response = requests.post( + f"{BASE_URL}/threads/{thread_id}/runs", + json={ + "assistant_id": 0, + }, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + return response.json()["id"] + + +def test_create_run( + admin_user: DATestUser | None, 
thread_id: UUID, llm_provider: DATestLLMProvider +) -> None: + response = requests.post( + f"{BASE_URL}/threads/{thread_id}/runs", + json={ + "assistant_id": 0, + "model": "gpt-3.5-turbo", + "instructions": "Test instructions", + }, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert "id" in response_json + assert response_json["object"] == "thread.run" + assert "created_at" in response_json + assert response_json["assistant_id"] == 0 + assert UUID(response_json["thread_id"]) == thread_id + assert response_json["status"] == "queued" + assert response_json["model"] == "gpt-3.5-turbo" + assert response_json["instructions"] == "Test instructions" + + +def test_retrieve_run( + admin_user: DATestUser | None, + thread_id: UUID, + run_id: str, + llm_provider: DATestLLMProvider, +) -> None: + retrieve_response = requests.get( + f"{BASE_URL}/threads/{thread_id}/runs/{run_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert retrieve_response.status_code == 200 + + response_json = retrieve_response.json() + assert response_json["id"] == run_id + assert response_json["object"] == "thread.run" + assert "created_at" in response_json + assert UUID(response_json["thread_id"]) == thread_id + + +def test_cancel_run( + admin_user: DATestUser | None, + thread_id: UUID, + run_id: str, + llm_provider: DATestLLMProvider, +) -> None: + cancel_response = requests.post( + f"{BASE_URL}/threads/{thread_id}/runs/{run_id}/cancel", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert cancel_response.status_code == 200 + + response_json = cancel_response.json() + assert response_json["id"] == run_id + assert response_json["status"] == "cancelled" + + +def test_list_runs( + admin_user: DATestUser | None, thread_id: UUID, llm_provider: DATestLLMProvider +) -> None: + # Create a few runs + for _ in range(3): + requests.post( + f"{BASE_URL}/threads/{thread_id}/runs", + json={ + "assistant_id": 0, + }, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + + # Now, list the runs + list_response = requests.get( + f"{BASE_URL}/threads/{thread_id}/runs", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert list_response.status_code == 200 + + response_json = list_response.json() + assert isinstance(response_json, list) + assert len(response_json) >= 3 + + for run in response_json: + assert "id" in run + assert run["object"] == "thread.run" + assert "created_at" in run + assert UUID(run["thread_id"]) == thread_id + assert "status" in run + assert "model" in run + + +def test_list_run_steps( + admin_user: DATestUser | None, + thread_id: UUID, + run_id: str, + llm_provider: DATestLLMProvider, +) -> None: + steps_response = requests.get( + f"{BASE_URL}/threads/{thread_id}/runs/{run_id}/steps", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert steps_response.status_code == 200 + + response_json = steps_response.json() + assert isinstance(response_json, list) + # Since DAnswer doesn't have an equivalent to run steps, we expect an empty list + assert len(response_json) == 0 diff --git a/backend/tests/integration/openai_assistants_api/test_threads.py b/backend/tests/integration/openai_assistants_api/test_threads.py new file mode 100644 index 00000000000..4ae128b2612 --- /dev/null +++ b/backend/tests/integration/openai_assistants_api/test_threads.py @@ -0,0 +1,132 @@ +from uuid import UUID + +import requests + +from 
danswer.db.models import ChatSessionSharedStatus +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS +from tests.integration.common_utils.test_models import DATestUser + +THREADS_URL = f"{API_SERVER_URL}/openai-assistants/threads" + + +def test_create_thread(admin_user: DATestUser | None) -> None: + response = requests.post( + THREADS_URL, + json={"messages": None, "metadata": {"key": "value"}}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + assert "id" in response_json + assert response_json["object"] == "thread" + assert "created_at" in response_json + assert response_json["metadata"] == {"key": "value"} + + +def test_retrieve_thread(admin_user: DATestUser | None) -> None: + # First, create a thread + create_response = requests.post( + THREADS_URL, + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + thread_id = create_response.json()["id"] + + # Now, retrieve the thread + retrieve_response = requests.get( + f"{THREADS_URL}/{thread_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert retrieve_response.status_code == 200 + + response_json = retrieve_response.json() + assert response_json["id"] == thread_id + assert response_json["object"] == "thread" + assert "created_at" in response_json + + +def test_modify_thread(admin_user: DATestUser | None) -> None: + # First, create a thread + create_response = requests.post( + THREADS_URL, + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + thread_id = create_response.json()["id"] + + # Now, modify the thread + modify_response = requests.post( + f"{THREADS_URL}/{thread_id}", + json={"metadata": {"new_key": "new_value"}}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert modify_response.status_code == 200 + + response_json = modify_response.json() + assert response_json["id"] == thread_id + assert response_json["metadata"] == {"new_key": "new_value"} + + +def test_delete_thread(admin_user: DATestUser | None) -> None: + # First, create a thread + create_response = requests.post( + THREADS_URL, + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert create_response.status_code == 200 + thread_id = create_response.json()["id"] + + # Now, delete the thread + delete_response = requests.delete( + f"{THREADS_URL}/{thread_id}", + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert delete_response.status_code == 200 + + response_json = delete_response.json() + assert response_json["id"] == thread_id + assert response_json["object"] == "thread.deleted" + assert response_json["deleted"] is True + + +def test_list_threads(admin_user: DATestUser | None) -> None: + # Create a few threads + for _ in range(3): + requests.post( + THREADS_URL, + json={}, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + + # Now, list the threads + list_response = requests.get( + THREADS_URL, + headers=admin_user.headers if admin_user else GENERAL_HEADERS, + ) + assert list_response.status_code == 200 + + response_json = list_response.json() + assert "sessions" in response_json + assert len(response_json["sessions"]) >= 3 + + for session in response_json["sessions"]: + assert "id" in session + assert "name" in session + 
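# the thread listing maps onto danswer chat sessions, hence the chat-session fields asserted below +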
assert "persona_id" in session + assert "time_created" in session + assert "shared_status" in session + assert "folder_id" in session + assert "current_alternate_model" in session + + # Validate UUID + UUID(session["id"]) + + # Validate shared_status + assert session["shared_status"] in [ + status.value for status in ChatSessionSharedStatus + ] diff --git a/backend/tests/integration/tests/api_key/test_api_key.py b/backend/tests/integration/tests/api_key/test_api_key.py new file mode 100644 index 00000000000..bd0618b962d --- /dev/null +++ b/backend/tests/integration/tests/api_key/test_api_key.py @@ -0,0 +1,42 @@ +import requests + +from danswer.auth.schemas import UserRole +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.managers.api_key import APIKeyManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestUser + + +def test_limited(reset: None) -> None: + """Verify that with a limited role key, limited endpoints are accessible and + others are not.""" + + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + + api_key: DATestAPIKey = APIKeyManager.create( + api_key_role=UserRole.LIMITED, + user_performing_action=admin_user, + ) + + # test limited endpoint + response = requests.get( + f"{API_SERVER_URL}/persona/0", + headers=api_key.headers, + ) + assert response.status_code == 200 + + # test basic endpoints + response = requests.get( + f"{API_SERVER_URL}/input_prompt", + headers=api_key.headers, + ) + assert response.status_code == 403 + + # test admin endpoints + response = requests.get( + f"{API_SERVER_URL}/admin/api-key", + headers=api_key.headers, + ) + assert response.status_code == 403 diff --git a/backend/tests/integration/tests/connector/test_connector_creation.py b/backend/tests/integration/tests/connector/test_connector_creation.py new file mode 100644 index 00000000000..acfafe9436d --- /dev/null +++ b/backend/tests/integration/tests/connector/test_connector_creation.py @@ -0,0 +1,84 @@ +import os +from datetime import datetime +from datetime import timezone + +from danswer.server.documents.models import DocumentSource +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestUser + + +def test_connector_creation(reset: None) -> None: + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + + # create connectors + cc_pair_1 = CCPairManager.create_from_scratch( + source=DocumentSource.INGESTION_API, + user_performing_action=admin_user, + ) + + cc_pair_info = CCPairManager.get_single( + cc_pair_1.id, user_performing_action=admin_user + ) + assert cc_pair_info + assert cc_pair_info.creator + assert str(cc_pair_info.creator) == admin_user.id + assert cc_pair_info.creator_email == admin_user.email + + +def test_overlapping_connector_creation(reset: None) -> None: + """Tests that connectors indexing the same documents don't interfere with each other. + A previous bug involved document by cc pair entries not being added for new connectors + when the docs existed already via another connector and were up to date relative to the source. 
+ """ + admin_user: DATestUser = UserManager.create(name="admin_user") + + config = { + "wiki_base": os.environ["CONFLUENCE_TEST_SPACE_URL"], + "space": "DailyConne", + "is_cloud": True, + "page_id": "", + } + + credential = { + "confluence_username": os.environ["CONFLUENCE_USER_NAME"], + "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"], + } + + # store the time before we create the connector so that we know after + # when the indexing should have started + now = datetime.now(timezone.utc) + + # create connector + cc_pair_1 = CCPairManager.create_from_scratch( + source=DocumentSource.CONFLUENCE, + connector_specific_config=config, + credential_json=credential, + user_performing_action=admin_user, + ) + + CCPairManager.wait_for_indexing( + cc_pair_1, now, timeout=120, user_performing_action=admin_user + ) + + now = datetime.now(timezone.utc) + + cc_pair_2 = CCPairManager.create_from_scratch( + source=DocumentSource.CONFLUENCE, + connector_specific_config=config, + credential_json=credential, + user_performing_action=admin_user, + ) + + CCPairManager.wait_for_indexing( + cc_pair_2, now, timeout=120, user_performing_action=admin_user + ) + + info_1 = CCPairManager.get_single(cc_pair_1.id, user_performing_action=admin_user) + assert info_1 + + info_2 = CCPairManager.get_single(cc_pair_2.id, user_performing_action=admin_user) + assert info_2 + + assert info_1.num_docs_indexed == info_2.num_docs_indexed diff --git a/backend/tests/integration/tests/connector/test_connector_deletion.py b/backend/tests/integration/tests/connector/test_connector_deletion.py index f0a83034b32..676ee4d9f4b 100644 --- a/backend/tests/integration/tests/connector/test_connector_deletion.py +++ b/backend/tests/integration/tests/connector/test_connector_deletion.py @@ -11,6 +11,7 @@ from danswer.db.engine import get_sqlalchemy_engine from danswer.db.enums import IndexingStatus +from danswer.db.index_attempt import create_index_attempt from danswer.db.index_attempt import create_index_attempt_error from danswer.db.models import IndexAttempt from danswer.db.search_settings import get_current_search_settings @@ -22,17 +23,17 @@ from tests.integration.common_utils.managers.document_set import DocumentSetManager from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager -from tests.integration.common_utils.test_models import TestAPIKey -from tests.integration.common_utils.test_models import TestUser -from tests.integration.common_utils.test_models import TestUserGroup -from tests.integration.common_utils.vespa import TestVespaClient +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.test_models import DATestUserGroup +from tests.integration.common_utils.vespa import vespa_fixture -def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: +def test_connector_deletion(reset: None, vespa_client: vespa_fixture) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") - # add api key to user - api_key: TestAPIKey = APIKeyManager.create( + admin_user: DATestUser = UserManager.create(name="admin_user") + # create api key + api_key: DATestAPIKey = APIKeyManager.create( user_performing_action=admin_user, ) @@ -47,12 +48,12 @@ def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: 
) # seed documents - cc_pair_1 = DocumentManager.seed_and_attach_docs( + cc_pair_1.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_1, num_docs=NUM_DOCS, api_key=api_key, ) - cc_pair_2 = DocumentManager.seed_and_attach_docs( + cc_pair_2.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_2, num_docs=NUM_DOCS, api_key=api_key, @@ -76,11 +77,11 @@ def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: print("Document sets created and synced") # create user groups - user_group_1: TestUserGroup = UserGroupManager.create( + user_group_1: DATestUserGroup = UserGroupManager.create( cc_pair_ids=[cc_pair_1.id], user_performing_action=admin_user, ) - user_group_2: TestUserGroup = UserGroupManager.create( + user_group_2: DATestUserGroup = UserGroupManager.create( cc_pair_ids=[cc_pair_1.id, cc_pair_2.id], user_performing_action=admin_user, ) @@ -117,6 +118,22 @@ def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: user_performing_action=admin_user, ) + # inject an index attempt and index attempt error (exercises foreign key errors) + with Session(get_sqlalchemy_engine()) as db_session: + attempt_id = create_index_attempt( + connector_credential_pair_id=cc_pair_1.id, + search_settings_id=1, + db_session=db_session, + ) + create_index_attempt_error( + index_attempt_id=attempt_id, + batch=1, + docs=[], + exception_msg="", + exception_traceback="", + db_session=db_session, + ) + # Update local records to match the database for later comparison user_group_1.cc_pair_ids = [] user_group_2.cc_pair_ids = [cc_pair_2.id] @@ -125,7 +142,9 @@ def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: cc_pair_1.groups = [] cc_pair_2.groups = [user_group_2.id] - CCPairManager.wait_for_deletion_completion(user_performing_action=admin_user) + CCPairManager.wait_for_deletion_completion( + cc_pair_id=cc_pair_1.id, user_performing_action=admin_user + ) # validate vespa documents DocumentManager.verify( @@ -174,15 +193,15 @@ def test_connector_deletion(reset: None, vespa_client: TestVespaClient) -> None: def test_connector_deletion_for_overlapping_connectors( - reset: None, vespa_client: TestVespaClient + reset: None, vespa_client: vespa_fixture ) -> None: """Checks to make sure that connectors with overlapping documents work properly. Specifically, that the overlapping document (1) still exists and (2) has the right document set / group post-deletion of one of the connectors. 
""" # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") - # add api key to user - api_key: TestAPIKey = APIKeyManager.create( + admin_user: DATestUser = UserManager.create(name="admin_user") + # create api key + api_key: DATestAPIKey = APIKeyManager.create( user_performing_action=admin_user, ) @@ -197,12 +216,12 @@ def test_connector_deletion_for_overlapping_connectors( ) doc_ids = [str(uuid4())] - cc_pair_1 = DocumentManager.seed_and_attach_docs( + cc_pair_1.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_1, document_ids=doc_ids, api_key=api_key, ) - cc_pair_2 = DocumentManager.seed_and_attach_docs( + cc_pair_2.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_2, document_ids=doc_ids, api_key=api_key, @@ -251,7 +270,7 @@ def test_connector_deletion_for_overlapping_connectors( ) # create a user group and attach it to connector 1 - user_group_1: TestUserGroup = UserGroupManager.create( + user_group_1: DATestUserGroup = UserGroupManager.create( name="Test User Group 1", cc_pair_ids=[cc_pair_1.id], user_performing_action=admin_user, @@ -265,7 +284,7 @@ def test_connector_deletion_for_overlapping_connectors( print("User group 1 created and synced") # create a user group and attach it to connector 2 - user_group_2: TestUserGroup = UserGroupManager.create( + user_group_2: DATestUserGroup = UserGroupManager.create( name="Test User Group 2", cc_pair_ids=[cc_pair_2.id], user_performing_action=admin_user, @@ -303,7 +322,9 @@ def test_connector_deletion_for_overlapping_connectors( ) # wait for deletion to finish - CCPairManager.wait_for_deletion_completion(user_performing_action=admin_user) + CCPairManager.wait_for_deletion_completion( + cc_pair_id=cc_pair_1.id, user_performing_action=admin_user + ) print("Connector 1 deleted") diff --git a/backend/tests/integration/tests/dev_apis/test_knowledge_chat.py b/backend/tests/integration/tests/dev_apis/test_knowledge_chat.py new file mode 100644 index 00000000000..475085c6777 --- /dev/null +++ b/backend/tests/integration/tests/dev_apis/test_knowledge_chat.py @@ -0,0 +1,191 @@ +import requests + +from danswer.configs.constants import MessageType +from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.managers.api_key import APIKeyManager +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.document import DocumentManager +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestUser + + +def test_all_stream_chat_message_objects_outputs(reset: None) -> None: + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + + # create connector + cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch( + user_performing_action=admin_user, + ) + api_key: DATestAPIKey = APIKeyManager.create( + user_performing_action=admin_user, + ) + LLMProviderManager.create(user_performing_action=admin_user) + + # SEEDING DOCUMENTS + cc_pair_1.documents = [] + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Pablo's favorite color is 
blue", + api_key=api_key, + ) + ) + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Chris's favorite color is red", + api_key=api_key, + ) + ) + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Pika's favorite color is green", + api_key=api_key, + ) + ) + + # TESTING RESPONSE FOR QUESTION 1 + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + "messages": [ + { + "message": "What is Pablo's favorite color?", + "role": MessageType.USER.value, + } + ], + "persona_id": 0, + "prompt_id": 0, + }, + headers=admin_user.headers, + ) + assert response.status_code == 200 + response_json = response.json() + + # check that the answer is correct + answer_1 = response_json["answer"] + assert "blue" in answer_1.lower() + + # FLAKY - check that the llm selected a document + # assert 0 in response_json["llm_selected_doc_indices"] + + # check that the final context documents are correct + # (it should contain all documents because there arent enough to exclude any) + assert 0 in response_json["final_context_doc_indices"] + assert 1 in response_json["final_context_doc_indices"] + assert 2 in response_json["final_context_doc_indices"] + + # FLAKY - check that the cited documents are correct + # assert cc_pair_1.documents[0].id in response_json["cited_documents"].values() + + # flakiness likely due to non-deterministic rephrasing + # FLAKY - check that the top documents are correct + # assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[0].id + print("response 1/3 passed") + + # TESTING RESPONSE FOR QUESTION 2 + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + "messages": [ + { + "message": "What is Pablo's favorite color?", + "role": MessageType.USER.value, + }, + { + "message": answer_1, + "role": MessageType.ASSISTANT.value, + }, + { + "message": "What is Chris's favorite color?", + "role": MessageType.USER.value, + }, + ], + "persona_id": 0, + "prompt_id": 0, + }, + headers=admin_user.headers, + ) + assert response.status_code == 200 + response_json = response.json() + + # check that the answer is correct + answer_2 = response_json["answer"] + assert "red" in answer_2.lower() + + # FLAKY - check that the llm selected a document + # assert 0 in response_json["llm_selected_doc_indices"] + + # check that the final context documents are correct + # (it should contain all documents because there arent enough to exclude any) + assert 0 in response_json["final_context_doc_indices"] + assert 1 in response_json["final_context_doc_indices"] + assert 2 in response_json["final_context_doc_indices"] + + # FLAKY - check that the cited documents are correct + # assert cc_pair_1.documents[1].id in response_json["cited_documents"].values() + + # flakiness likely due to non-deterministic rephrasing + # FLAKY - check that the top documents are correct + # assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[1].id + print("response 2/3 passed") + + # TESTING RESPONSE FOR QUESTION 3 + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + "messages": [ + { + "message": "What is Pablo's favorite color?", + "role": MessageType.USER.value, + }, + { + "message": answer_1, + "role": MessageType.ASSISTANT.value, + }, + { + "message": "What is Chris's favorite color?", + "role": MessageType.USER.value, + }, + { + "message": answer_2, + "role": 
MessageType.ASSISTANT.value, + }, + { + "message": "What is Pika's favorite color?", + "role": MessageType.USER.value, + }, + ], + "persona_id": 0, + "prompt_id": 0, + }, + headers=admin_user.headers, + ) + assert response.status_code == 200 + response_json = response.json() + + # check that the answer is correct + answer_3 = response_json["answer"] + assert "green" in answer_3.lower() + + # FLAKY - check that the llm selected a document + # assert 0 in response_json["llm_selected_doc_indices"] + + # check that the final context documents are correct + # (it should contain all documents because there arent enough to exclude any) + assert 0 in response_json["final_context_doc_indices"] + assert 1 in response_json["final_context_doc_indices"] + assert 2 in response_json["final_context_doc_indices"] + + # FLAKY - check that the cited documents are correct + # assert cc_pair_1.documents[2].id in response_json["cited_documents"].values() + + # flakiness likely due to non-deterministic rephrasing + # FLAKY - check that the top documents are correct + # assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[2].id + print("response 3/3 passed") diff --git a/backend/tests/integration/tests/dev_apis/test_simple_chat_api.py b/backend/tests/integration/tests/dev_apis/test_simple_chat_api.py index d4edcc583aa..0ed40c758d0 100644 --- a/backend/tests/integration/tests/dev_apis/test_simple_chat_api.py +++ b/backend/tests/integration/tests/dev_apis/test_simple_chat_api.py @@ -1,31 +1,34 @@ +import json + import requests from danswer.configs.constants import MessageType from tests.integration.common_utils.constants import API_SERVER_URL +from tests.integration.common_utils.constants import GENERAL_HEADERS from tests.integration.common_utils.constants import NUM_DOCS -from tests.integration.common_utils.llm import LLMProviderManager from tests.integration.common_utils.managers.api_key import APIKeyManager from tests.integration.common_utils.managers.cc_pair import CCPairManager from tests.integration.common_utils.managers.document import DocumentManager +from tests.integration.common_utils.managers.llm_provider import LLMProviderManager from tests.integration.common_utils.managers.user import UserManager -from tests.integration.common_utils.test_models import TestAPIKey -from tests.integration.common_utils.test_models import TestCCPair -from tests.integration.common_utils.test_models import TestUser +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestCCPair +from tests.integration.common_utils.test_models import DATestUser def test_send_message_simple_with_history(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") # create connectors - cc_pair_1: TestCCPair = CCPairManager.create_from_scratch( + cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch( user_performing_action=admin_user, ) - api_key: TestAPIKey = APIKeyManager.create( + api_key: DATestAPIKey = APIKeyManager.create( user_performing_action=admin_user, ) LLMProviderManager.create(user_performing_action=admin_user) - cc_pair_1 = DocumentManager.seed_and_attach_docs( + cc_pair_1.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_1, num_docs=NUM_DOCS, api_key=api_key, @@ -60,3 +63,174 @@ def test_send_message_simple_with_history(reset: None) -> None: ) assert 
found_doc assert found_doc["metadata"]["document_id"] == doc.id + + +def test_using_reference_docs_with_simple_with_history_api_flow(reset: None) -> None: + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + + # create connector + cc_pair_1: DATestCCPair = CCPairManager.create_from_scratch( + user_performing_action=admin_user, + ) + api_key: DATestAPIKey = APIKeyManager.create( + user_performing_action=admin_user, + ) + LLMProviderManager.create(user_performing_action=admin_user) + + # SEEDING DOCUMENTS + cc_pair_1.documents = [] + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Chris's favorite color is blue", + api_key=api_key, + ) + ) + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Hagen's favorite color is red", + api_key=api_key, + ) + ) + cc_pair_1.documents.append( + DocumentManager.seed_doc_with_content( + cc_pair=cc_pair_1, + content="Pablo's favorite color is green", + api_key=api_key, + ) + ) + + # SEINDING MESSAGE 1 + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + "messages": [ + { + "message": "What is Pablo's favorite color?", + "role": MessageType.USER.value, + } + ], + "persona_id": 0, + "prompt_id": 0, + }, + headers=admin_user.headers, + ) + assert response.status_code == 200 + response_json = response.json() + + # get the db_doc_id of the top document to use as a search doc id for second message + first_db_doc_id = response_json["top_documents"][0]["db_doc_id"] + + # SEINDING MESSAGE 2 + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + "messages": [ + { + "message": "What is Pablo's favorite color?", + "role": MessageType.USER.value, + } + ], + "persona_id": 0, + "prompt_id": 0, + "search_doc_ids": [first_db_doc_id], + }, + headers=admin_user.headers, + ) + assert response.status_code == 200 + response_json = response.json() + + # make sure there is an answer + assert response_json["answer"] + + # since we only gave it one search doc, all responses should only contain that doc + assert response_json["final_context_doc_indices"] == [0] + assert response_json["llm_selected_doc_indices"] == [0] + assert cc_pair_1.documents[2].id in response_json["cited_documents"].values() + # This ensures the the document we think we are referencing when we send the search_doc_ids in the second + # message is the document that we expect it to be + assert response_json["top_documents"][0]["document_id"] == cc_pair_1.documents[2].id + + +def test_send_message_simple_with_history_strict_json( + new_admin_user: DATestUser | None, +) -> None: + # create connectors + LLMProviderManager.create(user_performing_action=new_admin_user) + + response = requests.post( + f"{API_SERVER_URL}/chat/send-message-simple-with-history", + json={ + # intentionally not relevant prompt to ensure that the + # structured response format is actually used + "messages": [ + { + "message": "What is green?", + "role": MessageType.USER.value, + } + ], + "persona_id": 0, + "prompt_id": 0, + "structured_response_format": { + "type": "json_schema", + "json_schema": { + "name": "presidents", + "schema": { + "type": "object", + "properties": { + "presidents": { + "type": "array", + "items": {"type": "string"}, + "description": "List of the first three US presidents", + } + }, + "required": ["presidents"], + "additionalProperties": False, 
+ }, + "strict": True, + }, + }, + }, + headers=new_admin_user.headers if new_admin_user else GENERAL_HEADERS, + ) + assert response.status_code == 200 + + response_json = response.json() + + # Check that the answer is present + assert "answer" in response_json + assert response_json["answer"] is not None + + # helper + def clean_json_string(json_string: str) -> str: + return json_string.strip().removeprefix("```json").removesuffix("```").strip() + + # Attempt to parse the answer as JSON + try: + clean_answer = clean_json_string(response_json["answer"]) + parsed_answer = json.loads(clean_answer) + + # NOTE: do not check content, just the structure + assert isinstance(parsed_answer, dict) + assert "presidents" in parsed_answer + assert isinstance(parsed_answer["presidents"], list) + for president in parsed_answer["presidents"]: + assert isinstance(president, str) + except json.JSONDecodeError: + assert ( + False + ), f"The answer is not a valid JSON object - '{response_json['answer']}'" + + # Check that the answer_citationless is also valid JSON + assert "answer_citationless" in response_json + assert response_json["answer_citationless"] is not None + try: + clean_answer_citationless = clean_json_string( + response_json["answer_citationless"] + ) + parsed_answer_citationless = json.loads(clean_answer_citationless) + assert isinstance(parsed_answer_citationless, dict) + except json.JSONDecodeError: + assert False, "The answer_citationless is not a valid JSON object" diff --git a/backend/tests/integration/tests/document_set/test_syncing.py b/backend/tests/integration/tests/document_set/test_syncing.py index 217d106af4d..ed00870663a 100644 --- a/backend/tests/integration/tests/document_set/test_syncing.py +++ b/backend/tests/integration/tests/document_set/test_syncing.py @@ -5,19 +5,19 @@ from tests.integration.common_utils.managers.document import DocumentManager from tests.integration.common_utils.managers.document_set import DocumentSetManager from tests.integration.common_utils.managers.user import UserManager -from tests.integration.common_utils.test_models import TestAPIKey -from tests.integration.common_utils.test_models import TestUser -from tests.integration.common_utils.vespa import TestVespaClient +from tests.integration.common_utils.test_models import DATestAPIKey +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture def test_multiple_document_sets_syncing_same_connnector( - reset: None, vespa_client: TestVespaClient + reset: None, vespa_client: vespa_fixture ) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") - # add api key to user - api_key: TestAPIKey = APIKeyManager.create( + # create api key + api_key: DATestAPIKey = APIKeyManager.create( user_performing_action=admin_user, ) @@ -28,7 +28,7 @@ def test_multiple_document_sets_syncing_same_connnector( ) # seed documents - cc_pair_1 = DocumentManager.seed_and_attach_docs( + cc_pair_1.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_1, num_docs=NUM_DOCS, api_key=api_key, @@ -66,12 +66,12 @@ def test_multiple_document_sets_syncing_same_connnector( ) -def test_removing_connector(reset: None, vespa_client: TestVespaClient) -> None: +def test_removing_connector(reset: None, vespa_client: vespa_fixture) -> None: # Creating an admin user (first user created is automatically an admin) - 
admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") - # add api key to user - api_key: TestAPIKey = APIKeyManager.create( + # create api key + api_key: DATestAPIKey = APIKeyManager.create( user_performing_action=admin_user, ) @@ -86,13 +86,13 @@ def test_removing_connector(reset: None, vespa_client: TestVespaClient) -> None: ) # seed documents - cc_pair_1 = DocumentManager.seed_and_attach_docs( + cc_pair_1.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_1, num_docs=NUM_DOCS, api_key=api_key, ) - cc_pair_2 = DocumentManager.seed_and_attach_docs( + cc_pair_2.documents = DocumentManager.seed_dummy_docs( cc_pair=cc_pair_2, num_docs=NUM_DOCS, api_key=api_key, diff --git a/backend/tests/integration/tests/permissions/test_cc_pair_permissions.py b/backend/tests/integration/tests/permissions/test_cc_pair_permissions.py index c52c5826eae..19c2d7535a6 100644 --- a/backend/tests/integration/tests/permissions/test_cc_pair_permissions.py +++ b/backend/tests/integration/tests/permissions/test_cc_pair_permissions.py @@ -5,21 +5,22 @@ import pytest from requests.exceptions import HTTPError +from danswer.db.enums import AccessType from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.managers.cc_pair import CCPairManager from tests.integration.common_utils.managers.connector import ConnectorManager from tests.integration.common_utils.managers.credential import CredentialManager -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_cc_pair_permissions(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") # Creating a curator - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") # Creating a user group user_group_1 = UserGroupManager.create( @@ -49,12 +50,11 @@ def test_cc_pair_permissions(reset: None) -> None: user_groups_to_check=[user_group_1], user_performing_action=admin_user ) - # Create a credentials that the curator is and is not curator of connector_1 = ConnectorManager.create( - name="curator_owned_connector", + name="admin_owned_connector", source=DocumentSource.CONFLUENCE, groups=[user_group_1.id], - is_public=False, + access_type=AccessType.PRIVATE, user_performing_action=admin_user, ) # currently we dont enforce permissions at the connector level @@ -66,6 +66,7 @@ def test_cc_pair_permissions(reset: None) -> None: # is_public=False, # user_performing_action=admin_user, # ) + # Create a credentials that the curator is and is not curator of credential_1 = CredentialManager.create( name="curator_owned_credential", source=DocumentSource.CONFLUENCE, @@ -91,8 +92,8 @@ def test_cc_pair_permissions(reset: None) -> None: connector_id=connector_1.id, credential_id=credential_1.id, name="invalid_cc_pair_1", + access_type=AccessType.PUBLIC, groups=[user_group_1.id], - is_public=True, user_performing_action=curator, ) @@ -103,8 +104,8 @@ def test_cc_pair_permissions(reset: None) -> None: connector_id=connector_1.id, credential_id=credential_1.id, name="invalid_cc_pair_2", + access_type=AccessType.PRIVATE, 
groups=[user_group_1.id, user_group_2.id], - is_public=False, user_performing_action=curator, ) @@ -115,8 +116,8 @@ def test_cc_pair_permissions(reset: None) -> None: connector_id=connector_1.id, credential_id=credential_1.id, name="invalid_cc_pair_2", + access_type=AccessType.PRIVATE, groups=[], - is_public=False, user_performing_action=curator, ) @@ -129,8 +130,8 @@ def test_cc_pair_permissions(reset: None) -> None: # connector_id=connector_2.id, # credential_id=credential_1.id, # name="invalid_cc_pair_3", + # access_type=AccessType.PRIVATE, # groups=[user_group_1.id], - # is_public=False, # user_performing_action=curator, # ) @@ -141,8 +142,8 @@ def test_cc_pair_permissions(reset: None) -> None: connector_id=connector_1.id, credential_id=credential_2.id, name="invalid_cc_pair_4", + access_type=AccessType.PRIVATE, groups=[user_group_1.id], - is_public=False, user_performing_action=curator, ) @@ -154,8 +155,8 @@ def test_cc_pair_permissions(reset: None) -> None: name="valid_cc_pair", connector_id=connector_1.id, credential_id=credential_1.id, + access_type=AccessType.PRIVATE, groups=[user_group_1.id], - is_public=False, user_performing_action=curator, ) @@ -170,7 +171,9 @@ def test_cc_pair_permissions(reset: None) -> None: # Test deleting the cc pair CCPairManager.delete(valid_cc_pair, user_performing_action=curator) - CCPairManager.wait_for_deletion_completion(user_performing_action=curator) + CCPairManager.wait_for_deletion_completion( + cc_pair_id=valid_cc_pair.id, user_performing_action=curator + ) CCPairManager.verify( cc_pair=valid_cc_pair, diff --git a/backend/tests/integration/tests/permissions/test_connector_permissions.py b/backend/tests/integration/tests/permissions/test_connector_permissions.py index 279c0568bfb..9b4bc21f942 100644 --- a/backend/tests/integration/tests/permissions/test_connector_permissions.py +++ b/backend/tests/integration/tests/permissions/test_connector_permissions.py @@ -5,19 +5,20 @@ import pytest from requests.exceptions import HTTPError +from danswer.db.enums import AccessType from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.managers.connector import ConnectorManager -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_connector_permissions(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") # Creating a curator - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") # Creating a user group user_group_1 = UserGroupManager.create( @@ -57,7 +58,7 @@ def test_connector_permissions(reset: None) -> None: name="invalid_connector_1", source=DocumentSource.CONFLUENCE, groups=[user_group_1.id], - is_public=True, + access_type=AccessType.PUBLIC, user_performing_action=curator, ) @@ -68,7 +69,7 @@ def test_connector_permissions(reset: None) -> None: name="invalid_connector_2", source=DocumentSource.CONFLUENCE, groups=[user_group_1.id, user_group_2.id], - is_public=False, + access_type=AccessType.PRIVATE, user_performing_action=curator, ) @@ -80,7 +81,7 @@ def test_connector_permissions(reset: None) -> None: name="valid_connector", 
source=DocumentSource.CONFLUENCE, groups=[user_group_1.id], - is_public=False, + access_type=AccessType.PRIVATE, user_performing_action=curator, ) assert valid_connector.id is not None @@ -121,7 +122,7 @@ def test_connector_permissions(reset: None) -> None: name="invalid_connector_3", source=DocumentSource.CONFLUENCE, groups=[user_group_2.id], - is_public=False, + access_type=AccessType.PRIVATE, user_performing_action=curator, ) @@ -131,6 +132,6 @@ def test_connector_permissions(reset: None) -> None: name="invalid_connector_4", source=DocumentSource.CONFLUENCE, groups=[user_group_1.id], - is_public=True, + access_type=AccessType.PUBLIC, user_performing_action=curator, ) diff --git a/backend/tests/integration/tests/permissions/test_credential_permissions.py b/backend/tests/integration/tests/permissions/test_credential_permissions.py index 1311f1a3d2d..7433389feb2 100644 --- a/backend/tests/integration/tests/permissions/test_credential_permissions.py +++ b/backend/tests/integration/tests/permissions/test_credential_permissions.py @@ -7,17 +7,17 @@ from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.managers.credential import CredentialManager -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_credential_permissions(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") # Creating a curator - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") # Creating a user group user_group_1 = UserGroupManager.create( diff --git a/backend/tests/integration/tests/permissions/test_doc_set_permissions.py b/backend/tests/integration/tests/permissions/test_doc_set_permissions.py index 412b5d41fad..e352d5eb303 100644 --- a/backend/tests/integration/tests/permissions/test_doc_set_permissions.py +++ b/backend/tests/integration/tests/permissions/test_doc_set_permissions.py @@ -1,20 +1,21 @@ import pytest from requests.exceptions import HTTPError +from danswer.db.enums import AccessType from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.managers.cc_pair import CCPairManager from tests.integration.common_utils.managers.document_set import DocumentSetManager -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_doc_set_permissions_setup(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") # Creating a second user (curator) - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") # Creating the first user group user_group_1 = UserGroupManager.create( @@ -47,14 +48,14 @@ def test_doc_set_permissions_setup(reset: None) -> None: # Admin creates a cc_pair private_cc_pair = CCPairManager.create_from_scratch( - 
is_public=False, + access_type=AccessType.PRIVATE, source=DocumentSource.INGESTION_API, user_performing_action=admin_user, ) # Admin creates a public cc_pair public_cc_pair = CCPairManager.create_from_scratch( - is_public=True, + access_type=AccessType.PUBLIC, source=DocumentSource.INGESTION_API, user_performing_action=admin_user, ) diff --git a/backend/tests/integration/tests/permissions/test_user_role_permissions.py b/backend/tests/integration/tests/permissions/test_user_role_permissions.py index 5da91a57af8..5be49d25c5e 100644 --- a/backend/tests/integration/tests/permissions/test_user_role_permissions.py +++ b/backend/tests/integration/tests/permissions/test_user_role_permissions.py @@ -5,22 +5,22 @@ from requests.exceptions import HTTPError from danswer.db.models import UserRole -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_user_role_setting_permissions(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: DATestUser = UserManager.create(name="admin_user") assert UserManager.verify_role(admin_user, UserRole.ADMIN) # Creating a basic user - basic_user: TestUser = UserManager.create(name="basic_user") + basic_user: DATestUser = UserManager.create(name="basic_user") assert UserManager.verify_role(basic_user, UserRole.BASIC) # Creating a curator - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") assert UserManager.verify_role(curator, UserRole.BASIC) # Creating a curator without adding to a group should not work @@ -31,7 +31,7 @@ def test_user_role_setting_permissions(reset: None) -> None: user_to_perform_action=admin_user, ) - global_curator: TestUser = UserManager.create(name="global_curator") + global_curator: DATestUser = UserManager.create(name="global_curator") assert UserManager.verify_role(global_curator, UserRole.BASIC) # Setting the role of a global curator should not work for a basic user diff --git a/backend/tests/integration/tests/permissions/test_whole_curator_flow.py b/backend/tests/integration/tests/permissions/test_whole_curator_flow.py index 878ba1e17e8..53c1a0b8a94 100644 --- a/backend/tests/integration/tests/permissions/test_whole_curator_flow.py +++ b/backend/tests/integration/tests/permissions/test_whole_curator_flow.py @@ -1,23 +1,24 @@ """ This test tests the happy path for curator permissions """ +from danswer.db.enums import AccessType from danswer.db.models import UserRole from danswer.server.documents.models import DocumentSource from tests.integration.common_utils.managers.cc_pair import CCPairManager from tests.integration.common_utils.managers.connector import ConnectorManager from tests.integration.common_utils.managers.credential import CredentialManager -from tests.integration.common_utils.managers.user import TestUser +from tests.integration.common_utils.managers.user import DATestUser from tests.integration.common_utils.managers.user import UserManager from tests.integration.common_utils.managers.user_group import UserGroupManager def test_whole_curator_flow(reset: None) -> None: # Creating an admin user (first user created is automatically an admin) - admin_user: TestUser = UserManager.create(name="admin_user") + admin_user: 
DATestUser = UserManager.create(name="admin_user") assert UserManager.verify_role(admin_user, UserRole.ADMIN) # Creating a curator - curator: TestUser = UserManager.create(name="curator") + curator: DATestUser = UserManager.create(name="curator") # Creating a user group user_group_1 = UserGroupManager.create( @@ -50,7 +51,7 @@ def test_whole_curator_flow(reset: None) -> None: test_connector = ConnectorManager.create( name="curator_test_connector", source=DocumentSource.FILE, - is_public=False, + access_type=AccessType.PRIVATE, groups=[user_group_1.id], user_performing_action=curator, ) @@ -64,8 +65,8 @@ def test_whole_curator_flow(reset: None) -> None: connector_id=test_connector.id, credential_id=test_credential.id, name="curator_test_cc_pair", + access_type=AccessType.PRIVATE, groups=[user_group_1.id], - is_public=False, user_performing_action=curator, ) @@ -76,7 +77,92 @@ def test_whole_curator_flow(reset: None) -> None: # Verify that the curator can delete the CC pair CCPairManager.delete(cc_pair=test_cc_pair, user_performing_action=curator) - CCPairManager.wait_for_deletion_completion(user_performing_action=curator) + CCPairManager.wait_for_deletion_completion( + cc_pair_id=test_cc_pair.id, user_performing_action=curator + ) + + # Verify that the CC pair has been deleted + CCPairManager.verify( + cc_pair=test_cc_pair, + verify_deleted=True, + user_performing_action=admin_user, + ) + + +def test_global_curator_flow(reset: None) -> None: + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + assert UserManager.verify_role(admin_user, UserRole.ADMIN) + + # Creating a user + global_curator: DATestUser = UserManager.create(name="global_curator") + assert UserManager.verify_role(global_curator, UserRole.BASIC) + + # Set the user to a global curator + UserManager.set_role( + user_to_set=global_curator, + target_role=UserRole.GLOBAL_CURATOR, + user_to_perform_action=admin_user, + ) + assert UserManager.verify_role(global_curator, UserRole.GLOBAL_CURATOR) + + # Creating a user group containing the global curator + user_group_1 = UserGroupManager.create( + name="user_group_1", + user_ids=[global_curator.id], + cc_pair_ids=[], + user_performing_action=admin_user, + ) + UserGroupManager.wait_for_sync( + user_groups_to_check=[user_group_1], user_performing_action=admin_user + ) + + # Creating a credential as global curator + test_credential = CredentialManager.create( + name="curator_test_credential", + source=DocumentSource.FILE, + curator_public=False, + groups=[user_group_1.id], + user_performing_action=global_curator, + ) + + # Creating a connector as global curator + test_connector = ConnectorManager.create( + name="curator_test_connector", + source=DocumentSource.FILE, + access_type=AccessType.PRIVATE, + groups=[user_group_1.id], + user_performing_action=global_curator, + ) + + # Test editing the connector + test_connector.name = "updated_test_connector" + ConnectorManager.edit( + connector=test_connector, user_performing_action=global_curator + ) + + # Creating a CC pair as global curator + test_cc_pair = CCPairManager.create( + connector_id=test_connector.id, + credential_id=test_credential.id, + name="curator_test_cc_pair", + access_type=AccessType.PRIVATE, + groups=[user_group_1.id], + user_performing_action=global_curator, + ) + + CCPairManager.verify(cc_pair=test_cc_pair, user_performing_action=admin_user) + + # Verify that the curator can pause and unpause the CC pair + CCPairManager.pause_cc_pair( + 
cc_pair=test_cc_pair, user_performing_action=global_curator + ) + + # Verify that the curator can delete the CC pair + CCPairManager.delete(cc_pair=test_cc_pair, user_performing_action=global_curator) + CCPairManager.wait_for_deletion_completion( + cc_pair_id=test_cc_pair.id, user_performing_action=global_curator + ) # Verify that the CC pair has been deleted CCPairManager.verify( diff --git a/backend/tests/integration/tests/personas/test_persona_categories.py b/backend/tests/integration/tests/personas/test_persona_categories.py new file mode 100644 index 00000000000..fdd0e645814 --- /dev/null +++ b/backend/tests/integration/tests/personas/test_persona_categories.py @@ -0,0 +1,92 @@ +from uuid import uuid4 + +import pytest +from requests.exceptions import HTTPError + +from tests.integration.common_utils.managers.persona import ( + PersonaCategoryManager, +) +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestPersonaCategory +from tests.integration.common_utils.test_models import DATestUser + + +def test_persona_category_management(reset: None) -> None: + admin_user: DATestUser = UserManager.create(name="admin_user") + + persona_category = DATestPersonaCategory( + id=None, + name=f"Test Category {uuid4()}", + description="A description for test category", + ) + persona_category = PersonaCategoryManager.create( + category=persona_category, + user_performing_action=admin_user, + ) + print( + f"Created persona category {persona_category.name} with id {persona_category.id}" + ) + + assert PersonaCategoryManager.verify( + category=persona_category, + user_performing_action=admin_user, + ), "Persona category was not found after creation" + + regular_user: DATestUser = UserManager.create(name="regular_user") + + updated_persona_category = DATestPersonaCategory( + id=persona_category.id, + name=f"Updated {persona_category.name}", + description="An updated description", + ) + with pytest.raises(HTTPError) as exc_info: + PersonaCategoryManager.update( + category=updated_persona_category, + user_performing_action=regular_user, + ) + assert exc_info.value.response.status_code == 403 + + assert PersonaCategoryManager.verify( + category=persona_category, + user_performing_action=admin_user, + ), "Persona category should not have been updated by non-admin user" + + result = PersonaCategoryManager.delete( + category=persona_category, + user_performing_action=regular_user, + ) + assert ( + result is False + ), "Regular user should not be able to delete the persona category" + + assert PersonaCategoryManager.verify( + category=persona_category, + user_performing_action=admin_user, + ), "Persona category should not have been deleted by non-admin user" + + updated_persona_category.name = f"Updated {persona_category.name}" + updated_persona_category.description = "An updated description" + updated_persona_category = PersonaCategoryManager.update( + category=updated_persona_category, + user_performing_action=admin_user, + ) + print(f"Updated persona category to {updated_persona_category.name}") + + assert PersonaCategoryManager.verify( + category=updated_persona_category, + user_performing_action=admin_user, + ), "Persona category was not updated by admin" + + success = PersonaCategoryManager.delete( + category=persona_category, + user_performing_action=admin_user, + ) + assert success, "Admin user should be able to delete the persona category" + print( + f"Deleted persona category {persona_category.name} with id 
{persona_category.id}" + ) + + assert not PersonaCategoryManager.verify( + category=persona_category, + user_performing_action=admin_user, + ), "Persona category should not exist after deletion by admin" diff --git a/backend/tests/integration/tests/playwright/test_playwright.py b/backend/tests/integration/tests/playwright/test_playwright.py new file mode 100644 index 00000000000..63c4dda7910 --- /dev/null +++ b/backend/tests/integration/tests/playwright/test_playwright.py @@ -0,0 +1,18 @@ +import os + +import pytest + +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestUser + + +@pytest.mark.skipif( + os.getenv("PYTEST_IGNORE_SKIP") is None, + reason="Skipped by default unless env var exists", +) +def test_playwright_setup(reset: None) -> None: + """Not really a test, just using this to automate setup for playwright tests.""" + + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + assert admin_user diff --git a/backend/tests/integration/tests/pruning/test_pruning.py b/backend/tests/integration/tests/pruning/test_pruning.py new file mode 100644 index 00000000000..9d9a41c7069 --- /dev/null +++ b/backend/tests/integration/tests/pruning/test_pruning.py @@ -0,0 +1,186 @@ +import http.server +import os +import shutil +import tempfile +import threading +from collections.abc import Generator +from contextlib import contextmanager +from datetime import datetime +from datetime import timezone +from time import sleep +from typing import Any + +import uvicorn +from fastapi import FastAPI +from fastapi.staticfiles import StaticFiles + +from danswer.server.documents.models import DocumentSource +from danswer.utils.logger import setup_logger +from tests.integration.common_utils.managers.api_key import APIKeyManager +from tests.integration.common_utils.managers.cc_pair import CCPairManager +from tests.integration.common_utils.managers.user import UserManager +from tests.integration.common_utils.test_models import DATestUser +from tests.integration.common_utils.vespa import vespa_fixture + +logger = setup_logger() + + +# FastAPI server for serving files +def create_fastapi_app(directory: str) -> FastAPI: + app = FastAPI() + + # Mount the directory to serve static files + app.mount("/", StaticFiles(directory=directory, html=True), name="static") + + return app + + +# as far as we know, this doesn't hang when crawled. This is good. 
+@contextmanager +def fastapi_server_context( + directory: str, port: int = 8000 +) -> Generator[None, None, None]: + app = create_fastapi_app(directory) + + config = uvicorn.Config(app=app, host="0.0.0.0", port=port, log_level="info") + server = uvicorn.Server(config) + + # Create a thread to run the FastAPI server + server_thread = threading.Thread(target=server.run) + server_thread.daemon = ( + True # Ensures the thread will exit when the main program exits + ) + + try: + # Start the server in the background + server_thread.start() + sleep(5) # Give it a few seconds to start + yield # Yield control back to the calling function (context manager in use) + finally: + # Shutdown the server + server.should_exit = True + server_thread.join() + + +# Leaving this here for posterity and experimentation, but the reason we're +# not using this is python's web servers hang frequently when crawled +# this is obviously not good for a unit test +@contextmanager +def http_server_context( + directory: str, port: int = 8000 +) -> Generator[http.server.ThreadingHTTPServer, None, None]: + # Create a handler that serves files from the specified directory + def handler_class( + *args: Any, **kwargs: Any + ) -> http.server.SimpleHTTPRequestHandler: + return http.server.SimpleHTTPRequestHandler( + *args, directory=directory, **kwargs + ) + + # Create an HTTPServer instance + httpd = http.server.ThreadingHTTPServer(("0.0.0.0", port), handler_class) + + # Define a thread that runs the server in the background + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = ( + True # Ensures the thread will exit when the main program exits + ) + + try: + # Start the server in the background + server_thread.start() + sleep(5) # give it a few seconds to start + yield httpd + finally: + # Shutdown the server and wait for the thread to finish + httpd.shutdown() + httpd.server_close() + server_thread.join() + + +def test_web_pruning(reset: None, vespa_client: vespa_fixture) -> None: + # Creating an admin user (first user created is automatically an admin) + admin_user: DATestUser = UserManager.create(name="admin_user") + + # add api key to user + APIKeyManager.create( + user_performing_action=admin_user, + ) + + test_filename = os.path.realpath(__file__) + test_directory = os.path.dirname(test_filename) + with tempfile.TemporaryDirectory() as temp_dir: + port = 8889 + + website_src = os.path.join(test_directory, "website") + website_tgt = os.path.join(temp_dir, "website") + shutil.copytree(website_src, website_tgt) + with fastapi_server_context(os.path.join(temp_dir, "website"), port): + sleep(1) # sleep a tiny bit before starting everything + + hostname = os.getenv("TEST_WEB_HOSTNAME", "localhost") + config = { + "base_url": f"http://{hostname}:{port}/", + "web_connector_type": "recursive", + } + + # store the time before we create the connector so that we know after + # when the indexing should have started + now = datetime.now(timezone.utc) + + # create connector + cc_pair_1 = CCPairManager.create_from_scratch( + source=DocumentSource.WEB, + connector_specific_config=config, + user_performing_action=admin_user, + ) + + CCPairManager.wait_for_indexing( + cc_pair_1, now, timeout=60, user_performing_action=admin_user + ) + + selected_cc_pair = CCPairManager.get_indexing_status_by_id( + cc_pair_1.id, user_performing_action=admin_user + ) + assert selected_cc_pair is not None, "cc_pair not found after indexing!" 
+ assert selected_cc_pair.docs_indexed == 15 + + logger.info("Removing about.html.") + os.remove(os.path.join(website_tgt, "about.html")) + logger.info("Removing courses.html.") + os.remove(os.path.join(website_tgt, "courses.html")) + + now = datetime.now(timezone.utc) + CCPairManager.prune(cc_pair_1, user_performing_action=admin_user) + CCPairManager.wait_for_prune( + cc_pair_1, now, timeout=60, user_performing_action=admin_user + ) + + selected_cc_pair = CCPairManager.get_indexing_status_by_id( + cc_pair_1.id, user_performing_action=admin_user + ) + assert selected_cc_pair is not None, "cc_pair not found after pruning!" + assert selected_cc_pair.docs_indexed == 13 + + # check vespa + index_id = f"http://{hostname}:{port}/index.html" + about_id = f"http://{hostname}:{port}/about.html" + courses_id = f"http://{hostname}:{port}/courses.html" + + doc_ids = [index_id, about_id, courses_id] + retrieved_docs_dict = vespa_client.get_documents_by_id(doc_ids)["documents"] + retrieved_docs = { + doc["fields"]["document_id"]: doc["fields"] + for doc in retrieved_docs_dict + } + + # verify index.html exists in Vespa + retrieved_doc = retrieved_docs.get(index_id) + assert retrieved_doc + + # verify about and courses do not exist + retrieved_doc = retrieved_docs.get(about_id) + assert not retrieved_doc + + retrieved_doc = retrieved_docs.get(courses_id) + assert not retrieved_doc diff --git a/backend/tests/integration/tests/pruning/website/about.html b/backend/tests/integration/tests/pruning/website/about.html new file mode 100644 index 00000000000..ea7fee823cd --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/about.html @@ -0,0 +1,523 @@ + + + + + Above Multi-purpose Free Bootstrap Responsive Template + + + + + + + + + + + + + + +
+ [about.html body: "About Us" page from the free "Above" Bootstrap demo template used as a pruning fixture — intro copy with a "Read more" link, "Why Choose Us?", "Our Solution" tabs, "Our Expertise" progress bars (Web Development, Designing, User Experience, Development), and an "Our Team" grid (Johne Doe, Jennifer, Christean, Kerinele rase); all Lorem ipsum filler, markup omitted]
+ + + + + + + + + + + + + + + diff --git a/backend/tests/integration/tests/pruning/website/contact.html b/backend/tests/integration/tests/pruning/website/contact.html new file mode 100644 index 00000000000..dbe3225456d --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/contact.html @@ -0,0 +1,357 @@ + + + + + Above Multi-purpose Free Bootstrap Responsive Template + + + + + + + + + + + + + + +
+ [contact.html body: "Contact Us" page from the same template — a contact form plus a "Contact info" block with placeholder company address, telephone, fax, and e-mail; Lorem ipsum filler, markup omitted]
+ + + + + + + + + + + + + + + + diff --git a/backend/tests/integration/tests/pruning/website/courses.html b/backend/tests/integration/tests/pruning/website/courses.html new file mode 100644 index 00000000000..a813720fd28 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/courses.html @@ -0,0 +1,218 @@ + + + + +Above Multi-purpose Free Bootstrap Responsive Template + + + + + + + + + + + + + + + +
+ [courses.html body: "Courses" page from the same template — a "Courses We Offer" intro, six identical "Heading Course" cards, and "Web Development" / "Mobile Development" / "Responsive Design" blurbs; Lorem ipsum filler, markup omitted]
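The three fixture pages above, together with index.html and the template's assets, make up the static site that test_pruning.py serves locally and crawls through the web connector; about.html and courses.html are the two pages it deletes before pruning, which is why docs_indexed drops from 15 to 13. As a minimal standalone sketch (not part of the diff), the fastapi_server_context helper defined in test_pruning.py can serve the same directory outside the test; the import path, directory, and port below are assumptions, with backend/ taken to be on PYTHONPATH:

# Standalone sketch (not part of the diff): serve the pruning fixture site with
# the fastapi_server_context helper from test_pruning.py and fetch one page.
# The module path, directory, and port are illustrative assumptions.
import requests

from tests.integration.tests.pruning.test_pruning import fastapi_server_context

FIXTURE_DIR = "tests/integration/tests/pruning/website"  # assumed, relative to backend/
PORT = 8889

with fastapi_server_context(FIXTURE_DIR, port=PORT):
    # about.html is one of the pages the pruning test later removes.
    resp = requests.get(f"http://localhost:{PORT}/about.html")
    print(resp.status_code, len(resp.text))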
+ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/backend/tests/integration/tests/pruning/website/css/animate.css b/backend/tests/integration/tests/pruning/website/css/animate.css new file mode 100644 index 00000000000..92a68838f4e --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/animate.css @@ -0,0 +1,3880 @@ +@charset "UTF-8"; +/* +Animate.css - http://daneden.me/animate +Licensed under the MIT license + +Copyright (c) 2013 Daniel Eden + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ +body { + /* Addresses a small issue in webkit: http://bit.ly/NEdoDq */ + -webkit-backface-visibility: hidden; +} +.animated { + -webkit-animation-duration: 1s; + -moz-animation-duration: 1s; + -o-animation-duration: 1s; + animation-duration: 1s; + -webkit-animation-fill-mode: both; + -moz-animation-fill-mode: both; + -o-animation-fill-mode: both; + animation-fill-mode: both; +} + +.animated.hinge { + -webkit-animation-duration: 2s; + -moz-animation-duration: 2s; + -o-animation-duration: 2s; + animation-duration: 2s; +} + +@-webkit-keyframes flash { + 0%, + 50%, + 100% { + opacity: 1; + } + 25%, + 75% { + opacity: 0; + } +} + +@-moz-keyframes flash { + 0%, + 50%, + 100% { + opacity: 1; + } + 25%, + 75% { + opacity: 0; + } +} + +@-o-keyframes flash { + 0%, + 50%, + 100% { + opacity: 1; + } + 25%, + 75% { + opacity: 0; + } +} + +@keyframes flash { + 0%, + 50%, + 100% { + opacity: 1; + } + 25%, + 75% { + opacity: 0; + } +} + +.flash { + -webkit-animation-name: flash; + -moz-animation-name: flash; + -o-animation-name: flash; + animation-name: flash; +} +@-webkit-keyframes shake { + 0%, + 100% { + -webkit-transform: translateX(0); + } + 10%, + 30%, + 50%, + 70%, + 90% { + -webkit-transform: translateX(-10px); + } + 20%, + 40%, + 60%, + 80% { + -webkit-transform: translateX(10px); + } +} + +@-moz-keyframes shake { + 0%, + 100% { + -moz-transform: translateX(0); + } + 10%, + 30%, + 50%, + 70%, + 90% { + -moz-transform: translateX(-10px); + } + 20%, + 40%, + 60%, + 80% { + -moz-transform: translateX(10px); + } +} + +@-o-keyframes shake { + 0%, + 100% { + -o-transform: translateX(0); + } + 10%, + 30%, + 50%, + 70%, + 90% { + -o-transform: translateX(-10px); + } + 20%, + 40%, + 60%, + 80% { + -o-transform: translateX(10px); + } +} + +@keyframes shake { + 0%, + 100% { + transform: translateX(0); + } + 10%, + 30%, + 50%, + 70%, + 90% { + transform: translateX(-10px); + } + 20%, + 40%, + 60%, + 80% { + transform: translateX(10px); + } +} + +.shake { + -webkit-animation-name: shake; + -moz-animation-name: shake; 
+ -o-animation-name: shake; + animation-name: shake; +} +@-webkit-keyframes bounce { + 0%, + 20%, + 50%, + 80%, + 100% { + -webkit-transform: translateY(0); + } + 40% { + -webkit-transform: translateY(-30px); + } + 60% { + -webkit-transform: translateY(-15px); + } +} + +@-moz-keyframes bounce { + 0%, + 20%, + 50%, + 80%, + 100% { + -moz-transform: translateY(0); + } + 40% { + -moz-transform: translateY(-30px); + } + 60% { + -moz-transform: translateY(-15px); + } +} + +@-o-keyframes bounce { + 0%, + 20%, + 50%, + 80%, + 100% { + -o-transform: translateY(0); + } + 40% { + -o-transform: translateY(-30px); + } + 60% { + -o-transform: translateY(-15px); + } +} +@keyframes bounce { + 0%, + 20%, + 50%, + 80%, + 100% { + transform: translateY(0); + } + 40% { + transform: translateY(-30px); + } + 60% { + transform: translateY(-15px); + } +} + +.bounce { + -webkit-animation-name: bounce; + -moz-animation-name: bounce; + -o-animation-name: bounce; + animation-name: bounce; +} +@-webkit-keyframes tada { + 0% { + -webkit-transform: scale(1); + } + 10%, + 20% { + -webkit-transform: scale(0.9) rotate(-3deg); + } + 30%, + 50%, + 70%, + 90% { + -webkit-transform: scale(1.1) rotate(3deg); + } + 40%, + 60%, + 80% { + -webkit-transform: scale(1.1) rotate(-3deg); + } + 100% { + -webkit-transform: scale(1) rotate(0); + } +} + +@-moz-keyframes tada { + 0% { + -moz-transform: scale(1); + } + 10%, + 20% { + -moz-transform: scale(0.9) rotate(-3deg); + } + 30%, + 50%, + 70%, + 90% { + -moz-transform: scale(1.1) rotate(3deg); + } + 40%, + 60%, + 80% { + -moz-transform: scale(1.1) rotate(-3deg); + } + 100% { + -moz-transform: scale(1) rotate(0); + } +} + +@-o-keyframes tada { + 0% { + -o-transform: scale(1); + } + 10%, + 20% { + -o-transform: scale(0.9) rotate(-3deg); + } + 30%, + 50%, + 70%, + 90% { + -o-transform: scale(1.1) rotate(3deg); + } + 40%, + 60%, + 80% { + -o-transform: scale(1.1) rotate(-3deg); + } + 100% { + -o-transform: scale(1) rotate(0); + } +} + +@keyframes tada { + 0% { + transform: scale(1); + } + 10%, + 20% { + transform: scale(0.9) rotate(-3deg); + } + 30%, + 50%, + 70%, + 90% { + transform: scale(1.1) rotate(3deg); + } + 40%, + 60%, + 80% { + transform: scale(1.1) rotate(-3deg); + } + 100% { + transform: scale(1) rotate(0); + } +} + +.tada { + -webkit-animation-name: tada; + -moz-animation-name: tada; + -o-animation-name: tada; + animation-name: tada; +} +@-webkit-keyframes swing { + 20%, + 40%, + 60%, + 80%, + 100% { + -webkit-transform-origin: top center; + } + 20% { + -webkit-transform: rotate(15deg); + } + 40% { + -webkit-transform: rotate(-10deg); + } + 60% { + -webkit-transform: rotate(5deg); + } + 80% { + -webkit-transform: rotate(-5deg); + } + 100% { + -webkit-transform: rotate(0deg); + } +} + +@-moz-keyframes swing { + 20% { + -moz-transform: rotate(15deg); + } + 40% { + -moz-transform: rotate(-10deg); + } + 60% { + -moz-transform: rotate(5deg); + } + 80% { + -moz-transform: rotate(-5deg); + } + 100% { + -moz-transform: rotate(0deg); + } +} + +@-o-keyframes swing { + 20% { + -o-transform: rotate(15deg); + } + 40% { + -o-transform: rotate(-10deg); + } + 60% { + -o-transform: rotate(5deg); + } + 80% { + -o-transform: rotate(-5deg); + } + 100% { + -o-transform: rotate(0deg); + } +} + +@keyframes swing { + 20% { + transform: rotate(15deg); + } + 40% { + transform: rotate(-10deg); + } + 60% { + transform: rotate(5deg); + } + 80% { + transform: rotate(-5deg); + } + 100% { + transform: rotate(0deg); + } +} + +.swing { + -webkit-transform-origin: top center; + -moz-transform-origin: top 
center; + -o-transform-origin: top center; + transform-origin: top center; + -webkit-animation-name: swing; + -moz-animation-name: swing; + -o-animation-name: swing; + animation-name: swing; +} +/* originally authored by Nick Pettit - https://github.com/nickpettit/glide */ + +@-webkit-keyframes wobble { + 0% { + -webkit-transform: translateX(0%); + } + 15% { + -webkit-transform: translateX(-25%) rotate(-5deg); + } + 30% { + -webkit-transform: translateX(20%) rotate(3deg); + } + 45% { + -webkit-transform: translateX(-15%) rotate(-3deg); + } + 60% { + -webkit-transform: translateX(10%) rotate(2deg); + } + 75% { + -webkit-transform: translateX(-5%) rotate(-1deg); + } + 100% { + -webkit-transform: translateX(0%); + } +} + +@-moz-keyframes wobble { + 0% { + -moz-transform: translateX(0%); + } + 15% { + -moz-transform: translateX(-25%) rotate(-5deg); + } + 30% { + -moz-transform: translateX(20%) rotate(3deg); + } + 45% { + -moz-transform: translateX(-15%) rotate(-3deg); + } + 60% { + -moz-transform: translateX(10%) rotate(2deg); + } + 75% { + -moz-transform: translateX(-5%) rotate(-1deg); + } + 100% { + -moz-transform: translateX(0%); + } +} + +@-o-keyframes wobble { + 0% { + -o-transform: translateX(0%); + } + 15% { + -o-transform: translateX(-25%) rotate(-5deg); + } + 30% { + -o-transform: translateX(20%) rotate(3deg); + } + 45% { + -o-transform: translateX(-15%) rotate(-3deg); + } + 60% { + -o-transform: translateX(10%) rotate(2deg); + } + 75% { + -o-transform: translateX(-5%) rotate(-1deg); + } + 100% { + -o-transform: translateX(0%); + } +} + +@keyframes wobble { + 0% { + transform: translateX(0%); + } + 15% { + transform: translateX(-25%) rotate(-5deg); + } + 30% { + transform: translateX(20%) rotate(3deg); + } + 45% { + transform: translateX(-15%) rotate(-3deg); + } + 60% { + transform: translateX(10%) rotate(2deg); + } + 75% { + transform: translateX(-5%) rotate(-1deg); + } + 100% { + transform: translateX(0%); + } +} + +.wobble { + -webkit-animation-name: wobble; + -moz-animation-name: wobble; + -o-animation-name: wobble; + animation-name: wobble; +} +/* originally authored by Nick Pettit - https://github.com/nickpettit/glide */ + +@-webkit-keyframes pulse { + 0% { + -webkit-transform: scale(1); + } + 50% { + -webkit-transform: scale(1.1); + } + 100% { + -webkit-transform: scale(1); + } +} +@-moz-keyframes pulse { + 0% { + -moz-transform: scale(1); + } + 50% { + -moz-transform: scale(1.1); + } + 100% { + -moz-transform: scale(1); + } +} +@-o-keyframes pulse { + 0% { + -o-transform: scale(1); + } + 50% { + -o-transform: scale(1.1); + } + 100% { + -o-transform: scale(1); + } +} +@keyframes pulse { + 0% { + transform: scale(1); + } + 50% { + transform: scale(1.1); + } + 100% { + transform: scale(1); + } +} + +.pulse { + -webkit-animation-name: pulse; + -moz-animation-name: pulse; + -o-animation-name: pulse; + animation-name: pulse; +} +@-webkit-keyframes flip { + 0% { + -webkit-transform: perspective(400px) rotateY(0); + -webkit-animation-timing-function: ease-out; + } + 40% { + -webkit-transform: perspective(400px) translateZ(150px) rotateY(170deg); + -webkit-animation-timing-function: ease-out; + } + 50% { + -webkit-transform: perspective(400px) translateZ(150px) rotateY(190deg) + scale(1); + -webkit-animation-timing-function: ease-in; + } + 80% { + -webkit-transform: perspective(400px) rotateY(360deg) scale(0.95); + -webkit-animation-timing-function: ease-in; + } + 100% { + -webkit-transform: perspective(400px) scale(1); + -webkit-animation-timing-function: ease-in; + } +} 
+@-moz-keyframes flip { + 0% { + -moz-transform: perspective(400px) rotateY(0); + -moz-animation-timing-function: ease-out; + } + 40% { + -moz-transform: perspective(400px) translateZ(150px) rotateY(170deg); + -moz-animation-timing-function: ease-out; + } + 50% { + -moz-transform: perspective(400px) translateZ(150px) rotateY(190deg) + scale(1); + -moz-animation-timing-function: ease-in; + } + 80% { + -moz-transform: perspective(400px) rotateY(360deg) scale(0.95); + -moz-animation-timing-function: ease-in; + } + 100% { + -moz-transform: perspective(400px) scale(1); + -moz-animation-timing-function: ease-in; + } +} +@-o-keyframes flip { + 0% { + -o-transform: perspective(400px) rotateY(0); + -o-animation-timing-function: ease-out; + } + 40% { + -o-transform: perspective(400px) translateZ(150px) rotateY(170deg); + -o-animation-timing-function: ease-out; + } + 50% { + -o-transform: perspective(400px) translateZ(150px) rotateY(190deg) scale(1); + -o-animation-timing-function: ease-in; + } + 80% { + -o-transform: perspective(400px) rotateY(360deg) scale(0.95); + -o-animation-timing-function: ease-in; + } + 100% { + -o-transform: perspective(400px) scale(1); + -o-animation-timing-function: ease-in; + } +} +@keyframes flip { + 0% { + transform: perspective(400px) rotateY(0); + animation-timing-function: ease-out; + } + 40% { + transform: perspective(400px) translateZ(150px) rotateY(170deg); + animation-timing-function: ease-out; + } + 50% { + transform: perspective(400px) translateZ(150px) rotateY(190deg) scale(1); + animation-timing-function: ease-in; + } + 80% { + transform: perspective(400px) rotateY(360deg) scale(0.95); + animation-timing-function: ease-in; + } + 100% { + transform: perspective(400px) scale(1); + animation-timing-function: ease-in; + } +} + +.flip { + -webkit-backface-visibility: visible !important; + -webkit-animation-name: flip; + -moz-backface-visibility: visible !important; + -moz-animation-name: flip; + -o-backface-visibility: visible !important; + -o-animation-name: flip; + backface-visibility: visible !important; + animation-name: flip; +} +@-webkit-keyframes flipInX { + 0% { + -webkit-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } + + 40% { + -webkit-transform: perspective(400px) rotateX(-10deg); + } + + 70% { + -webkit-transform: perspective(400px) rotateX(10deg); + } + + 100% { + -webkit-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } +} +@-moz-keyframes flipInX { + 0% { + -moz-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } + + 40% { + -moz-transform: perspective(400px) rotateX(-10deg); + } + + 70% { + -moz-transform: perspective(400px) rotateX(10deg); + } + + 100% { + -moz-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } +} +@-o-keyframes flipInX { + 0% { + -o-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } + + 40% { + -o-transform: perspective(400px) rotateX(-10deg); + } + + 70% { + -o-transform: perspective(400px) rotateX(10deg); + } + + 100% { + -o-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } +} +@keyframes flipInX { + 0% { + transform: perspective(400px) rotateX(90deg); + opacity: 0; + } + + 40% { + transform: perspective(400px) rotateX(-10deg); + } + + 70% { + transform: perspective(400px) rotateX(10deg); + } + + 100% { + transform: perspective(400px) rotateX(0deg); + opacity: 1; + } +} + +.flipInX { + -webkit-backface-visibility: visible !important; + -webkit-animation-name: flipInX; + -moz-backface-visibility: visible !important; + -moz-animation-name: flipInX; + 
-o-backface-visibility: visible !important; + -o-animation-name: flipInX; + backface-visibility: visible !important; + animation-name: flipInX; +} +@-webkit-keyframes flipOutX { + 0% { + -webkit-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } + 100% { + -webkit-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } +} + +@-moz-keyframes flipOutX { + 0% { + -moz-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } + 100% { + -moz-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } +} + +@-o-keyframes flipOutX { + 0% { + -o-transform: perspective(400px) rotateX(0deg); + opacity: 1; + } + 100% { + -o-transform: perspective(400px) rotateX(90deg); + opacity: 0; + } +} + +@keyframes flipOutX { + 0% { + transform: perspective(400px) rotateX(0deg); + opacity: 1; + } + 100% { + transform: perspective(400px) rotateX(90deg); + opacity: 0; + } +} + +.flipOutX { + -webkit-animation-name: flipOutX; + -webkit-backface-visibility: visible !important; + -moz-animation-name: flipOutX; + -moz-backface-visibility: visible !important; + -o-animation-name: flipOutX; + -o-backface-visibility: visible !important; + animation-name: flipOutX; + backface-visibility: visible !important; +} +@-webkit-keyframes flipInY { + 0% { + -webkit-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } + + 40% { + -webkit-transform: perspective(400px) rotateY(-10deg); + } + + 70% { + -webkit-transform: perspective(400px) rotateY(10deg); + } + + 100% { + -webkit-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } +} +@-moz-keyframes flipInY { + 0% { + -moz-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } + + 40% { + -moz-transform: perspective(400px) rotateY(-10deg); + } + + 70% { + -moz-transform: perspective(400px) rotateY(10deg); + } + + 100% { + -moz-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } +} +@-o-keyframes flipInY { + 0% { + -o-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } + + 40% { + -o-transform: perspective(400px) rotateY(-10deg); + } + + 70% { + -o-transform: perspective(400px) rotateY(10deg); + } + + 100% { + -o-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } +} +@keyframes flipInY { + 0% { + transform: perspective(400px) rotateY(90deg); + opacity: 0; + } + + 40% { + transform: perspective(400px) rotateY(-10deg); + } + + 70% { + transform: perspective(400px) rotateY(10deg); + } + + 100% { + transform: perspective(400px) rotateY(0deg); + opacity: 1; + } +} + +.flipInY { + -webkit-backface-visibility: visible !important; + -webkit-animation-name: flipInY; + -moz-backface-visibility: visible !important; + -moz-animation-name: flipInY; + -o-backface-visibility: visible !important; + -o-animation-name: flipInY; + backface-visibility: visible !important; + animation-name: flipInY; +} +@-webkit-keyframes flipOutY { + 0% { + -webkit-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } + 100% { + -webkit-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } +} +@-moz-keyframes flipOutY { + 0% { + -moz-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } + 100% { + -moz-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } +} +@-o-keyframes flipOutY { + 0% { + -o-transform: perspective(400px) rotateY(0deg); + opacity: 1; + } + 100% { + -o-transform: perspective(400px) rotateY(90deg); + opacity: 0; + } +} +@keyframes flipOutY { + 0% { + transform: perspective(400px) rotateY(0deg); + opacity: 1; + } + 100% { + transform: perspective(400px) rotateY(90deg); + 
opacity: 0; + } +} + +.flipOutY { + -webkit-backface-visibility: visible !important; + -webkit-animation-name: flipOutY; + -moz-backface-visibility: visible !important; + -moz-animation-name: flipOutY; + -o-backface-visibility: visible !important; + -o-animation-name: flipOutY; + backface-visibility: visible !important; + animation-name: flipOutY; +} +@-webkit-keyframes fadeIn { + 0% { + opacity: 0; + } + 100% { + opacity: 1; + } +} + +@-moz-keyframes fadeIn { + 0% { + opacity: 0; + } + 100% { + opacity: 1; + } +} + +@-o-keyframes fadeIn { + 0% { + opacity: 0; + } + 100% { + opacity: 1; + } +} + +@keyframes fadeIn { + 0% { + opacity: 0; + } + 100% { + opacity: 1; + } +} + +.fadeIn { + -webkit-animation-name: fadeIn; + -moz-animation-name: fadeIn; + -o-animation-name: fadeIn; + animation-name: fadeIn; +} +@-webkit-keyframes fadeInUp { + 0% { + opacity: 0; + -webkit-transform: translateY(20px); + } + + 100% { + opacity: 1; + -webkit-transform: translateY(0); + } +} + +@-moz-keyframes fadeInUp { + 0% { + opacity: 0; + -moz-transform: translateY(20px); + } + + 100% { + opacity: 1; + -moz-transform: translateY(0); + } +} + +@-o-keyframes fadeInUp { + 0% { + opacity: 0; + -o-transform: translateY(20px); + } + + 100% { + opacity: 1; + -o-transform: translateY(0); + } +} + +@keyframes fadeInUp { + 0% { + opacity: 0; + transform: translateY(20px); + } + + 100% { + opacity: 1; + transform: translateY(0); + } +} + +.fadeInUp { + -webkit-animation-name: fadeInUp; + -moz-animation-name: fadeInUp; + -o-animation-name: fadeInUp; + animation-name: fadeInUp; +} +@-webkit-keyframes fadeInDown { + 0% { + opacity: 0; + -webkit-transform: translateY(-20px); + } + + 100% { + opacity: 1; + -webkit-transform: translateY(0); + } +} + +@-moz-keyframes fadeInDown { + 0% { + opacity: 0; + -moz-transform: translateY(-20px); + } + + 100% { + opacity: 1; + -moz-transform: translateY(0); + } +} + +@-o-keyframes fadeInDown { + 0% { + opacity: 0; + -o-transform: translateY(-20px); + } + + 100% { + opacity: 1; + -o-transform: translateY(0); + } +} + +@keyframes fadeInDown { + 0% { + opacity: 0; + transform: translateY(-20px); + } + + 100% { + opacity: 1; + transform: translateY(0); + } +} + +.fadeInDown { + -webkit-animation-name: fadeInDown; + -moz-animation-name: fadeInDown; + -o-animation-name: fadeInDown; + animation-name: fadeInDown; +} +@-webkit-keyframes fadeInLeft { + 0% { + opacity: 0; + -webkit-transform: translateX(-20px); + } + + 100% { + opacity: 1; + -webkit-transform: translateX(0); + } +} + +@-moz-keyframes fadeInLeft { + 0% { + opacity: 0; + -moz-transform: translateX(-20px); + } + + 100% { + opacity: 1; + -moz-transform: translateX(0); + } +} + +@-o-keyframes fadeInLeft { + 0% { + opacity: 0; + -o-transform: translateX(-20px); + } + + 100% { + opacity: 1; + -o-transform: translateX(0); + } +} + +@keyframes fadeInLeft { + 0% { + opacity: 0; + transform: translateX(-20px); + } + + 100% { + opacity: 1; + transform: translateX(0); + } +} + +.fadeInLeft { + -webkit-animation-name: fadeInLeft; + -moz-animation-name: fadeInLeft; + -o-animation-name: fadeInLeft; + animation-name: fadeInLeft; +} +@-webkit-keyframes fadeInRight { + 0% { + opacity: 0; + -webkit-transform: translateX(20px); + } + + 100% { + opacity: 1; + -webkit-transform: translateX(0); + } +} + +@-moz-keyframes fadeInRight { + 0% { + opacity: 0; + -moz-transform: translateX(20px); + } + + 100% { + opacity: 1; + -moz-transform: translateX(0); + } +} + +@-o-keyframes fadeInRight { + 0% { + opacity: 0; + -o-transform: translateX(20px); + } + + 100% { + 
opacity: 1; + -o-transform: translateX(0); + } +} + +@keyframes fadeInRight { + 0% { + opacity: 0; + transform: translateX(20px); + } + + 100% { + opacity: 1; + transform: translateX(0); + } +} + +.fadeInRight { + -webkit-animation-name: fadeInRight; + -moz-animation-name: fadeInRight; + -o-animation-name: fadeInRight; + animation-name: fadeInRight; +} +@-webkit-keyframes fadeInUpBig { + 0% { + opacity: 0; + -webkit-transform: translateY(2000px); + } + + 100% { + opacity: 1; + -webkit-transform: translateY(0); + } +} + +@-moz-keyframes fadeInUpBig { + 0% { + opacity: 0; + -moz-transform: translateY(2000px); + } + + 100% { + opacity: 1; + -moz-transform: translateY(0); + } +} + +@-o-keyframes fadeInUpBig { + 0% { + opacity: 0; + -o-transform: translateY(2000px); + } + + 100% { + opacity: 1; + -o-transform: translateY(0); + } +} + +@keyframes fadeInUpBig { + 0% { + opacity: 0; + transform: translateY(2000px); + } + + 100% { + opacity: 1; + transform: translateY(0); + } +} + +.fadeInUpBig { + -webkit-animation-name: fadeInUpBig; + -moz-animation-name: fadeInUpBig; + -o-animation-name: fadeInUpBig; + animation-name: fadeInUpBig; +} +@-webkit-keyframes fadeInDownBig { + 0% { + opacity: 0; + -webkit-transform: translateY(-2000px); + } + + 100% { + opacity: 1; + -webkit-transform: translateY(0); + } +} + +@-moz-keyframes fadeInDownBig { + 0% { + opacity: 0; + -moz-transform: translateY(-2000px); + } + + 100% { + opacity: 1; + -moz-transform: translateY(0); + } +} + +@-o-keyframes fadeInDownBig { + 0% { + opacity: 0; + -o-transform: translateY(-2000px); + } + + 100% { + opacity: 1; + -o-transform: translateY(0); + } +} + +@keyframes fadeInDownBig { + 0% { + opacity: 0; + transform: translateY(-2000px); + } + + 100% { + opacity: 1; + transform: translateY(0); + } +} + +.fadeInDownBig { + -webkit-animation-name: fadeInDownBig; + -moz-animation-name: fadeInDownBig; + -o-animation-name: fadeInDownBig; + animation-name: fadeInDownBig; +} +@-webkit-keyframes fadeInLeftBig { + 0% { + opacity: 0; + -webkit-transform: translateX(-2000px); + } + + 100% { + opacity: 1; + -webkit-transform: translateX(0); + } +} +@-moz-keyframes fadeInLeftBig { + 0% { + opacity: 0; + -moz-transform: translateX(-2000px); + } + + 100% { + opacity: 1; + -moz-transform: translateX(0); + } +} +@-o-keyframes fadeInLeftBig { + 0% { + opacity: 0; + -o-transform: translateX(-2000px); + } + + 100% { + opacity: 1; + -o-transform: translateX(0); + } +} +@keyframes fadeInLeftBig { + 0% { + opacity: 0; + transform: translateX(-2000px); + } + + 100% { + opacity: 1; + transform: translateX(0); + } +} + +.fadeInLeftBig { + -webkit-animation-name: fadeInLeftBig; + -moz-animation-name: fadeInLeftBig; + -o-animation-name: fadeInLeftBig; + animation-name: fadeInLeftBig; +} +@-webkit-keyframes fadeInRightBig { + 0% { + opacity: 0; + -webkit-transform: translateX(2000px); + } + + 100% { + opacity: 1; + -webkit-transform: translateX(0); + } +} + +@-moz-keyframes fadeInRightBig { + 0% { + opacity: 0; + -moz-transform: translateX(2000px); + } + + 100% { + opacity: 1; + -moz-transform: translateX(0); + } +} + +@-o-keyframes fadeInRightBig { + 0% { + opacity: 0; + -o-transform: translateX(2000px); + } + + 100% { + opacity: 1; + -o-transform: translateX(0); + } +} + +@keyframes fadeInRightBig { + 0% { + opacity: 0; + transform: translateX(2000px); + } + + 100% { + opacity: 1; + transform: translateX(0); + } +} + +.fadeInRightBig { + -webkit-animation-name: fadeInRightBig; + -moz-animation-name: fadeInRightBig; + -o-animation-name: fadeInRightBig; + 
animation-name: fadeInRightBig; +} +@-webkit-keyframes fadeOut { + 0% { + opacity: 1; + } + 100% { + opacity: 0; + } +} + +@-moz-keyframes fadeOut { + 0% { + opacity: 1; + } + 100% { + opacity: 0; + } +} + +@-o-keyframes fadeOut { + 0% { + opacity: 1; + } + 100% { + opacity: 0; + } +} + +@keyframes fadeOut { + 0% { + opacity: 1; + } + 100% { + opacity: 0; + } +} + +.fadeOut { + -webkit-animation-name: fadeOut; + -moz-animation-name: fadeOut; + -o-animation-name: fadeOut; + animation-name: fadeOut; +} +@-webkit-keyframes fadeOutUp { + 0% { + opacity: 1; + -webkit-transform: translateY(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(-20px); + } +} +@-moz-keyframes fadeOutUp { + 0% { + opacity: 1; + -moz-transform: translateY(0); + } + + 100% { + opacity: 0; + -moz-transform: translateY(-20px); + } +} +@-o-keyframes fadeOutUp { + 0% { + opacity: 1; + -o-transform: translateY(0); + } + + 100% { + opacity: 0; + -o-transform: translateY(-20px); + } +} +@keyframes fadeOutUp { + 0% { + opacity: 1; + transform: translateY(0); + } + + 100% { + opacity: 0; + transform: translateY(-20px); + } +} + +.fadeOutUp { + -webkit-animation-name: fadeOutUp; + -moz-animation-name: fadeOutUp; + -o-animation-name: fadeOutUp; + animation-name: fadeOutUp; +} +@-webkit-keyframes fadeOutDown { + 0% { + opacity: 1; + -webkit-transform: translateY(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(20px); + } +} + +@-moz-keyframes fadeOutDown { + 0% { + opacity: 1; + -moz-transform: translateY(0); + } + + 100% { + opacity: 0; + -moz-transform: translateY(20px); + } +} + +@-o-keyframes fadeOutDown { + 0% { + opacity: 1; + -o-transform: translateY(0); + } + + 100% { + opacity: 0; + -o-transform: translateY(20px); + } +} + +@keyframes fadeOutDown { + 0% { + opacity: 1; + transform: translateY(0); + } + + 100% { + opacity: 0; + transform: translateY(20px); + } +} + +.fadeOutDown { + -webkit-animation-name: fadeOutDown; + -moz-animation-name: fadeOutDown; + -o-animation-name: fadeOutDown; + animation-name: fadeOutDown; +} +@-webkit-keyframes fadeOutLeft { + 0% { + opacity: 1; + -webkit-transform: translateX(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(-20px); + } +} + +@-moz-keyframes fadeOutLeft { + 0% { + opacity: 1; + -moz-transform: translateX(0); + } + + 100% { + opacity: 0; + -moz-transform: translateX(-20px); + } +} + +@-o-keyframes fadeOutLeft { + 0% { + opacity: 1; + -o-transform: translateX(0); + } + + 100% { + opacity: 0; + -o-transform: translateX(-20px); + } +} + +@keyframes fadeOutLeft { + 0% { + opacity: 1; + transform: translateX(0); + } + + 100% { + opacity: 0; + transform: translateX(-20px); + } +} + +.fadeOutLeft { + -webkit-animation-name: fadeOutLeft; + -moz-animation-name: fadeOutLeft; + -o-animation-name: fadeOutLeft; + animation-name: fadeOutLeft; +} +@-webkit-keyframes fadeOutRight { + 0% { + opacity: 1; + -webkit-transform: translateX(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(20px); + } +} + +@-moz-keyframes fadeOutRight { + 0% { + opacity: 1; + -moz-transform: translateX(0); + } + + 100% { + opacity: 0; + -moz-transform: translateX(20px); + } +} + +@-o-keyframes fadeOutRight { + 0% { + opacity: 1; + -o-transform: translateX(0); + } + + 100% { + opacity: 0; + -o-transform: translateX(20px); + } +} + +@keyframes fadeOutRight { + 0% { + opacity: 1; + transform: translateX(0); + } + + 100% { + opacity: 0; + transform: translateX(20px); + } +} + +.fadeOutRight { + -webkit-animation-name: fadeOutRight; + -moz-animation-name: 
fadeOutRight; + -o-animation-name: fadeOutRight; + animation-name: fadeOutRight; +} +@-webkit-keyframes fadeOutUpBig { + 0% { + opacity: 1; + -webkit-transform: translateY(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(-2000px); + } +} + +@-moz-keyframes fadeOutUpBig { + 0% { + opacity: 1; + -moz-transform: translateY(0); + } + + 100% { + opacity: 0; + -moz-transform: translateY(-2000px); + } +} + +@-o-keyframes fadeOutUpBig { + 0% { + opacity: 1; + -o-transform: translateY(0); + } + + 100% { + opacity: 0; + -o-transform: translateY(-2000px); + } +} + +@keyframes fadeOutUpBig { + 0% { + opacity: 1; + transform: translateY(0); + } + + 100% { + opacity: 0; + transform: translateY(-2000px); + } +} + +.fadeOutUpBig { + -webkit-animation-name: fadeOutUpBig; + -moz-animation-name: fadeOutUpBig; + -o-animation-name: fadeOutUpBig; + animation-name: fadeOutUpBig; +} +@-webkit-keyframes fadeOutDownBig { + 0% { + opacity: 1; + -webkit-transform: translateY(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(2000px); + } +} + +@-moz-keyframes fadeOutDownBig { + 0% { + opacity: 1; + -moz-transform: translateY(0); + } + + 100% { + opacity: 0; + -moz-transform: translateY(2000px); + } +} + +@-o-keyframes fadeOutDownBig { + 0% { + opacity: 1; + -o-transform: translateY(0); + } + + 100% { + opacity: 0; + -o-transform: translateY(2000px); + } +} + +@keyframes fadeOutDownBig { + 0% { + opacity: 1; + transform: translateY(0); + } + + 100% { + opacity: 0; + transform: translateY(2000px); + } +} + +.fadeOutDownBig { + -webkit-animation-name: fadeOutDownBig; + -moz-animation-name: fadeOutDownBig; + -o-animation-name: fadeOutDownBig; + animation-name: fadeOutDownBig; +} +@-webkit-keyframes fadeOutLeftBig { + 0% { + opacity: 1; + -webkit-transform: translateX(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(-2000px); + } +} + +@-moz-keyframes fadeOutLeftBig { + 0% { + opacity: 1; + -moz-transform: translateX(0); + } + + 100% { + opacity: 0; + -moz-transform: translateX(-2000px); + } +} + +@-o-keyframes fadeOutLeftBig { + 0% { + opacity: 1; + -o-transform: translateX(0); + } + + 100% { + opacity: 0; + -o-transform: translateX(-2000px); + } +} + +@keyframes fadeOutLeftBig { + 0% { + opacity: 1; + transform: translateX(0); + } + + 100% { + opacity: 0; + transform: translateX(-2000px); + } +} + +.fadeOutLeftBig { + -webkit-animation-name: fadeOutLeftBig; + -moz-animation-name: fadeOutLeftBig; + -o-animation-name: fadeOutLeftBig; + animation-name: fadeOutLeftBig; +} +@-webkit-keyframes fadeOutRightBig { + 0% { + opacity: 1; + -webkit-transform: translateX(0); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(2000px); + } +} +@-moz-keyframes fadeOutRightBig { + 0% { + opacity: 1; + -moz-transform: translateX(0); + } + + 100% { + opacity: 0; + -moz-transform: translateX(2000px); + } +} +@-o-keyframes fadeOutRightBig { + 0% { + opacity: 1; + -o-transform: translateX(0); + } + + 100% { + opacity: 0; + -o-transform: translateX(2000px); + } +} +@keyframes fadeOutRightBig { + 0% { + opacity: 1; + transform: translateX(0); + } + + 100% { + opacity: 0; + transform: translateX(2000px); + } +} + +.fadeOutRightBig { + -webkit-animation-name: fadeOutRightBig; + -moz-animation-name: fadeOutRightBig; + -o-animation-name: fadeOutRightBig; + animation-name: fadeOutRightBig; +} +@-webkit-keyframes bounceIn { + 0% { + opacity: 0; + -webkit-transform: scale(0.3); + } + + 50% { + opacity: 1; + -webkit-transform: scale(1.05); + } + + 70% { + -webkit-transform: scale(0.9); + } + + 
100% { + -webkit-transform: scale(1); + } +} + +@-moz-keyframes bounceIn { + 0% { + opacity: 0; + -moz-transform: scale(0.3); + } + + 50% { + opacity: 1; + -moz-transform: scale(1.05); + } + + 70% { + -moz-transform: scale(0.9); + } + + 100% { + -moz-transform: scale(1); + } +} + +@-o-keyframes bounceIn { + 0% { + opacity: 0; + -o-transform: scale(0.3); + } + + 50% { + opacity: 1; + -o-transform: scale(1.05); + } + + 70% { + -o-transform: scale(0.9); + } + + 100% { + -o-transform: scale(1); + } +} + +@keyframes bounceIn { + 0% { + opacity: 0; + transform: scale(0.3); + } + + 50% { + opacity: 1; + transform: scale(1.05); + } + + 70% { + transform: scale(0.9); + } + + 100% { + transform: scale(1); + } +} + +.bounceIn { + -webkit-animation-name: bounceIn; + -moz-animation-name: bounceIn; + -o-animation-name: bounceIn; + animation-name: bounceIn; +} +@-webkit-keyframes bounceInUp { + 0% { + opacity: 0; + -webkit-transform: translateY(2000px); + } + + 60% { + opacity: 1; + -webkit-transform: translateY(-30px); + } + + 80% { + -webkit-transform: translateY(10px); + } + + 100% { + -webkit-transform: translateY(0); + } +} +@-moz-keyframes bounceInUp { + 0% { + opacity: 0; + -moz-transform: translateY(2000px); + } + + 60% { + opacity: 1; + -moz-transform: translateY(-30px); + } + + 80% { + -moz-transform: translateY(10px); + } + + 100% { + -moz-transform: translateY(0); + } +} + +@-o-keyframes bounceInUp { + 0% { + opacity: 0; + -o-transform: translateY(2000px); + } + + 60% { + opacity: 1; + -o-transform: translateY(-30px); + } + + 80% { + -o-transform: translateY(10px); + } + + 100% { + -o-transform: translateY(0); + } +} + +@keyframes bounceInUp { + 0% { + opacity: 0; + transform: translateY(2000px); + } + + 60% { + opacity: 1; + transform: translateY(-30px); + } + + 80% { + transform: translateY(10px); + } + + 100% { + transform: translateY(0); + } +} + +.bounceInUp { + -webkit-animation-name: bounceInUp; + -moz-animation-name: bounceInUp; + -o-animation-name: bounceInUp; + animation-name: bounceInUp; +} +@-webkit-keyframes bounceInDown { + 0% { + opacity: 0; + -webkit-transform: translateY(-2000px); + } + + 60% { + opacity: 1; + -webkit-transform: translateY(30px); + } + + 80% { + -webkit-transform: translateY(-10px); + } + + 100% { + -webkit-transform: translateY(0); + } +} + +@-moz-keyframes bounceInDown { + 0% { + opacity: 0; + -moz-transform: translateY(-2000px); + } + + 60% { + opacity: 1; + -moz-transform: translateY(30px); + } + + 80% { + -moz-transform: translateY(-10px); + } + + 100% { + -moz-transform: translateY(0); + } +} + +@-o-keyframes bounceInDown { + 0% { + opacity: 0; + -o-transform: translateY(-2000px); + } + + 60% { + opacity: 1; + -o-transform: translateY(30px); + } + + 80% { + -o-transform: translateY(-10px); + } + + 100% { + -o-transform: translateY(0); + } +} + +@keyframes bounceInDown { + 0% { + opacity: 0; + transform: translateY(-2000px); + } + + 60% { + opacity: 1; + transform: translateY(30px); + } + + 80% { + transform: translateY(-10px); + } + + 100% { + transform: translateY(0); + } +} + +.bounceInDown { + -webkit-animation-name: bounceInDown; + -moz-animation-name: bounceInDown; + -o-animation-name: bounceInDown; + animation-name: bounceInDown; +} +@-webkit-keyframes bounceInLeft { + 0% { + opacity: 0; + -webkit-transform: translateX(-2000px); + } + + 60% { + opacity: 1; + -webkit-transform: translateX(30px); + } + + 80% { + -webkit-transform: translateX(-10px); + } + + 100% { + -webkit-transform: translateX(0); + } +} + +@-moz-keyframes bounceInLeft { + 0% { + 
opacity: 0; + -moz-transform: translateX(-2000px); + } + + 60% { + opacity: 1; + -moz-transform: translateX(30px); + } + + 80% { + -moz-transform: translateX(-10px); + } + + 100% { + -moz-transform: translateX(0); + } +} + +@-o-keyframes bounceInLeft { + 0% { + opacity: 0; + -o-transform: translateX(-2000px); + } + + 60% { + opacity: 1; + -o-transform: translateX(30px); + } + + 80% { + -o-transform: translateX(-10px); + } + + 100% { + -o-transform: translateX(0); + } +} + +@keyframes bounceInLeft { + 0% { + opacity: 0; + transform: translateX(-2000px); + } + + 60% { + opacity: 1; + transform: translateX(30px); + } + + 80% { + transform: translateX(-10px); + } + + 100% { + transform: translateX(0); + } +} + +.bounceInLeft { + -webkit-animation-name: bounceInLeft; + -moz-animation-name: bounceInLeft; + -o-animation-name: bounceInLeft; + animation-name: bounceInLeft; +} +@-webkit-keyframes bounceInRight { + 0% { + opacity: 0; + -webkit-transform: translateX(2000px); + } + + 60% { + opacity: 1; + -webkit-transform: translateX(-30px); + } + + 80% { + -webkit-transform: translateX(10px); + } + + 100% { + -webkit-transform: translateX(0); + } +} + +@-moz-keyframes bounceInRight { + 0% { + opacity: 0; + -moz-transform: translateX(2000px); + } + + 60% { + opacity: 1; + -moz-transform: translateX(-30px); + } + + 80% { + -moz-transform: translateX(10px); + } + + 100% { + -moz-transform: translateX(0); + } +} + +@-o-keyframes bounceInRight { + 0% { + opacity: 0; + -o-transform: translateX(2000px); + } + + 60% { + opacity: 1; + -o-transform: translateX(-30px); + } + + 80% { + -o-transform: translateX(10px); + } + + 100% { + -o-transform: translateX(0); + } +} + +@keyframes bounceInRight { + 0% { + opacity: 0; + transform: translateX(2000px); + } + + 60% { + opacity: 1; + transform: translateX(-30px); + } + + 80% { + transform: translateX(10px); + } + + 100% { + transform: translateX(0); + } +} + +.bounceInRight { + -webkit-animation-name: bounceInRight; + -moz-animation-name: bounceInRight; + -o-animation-name: bounceInRight; + animation-name: bounceInRight; +} +@-webkit-keyframes bounceOut { + 0% { + -webkit-transform: scale(1); + } + + 25% { + -webkit-transform: scale(0.95); + } + + 50% { + opacity: 1; + -webkit-transform: scale(1.1); + } + + 100% { + opacity: 0; + -webkit-transform: scale(0.3); + } +} + +@-moz-keyframes bounceOut { + 0% { + -moz-transform: scale(1); + } + + 25% { + -moz-transform: scale(0.95); + } + + 50% { + opacity: 1; + -moz-transform: scale(1.1); + } + + 100% { + opacity: 0; + -moz-transform: scale(0.3); + } +} + +@-o-keyframes bounceOut { + 0% { + -o-transform: scale(1); + } + + 25% { + -o-transform: scale(0.95); + } + + 50% { + opacity: 1; + -o-transform: scale(1.1); + } + + 100% { + opacity: 0; + -o-transform: scale(0.3); + } +} + +@keyframes bounceOut { + 0% { + transform: scale(1); + } + + 25% { + transform: scale(0.95); + } + + 50% { + opacity: 1; + transform: scale(1.1); + } + + 100% { + opacity: 0; + transform: scale(0.3); + } +} + +.bounceOut { + -webkit-animation-name: bounceOut; + -moz-animation-name: bounceOut; + -o-animation-name: bounceOut; + animation-name: bounceOut; +} +@-webkit-keyframes bounceOutUp { + 0% { + -webkit-transform: translateY(0); + } + + 20% { + opacity: 1; + -webkit-transform: translateY(20px); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(-2000px); + } +} + +@-moz-keyframes bounceOutUp { + 0% { + -moz-transform: translateY(0); + } + + 20% { + opacity: 1; + -moz-transform: translateY(20px); + } + + 100% { + opacity: 0; + 
-moz-transform: translateY(-2000px); + } +} + +@-o-keyframes bounceOutUp { + 0% { + -o-transform: translateY(0); + } + + 20% { + opacity: 1; + -o-transform: translateY(20px); + } + + 100% { + opacity: 0; + -o-transform: translateY(-2000px); + } +} + +@keyframes bounceOutUp { + 0% { + transform: translateY(0); + } + + 20% { + opacity: 1; + transform: translateY(20px); + } + + 100% { + opacity: 0; + transform: translateY(-2000px); + } +} + +.bounceOutUp { + -webkit-animation-name: bounceOutUp; + -moz-animation-name: bounceOutUp; + -o-animation-name: bounceOutUp; + animation-name: bounceOutUp; +} +@-webkit-keyframes bounceOutDown { + 0% { + -webkit-transform: translateY(0); + } + + 20% { + opacity: 1; + -webkit-transform: translateY(-20px); + } + + 100% { + opacity: 0; + -webkit-transform: translateY(2000px); + } +} + +@-moz-keyframes bounceOutDown { + 0% { + -moz-transform: translateY(0); + } + + 20% { + opacity: 1; + -moz-transform: translateY(-20px); + } + + 100% { + opacity: 0; + -moz-transform: translateY(2000px); + } +} + +@-o-keyframes bounceOutDown { + 0% { + -o-transform: translateY(0); + } + + 20% { + opacity: 1; + -o-transform: translateY(-20px); + } + + 100% { + opacity: 0; + -o-transform: translateY(2000px); + } +} + +@keyframes bounceOutDown { + 0% { + transform: translateY(0); + } + + 20% { + opacity: 1; + transform: translateY(-20px); + } + + 100% { + opacity: 0; + transform: translateY(2000px); + } +} + +.bounceOutDown { + -webkit-animation-name: bounceOutDown; + -moz-animation-name: bounceOutDown; + -o-animation-name: bounceOutDown; + animation-name: bounceOutDown; +} +@-webkit-keyframes bounceOutLeft { + 0% { + -webkit-transform: translateX(0); + } + + 20% { + opacity: 1; + -webkit-transform: translateX(20px); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(-2000px); + } +} + +@-moz-keyframes bounceOutLeft { + 0% { + -moz-transform: translateX(0); + } + + 20% { + opacity: 1; + -moz-transform: translateX(20px); + } + + 100% { + opacity: 0; + -moz-transform: translateX(-2000px); + } +} + +@-o-keyframes bounceOutLeft { + 0% { + -o-transform: translateX(0); + } + + 20% { + opacity: 1; + -o-transform: translateX(20px); + } + + 100% { + opacity: 0; + -o-transform: translateX(-2000px); + } +} + +@keyframes bounceOutLeft { + 0% { + transform: translateX(0); + } + + 20% { + opacity: 1; + transform: translateX(20px); + } + + 100% { + opacity: 0; + transform: translateX(-2000px); + } +} + +.bounceOutLeft { + -webkit-animation-name: bounceOutLeft; + -moz-animation-name: bounceOutLeft; + -o-animation-name: bounceOutLeft; + animation-name: bounceOutLeft; +} +@-webkit-keyframes bounceOutRight { + 0% { + -webkit-transform: translateX(0); + } + + 20% { + opacity: 1; + -webkit-transform: translateX(-20px); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(2000px); + } +} + +@-moz-keyframes bounceOutRight { + 0% { + -moz-transform: translateX(0); + } + + 20% { + opacity: 1; + -moz-transform: translateX(-20px); + } + + 100% { + opacity: 0; + -moz-transform: translateX(2000px); + } +} + +@-o-keyframes bounceOutRight { + 0% { + -o-transform: translateX(0); + } + + 20% { + opacity: 1; + -o-transform: translateX(-20px); + } + + 100% { + opacity: 0; + -o-transform: translateX(2000px); + } +} + +@keyframes bounceOutRight { + 0% { + transform: translateX(0); + } + + 20% { + opacity: 1; + transform: translateX(-20px); + } + + 100% { + opacity: 0; + transform: translateX(2000px); + } +} + +.bounceOutRight { + -webkit-animation-name: bounceOutRight; + -moz-animation-name: 
bounceOutRight; + -o-animation-name: bounceOutRight; + animation-name: bounceOutRight; +} +@-webkit-keyframes rotateIn { + 0% { + -webkit-transform-origin: center center; + -webkit-transform: rotate(-200deg); + opacity: 0; + } + + 100% { + -webkit-transform-origin: center center; + -webkit-transform: rotate(0); + opacity: 1; + } +} +@-moz-keyframes rotateIn { + 0% { + -moz-transform-origin: center center; + -moz-transform: rotate(-200deg); + opacity: 0; + } + + 100% { + -moz-transform-origin: center center; + -moz-transform: rotate(0); + opacity: 1; + } +} +@-o-keyframes rotateIn { + 0% { + -o-transform-origin: center center; + -o-transform: rotate(-200deg); + opacity: 0; + } + + 100% { + -o-transform-origin: center center; + -o-transform: rotate(0); + opacity: 1; + } +} +@keyframes rotateIn { + 0% { + transform-origin: center center; + transform: rotate(-200deg); + opacity: 0; + } + + 100% { + transform-origin: center center; + transform: rotate(0); + opacity: 1; + } +} + +.rotateIn { + -webkit-animation-name: rotateIn; + -moz-animation-name: rotateIn; + -o-animation-name: rotateIn; + animation-name: rotateIn; +} +@-webkit-keyframes rotateInUpLeft { + 0% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(0); + opacity: 1; + } +} + +@-moz-keyframes rotateInUpLeft { + 0% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(0); + opacity: 1; + } +} + +@-o-keyframes rotateInUpLeft { + 0% { + -o-transform-origin: left bottom; + -o-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -o-transform-origin: left bottom; + -o-transform: rotate(0); + opacity: 1; + } +} + +@keyframes rotateInUpLeft { + 0% { + transform-origin: left bottom; + transform: rotate(90deg); + opacity: 0; + } + + 100% { + transform-origin: left bottom; + transform: rotate(0); + opacity: 1; + } +} + +.rotateInUpLeft { + -webkit-animation-name: rotateInUpLeft; + -moz-animation-name: rotateInUpLeft; + -o-animation-name: rotateInUpLeft; + animation-name: rotateInUpLeft; +} +@-webkit-keyframes rotateInDownLeft { + 0% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(0); + opacity: 1; + } +} + +@-moz-keyframes rotateInDownLeft { + 0% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(0); + opacity: 1; + } +} + +@-o-keyframes rotateInDownLeft { + 0% { + -o-transform-origin: left bottom; + -o-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -o-transform-origin: left bottom; + -o-transform: rotate(0); + opacity: 1; + } +} + +@keyframes rotateInDownLeft { + 0% { + transform-origin: left bottom; + transform: rotate(-90deg); + opacity: 0; + } + + 100% { + transform-origin: left bottom; + transform: rotate(0); + opacity: 1; + } +} + +.rotateInDownLeft { + -webkit-animation-name: rotateInDownLeft; + -moz-animation-name: rotateInDownLeft; + -o-animation-name: rotateInDownLeft; + animation-name: rotateInDownLeft; +} +@-webkit-keyframes rotateInUpRight { + 0% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(0); + opacity: 1; + } 
+} + +@-moz-keyframes rotateInUpRight { + 0% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(0); + opacity: 1; + } +} + +@-o-keyframes rotateInUpRight { + 0% { + -o-transform-origin: right bottom; + -o-transform: rotate(-90deg); + opacity: 0; + } + + 100% { + -o-transform-origin: right bottom; + -o-transform: rotate(0); + opacity: 1; + } +} + +@keyframes rotateInUpRight { + 0% { + transform-origin: right bottom; + transform: rotate(-90deg); + opacity: 0; + } + + 100% { + transform-origin: right bottom; + transform: rotate(0); + opacity: 1; + } +} + +.rotateInUpRight { + -webkit-animation-name: rotateInUpRight; + -moz-animation-name: rotateInUpRight; + -o-animation-name: rotateInUpRight; + animation-name: rotateInUpRight; +} +@-webkit-keyframes rotateInDownRight { + 0% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(0); + opacity: 1; + } +} + +@-moz-keyframes rotateInDownRight { + 0% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(0); + opacity: 1; + } +} + +@-o-keyframes rotateInDownRight { + 0% { + -o-transform-origin: right bottom; + -o-transform: rotate(90deg); + opacity: 0; + } + + 100% { + -o-transform-origin: right bottom; + -o-transform: rotate(0); + opacity: 1; + } +} + +@keyframes rotateInDownRight { + 0% { + transform-origin: right bottom; + transform: rotate(90deg); + opacity: 0; + } + + 100% { + transform-origin: right bottom; + transform: rotate(0); + opacity: 1; + } +} + +.rotateInDownRight { + -webkit-animation-name: rotateInDownRight; + -moz-animation-name: rotateInDownRight; + -o-animation-name: rotateInDownRight; + animation-name: rotateInDownRight; +} +@-webkit-keyframes rotateOut { + 0% { + -webkit-transform-origin: center center; + -webkit-transform: rotate(0); + opacity: 1; + } + + 100% { + -webkit-transform-origin: center center; + -webkit-transform: rotate(200deg); + opacity: 0; + } +} + +@-moz-keyframes rotateOut { + 0% { + -moz-transform-origin: center center; + -moz-transform: rotate(0); + opacity: 1; + } + + 100% { + -moz-transform-origin: center center; + -moz-transform: rotate(200deg); + opacity: 0; + } +} + +@-o-keyframes rotateOut { + 0% { + -o-transform-origin: center center; + -o-transform: rotate(0); + opacity: 1; + } + + 100% { + -o-transform-origin: center center; + -o-transform: rotate(200deg); + opacity: 0; + } +} + +@keyframes rotateOut { + 0% { + transform-origin: center center; + transform: rotate(0); + opacity: 1; + } + + 100% { + transform-origin: center center; + transform: rotate(200deg); + opacity: 0; + } +} + +.rotateOut { + -webkit-animation-name: rotateOut; + -moz-animation-name: rotateOut; + -o-animation-name: rotateOut; + animation-name: rotateOut; +} +@-webkit-keyframes rotateOutUpLeft { + 0% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(0); + opacity: 1; + } + + 100% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(-90deg); + opacity: 0; + } +} + +@-moz-keyframes rotateOutUpLeft { + 0% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(0); + opacity: 1; + } + + 100% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(-90deg); + opacity: 0; + } +} + +@-o-keyframes rotateOutUpLeft { + 0% { + 
-o-transform-origin: left bottom; + -o-transform: rotate(0); + opacity: 1; + } + + 100% { + -o-transform-origin: left bottom; + -o-transform: rotate(-90deg); + opacity: 0; + } +} + +@keyframes rotateOutUpLeft { + 0% { + transform-origin: left bottom; + transform: rotate(0); + opacity: 1; + } + + 100% { + transform-origin: left bottom; + transform: rotate(-90deg); + opacity: 0; + } +} + +.rotateOutUpLeft { + -webkit-animation-name: rotateOutUpLeft; + -moz-animation-name: rotateOutUpLeft; + -o-animation-name: rotateOutUpLeft; + animation-name: rotateOutUpLeft; +} +@-webkit-keyframes rotateOutDownLeft { + 0% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(0); + opacity: 1; + } + + 100% { + -webkit-transform-origin: left bottom; + -webkit-transform: rotate(90deg); + opacity: 0; + } +} + +@-moz-keyframes rotateOutDownLeft { + 0% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(0); + opacity: 1; + } + + 100% { + -moz-transform-origin: left bottom; + -moz-transform: rotate(90deg); + opacity: 0; + } +} + +@-o-keyframes rotateOutDownLeft { + 0% { + -o-transform-origin: left bottom; + -o-transform: rotate(0); + opacity: 1; + } + + 100% { + -o-transform-origin: left bottom; + -o-transform: rotate(90deg); + opacity: 0; + } +} + +@keyframes rotateOutDownLeft { + 0% { + transform-origin: left bottom; + transform: rotate(0); + opacity: 1; + } + + 100% { + transform-origin: left bottom; + transform: rotate(90deg); + opacity: 0; + } +} + +.rotateOutDownLeft { + -webkit-animation-name: rotateOutDownLeft; + -moz-animation-name: rotateOutDownLeft; + -o-animation-name: rotateOutDownLeft; + animation-name: rotateOutDownLeft; +} +@-webkit-keyframes rotateOutUpRight { + 0% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(0); + opacity: 1; + } + + 100% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(90deg); + opacity: 0; + } +} + +@-moz-keyframes rotateOutUpRight { + 0% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(0); + opacity: 1; + } + + 100% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(90deg); + opacity: 0; + } +} + +@-o-keyframes rotateOutUpRight { + 0% { + -o-transform-origin: right bottom; + -o-transform: rotate(0); + opacity: 1; + } + + 100% { + -o-transform-origin: right bottom; + -o-transform: rotate(90deg); + opacity: 0; + } +} + +@keyframes rotateOutUpRight { + 0% { + transform-origin: right bottom; + transform: rotate(0); + opacity: 1; + } + + 100% { + transform-origin: right bottom; + transform: rotate(90deg); + opacity: 0; + } +} + +.rotateOutUpRight { + -webkit-animation-name: rotateOutUpRight; + -moz-animation-name: rotateOutUpRight; + -o-animation-name: rotateOutUpRight; + animation-name: rotateOutUpRight; +} +@-webkit-keyframes rotateOutDownRight { + 0% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(0); + opacity: 1; + } + + 100% { + -webkit-transform-origin: right bottom; + -webkit-transform: rotate(-90deg); + opacity: 0; + } +} + +@-moz-keyframes rotateOutDownRight { + 0% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(0); + opacity: 1; + } + + 100% { + -moz-transform-origin: right bottom; + -moz-transform: rotate(-90deg); + opacity: 0; + } +} + +@-o-keyframes rotateOutDownRight { + 0% { + -o-transform-origin: right bottom; + -o-transform: rotate(0); + opacity: 1; + } + + 100% { + -o-transform-origin: right bottom; + -o-transform: rotate(-90deg); + opacity: 0; + } +} + +@keyframes rotateOutDownRight { + 0% { + 
transform-origin: right bottom; + transform: rotate(0); + opacity: 1; + } + + 100% { + transform-origin: right bottom; + transform: rotate(-90deg); + opacity: 0; + } +} + +.rotateOutDownRight { + -webkit-animation-name: rotateOutDownRight; + -moz-animation-name: rotateOutDownRight; + -o-animation-name: rotateOutDownRight; + animation-name: rotateOutDownRight; +} +@-webkit-keyframes hinge { + 0% { + -webkit-transform: rotate(0); + -webkit-transform-origin: top left; + -webkit-animation-timing-function: ease-in-out; + } + 20%, + 60% { + -webkit-transform: rotate(80deg); + -webkit-transform-origin: top left; + -webkit-animation-timing-function: ease-in-out; + } + 40% { + -webkit-transform: rotate(60deg); + -webkit-transform-origin: top left; + -webkit-animation-timing-function: ease-in-out; + } + 80% { + -webkit-transform: rotate(60deg) translateY(0); + opacity: 1; + -webkit-transform-origin: top left; + -webkit-animation-timing-function: ease-in-out; + } + 100% { + -webkit-transform: translateY(700px); + opacity: 0; + } +} + +@-moz-keyframes hinge { + 0% { + -moz-transform: rotate(0); + -moz-transform-origin: top left; + -moz-animation-timing-function: ease-in-out; + } + 20%, + 60% { + -moz-transform: rotate(80deg); + -moz-transform-origin: top left; + -moz-animation-timing-function: ease-in-out; + } + 40% { + -moz-transform: rotate(60deg); + -moz-transform-origin: top left; + -moz-animation-timing-function: ease-in-out; + } + 80% { + -moz-transform: rotate(60deg) translateY(0); + opacity: 1; + -moz-transform-origin: top left; + -moz-animation-timing-function: ease-in-out; + } + 100% { + -moz-transform: translateY(700px); + opacity: 0; + } +} + +@-o-keyframes hinge { + 0% { + -o-transform: rotate(0); + -o-transform-origin: top left; + -o-animation-timing-function: ease-in-out; + } + 20%, + 60% { + -o-transform: rotate(80deg); + -o-transform-origin: top left; + -o-animation-timing-function: ease-in-out; + } + 40% { + -o-transform: rotate(60deg); + -o-transform-origin: top left; + -o-animation-timing-function: ease-in-out; + } + 80% { + -o-transform: rotate(60deg) translateY(0); + opacity: 1; + -o-transform-origin: top left; + -o-animation-timing-function: ease-in-out; + } + 100% { + -o-transform: translateY(700px); + opacity: 0; + } +} + +@keyframes hinge { + 0% { + transform: rotate(0); + transform-origin: top left; + animation-timing-function: ease-in-out; + } + 20%, + 60% { + transform: rotate(80deg); + transform-origin: top left; + animation-timing-function: ease-in-out; + } + 40% { + transform: rotate(60deg); + transform-origin: top left; + animation-timing-function: ease-in-out; + } + 80% { + transform: rotate(60deg) translateY(0); + opacity: 1; + transform-origin: top left; + animation-timing-function: ease-in-out; + } + 100% { + transform: translateY(700px); + opacity: 0; + } +} + +.hinge { + -webkit-animation-name: hinge; + -moz-animation-name: hinge; + -o-animation-name: hinge; + animation-name: hinge; +} +/* originally authored by Nick Pettit - https://github.com/nickpettit/glide */ + +@-webkit-keyframes rollIn { + 0% { + opacity: 0; + -webkit-transform: translateX(-100%) rotate(-120deg); + } + 100% { + opacity: 1; + -webkit-transform: translateX(0px) rotate(0deg); + } +} + +@-moz-keyframes rollIn { + 0% { + opacity: 0; + -moz-transform: translateX(-100%) rotate(-120deg); + } + 100% { + opacity: 1; + -moz-transform: translateX(0px) rotate(0deg); + } +} + +@-o-keyframes rollIn { + 0% { + opacity: 0; + -o-transform: translateX(-100%) rotate(-120deg); + } + 100% { + opacity: 1; + 
-o-transform: translateX(0px) rotate(0deg); + } +} + +@keyframes rollIn { + 0% { + opacity: 0; + transform: translateX(-100%) rotate(-120deg); + } + 100% { + opacity: 1; + transform: translateX(0px) rotate(0deg); + } +} + +.rollIn { + -webkit-animation-name: rollIn; + -moz-animation-name: rollIn; + -o-animation-name: rollIn; + animation-name: rollIn; +} +/* originally authored by Nick Pettit - https://github.com/nickpettit/glide */ + +@-webkit-keyframes rollOut { + 0% { + opacity: 1; + -webkit-transform: translateX(0px) rotate(0deg); + } + + 100% { + opacity: 0; + -webkit-transform: translateX(100%) rotate(120deg); + } +} + +@-moz-keyframes rollOut { + 0% { + opacity: 1; + -moz-transform: translateX(0px) rotate(0deg); + } + + 100% { + opacity: 0; + -moz-transform: translateX(100%) rotate(120deg); + } +} + +@-o-keyframes rollOut { + 0% { + opacity: 1; + -o-transform: translateX(0px) rotate(0deg); + } + + 100% { + opacity: 0; + -o-transform: translateX(100%) rotate(120deg); + } +} + +@keyframes rollOut { + 0% { + opacity: 1; + transform: translateX(0px) rotate(0deg); + } + + 100% { + opacity: 0; + transform: translateX(100%) rotate(120deg); + } +} + +.rollOut { + -webkit-animation-name: rollOut; + -moz-animation-name: rollOut; + -o-animation-name: rollOut; + animation-name: rollOut; +} + +/* originally authored by Angelo Rohit - https://github.com/angelorohit */ + +@-webkit-keyframes lightSpeedIn { + 0% { + -webkit-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } + 60% { + -webkit-transform: translateX(-20%) skewX(30deg); + opacity: 1; + } + 80% { + -webkit-transform: translateX(0%) skewX(-15deg); + opacity: 1; + } + 100% { + -webkit-transform: translateX(0%) skewX(0deg); + opacity: 1; + } +} + +@-moz-keyframes lightSpeedIn { + 0% { + -moz-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } + 60% { + -moz-transform: translateX(-20%) skewX(30deg); + opacity: 1; + } + 80% { + -moz-transform: translateX(0%) skewX(-15deg); + opacity: 1; + } + 100% { + -moz-transform: translateX(0%) skewX(0deg); + opacity: 1; + } +} + +@-o-keyframes lightSpeedIn { + 0% { + -o-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } + 60% { + -o-transform: translateX(-20%) skewX(30deg); + opacity: 1; + } + 80% { + -o-transform: translateX(0%) skewX(-15deg); + opacity: 1; + } + 100% { + -o-transform: translateX(0%) skewX(0deg); + opacity: 1; + } +} + +@keyframes lightSpeedIn { + 0% { + transform: translateX(100%) skewX(-30deg); + opacity: 0; + } + 60% { + transform: translateX(-20%) skewX(30deg); + opacity: 1; + } + 80% { + transform: translateX(0%) skewX(-15deg); + opacity: 1; + } + 100% { + transform: translateX(0%) skewX(0deg); + opacity: 1; + } +} + +.lightSpeedIn { + -webkit-animation-name: lightSpeedIn; + -moz-animation-name: lightSpeedIn; + -o-animation-name: lightSpeedIn; + animation-name: lightSpeedIn; + + -webkit-animation-timing-function: ease-out; + -moz-animation-timing-function: ease-out; + -o-animation-timing-function: ease-out; + animation-timing-function: ease-out; +} + +.animated.lightSpeedIn { + -webkit-animation-duration: 0.5s; + -moz-animation-duration: 0.5s; + -o-animation-duration: 0.5s; + animation-duration: 0.5s; +} + +/* originally authored by Angelo Rohit - https://github.com/angelorohit */ + +@-webkit-keyframes lightSpeedOut { + 0% { + -webkit-transform: translateX(0%) skewX(0deg); + opacity: 1; + } + 100% { + -webkit-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } +} + +@-moz-keyframes lightSpeedOut { + 0% { + -moz-transform: translateX(0%) 
skewX(0deg); + opacity: 1; + } + 100% { + -moz-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } +} + +@-o-keyframes lightSpeedOut { + 0% { + -o-transform: translateX(0%) skewX(0deg); + opacity: 1; + } + 100% { + -o-transform: translateX(100%) skewX(-30deg); + opacity: 0; + } +} + +@keyframes lightSpeedOut { + 0% { + transform: translateX(0%) skewX(0deg); + opacity: 1; + } + 100% { + transform: translateX(100%) skewX(-30deg); + opacity: 0; + } +} + +.lightSpeedOut { + -webkit-animation-name: lightSpeedOut; + -moz-animation-name: lightSpeedOut; + -o-animation-name: lightSpeedOut; + animation-name: lightSpeedOut; + + -webkit-animation-timing-function: ease-in; + -moz-animation-timing-function: ease-in; + -o-animation-timing-function: ease-in; + animation-timing-function: ease-in; +} + +.animated.lightSpeedOut { + -webkit-animation-duration: 0.25s; + -moz-animation-duration: 0.25s; + -o-animation-duration: 0.25s; + animation-duration: 0.25s; +} + +/* originally authored by Angelo Rohit - https://github.com/angelorohit */ + +@-webkit-keyframes wiggle { + 0% { + -webkit-transform: skewX(9deg); + } + 10% { + -webkit-transform: skewX(-8deg); + } + 20% { + -webkit-transform: skewX(7deg); + } + 30% { + -webkit-transform: skewX(-6deg); + } + 40% { + -webkit-transform: skewX(5deg); + } + 50% { + -webkit-transform: skewX(-4deg); + } + 60% { + -webkit-transform: skewX(3deg); + } + 70% { + -webkit-transform: skewX(-2deg); + } + 80% { + -webkit-transform: skewX(1deg); + } + 90% { + -webkit-transform: skewX(0deg); + } + 100% { + -webkit-transform: skewX(0deg); + } +} + +@-moz-keyframes wiggle { + 0% { + -moz-transform: skewX(9deg); + } + 10% { + -moz-transform: skewX(-8deg); + } + 20% { + -moz-transform: skewX(7deg); + } + 30% { + -moz-transform: skewX(-6deg); + } + 40% { + -moz-transform: skewX(5deg); + } + 50% { + -moz-transform: skewX(-4deg); + } + 60% { + -moz-transform: skewX(3deg); + } + 70% { + -moz-transform: skewX(-2deg); + } + 80% { + -moz-transform: skewX(1deg); + } + 90% { + -moz-transform: skewX(0deg); + } + 100% { + -moz-transform: skewX(0deg); + } +} + +@-o-keyframes wiggle { + 0% { + -o-transform: skewX(9deg); + } + 10% { + -o-transform: skewX(-8deg); + } + 20% { + -o-transform: skewX(7deg); + } + 30% { + -o-transform: skewX(-6deg); + } + 40% { + -o-transform: skewX(5deg); + } + 50% { + -o-transform: skewX(-4deg); + } + 60% { + -o-transform: skewX(3deg); + } + 70% { + -o-transform: skewX(-2deg); + } + 80% { + -o-transform: skewX(1deg); + } + 90% { + -o-transform: skewX(0deg); + } + 100% { + -o-transform: skewX(0deg); + } +} + +@keyframes wiggle { + 0% { + transform: skewX(9deg); + } + 10% { + transform: skewX(-8deg); + } + 20% { + transform: skewX(7deg); + } + 30% { + transform: skewX(-6deg); + } + 40% { + transform: skewX(5deg); + } + 50% { + transform: skewX(-4deg); + } + 60% { + transform: skewX(3deg); + } + 70% { + transform: skewX(-2deg); + } + 80% { + transform: skewX(1deg); + } + 90% { + transform: skewX(0deg); + } + 100% { + transform: skewX(0deg); + } +} + +.wiggle { + -webkit-animation-name: wiggle; + -moz-animation-name: wiggle; + -o-animation-name: wiggle; + animation-name: wiggle; + + -webkit-animation-timing-function: ease-in; + -moz-animation-timing-function: ease-in; + -o-animation-timing-function: ease-in; + animation-timing-function: ease-in; +} + +.animated.wiggle { + -webkit-animation-duration: 0.75s; + -moz-animation-duration: 0.75s; + -o-animation-duration: 0.75s; + animation-duration: 0.75s; +} diff --git 
a/backend/tests/integration/tests/pruning/website/css/bootstrap.min.css b/backend/tests/integration/tests/pruning/website/css/bootstrap.min.css new file mode 100644 index 00000000000..8a7496b6f6a --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/bootstrap.min.css @@ -0,0 +1,6136 @@ +/*! + * Bootstrap v3.1.0 (http://getbootstrap.com) + * Copyright 2011-2014 Twitter, Inc. + * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */ + +/*! normalize.css v3.0.0 | MIT License | git.io/normalize */ +html { + font-family: sans-serif; + -ms-text-size-adjust: 100%; + -webkit-text-size-adjust: 100%; +} +body { + margin: 0; +} +article, +aside, +details, +figcaption, +figure, +footer, +header, +hgroup, +main, +nav, +section, +summary { + display: block; +} +audio, +canvas, +progress, +video { + display: inline-block; + vertical-align: baseline; +} +audio:not([controls]) { + display: none; + height: 0; +} +[hidden], +template { + display: none; +} +a { + background: 0 0; +} +a:active, +a:hover { + outline: 0; +} +abbr[title] { + border-bottom: 1px dotted; +} +b, +strong { + font-weight: 700; +} +dfn { + font-style: italic; +} +h1 { + font-size: 2em; + margin: 0.67em 0; +} +mark { + background: #ff0; + color: #000; +} +small { + font-size: 80%; +} +sub, +sup { + font-size: 75%; + line-height: 0; + position: relative; + vertical-align: baseline; +} +sup { + top: -0.5em; +} +sub { + bottom: -0.25em; +} +img { + border: 0; +} +svg:not(:root) { + overflow: hidden; +} +figure { + margin: 1em 40px; +} +hr { + -moz-box-sizing: content-box; + box-sizing: content-box; + height: 0; +} +pre { + overflow: auto; +} +code, +kbd, +pre, +samp { + font-family: monospace, monospace; + font-size: 1em; +} +button, +input, +optgroup, +select, +textarea { + color: inherit; + font: inherit; + margin: 0; +} +button { + overflow: visible; +} +button, +select { + text-transform: none; +} +button, +html input[type="button"], +input[type="reset"], +input[type="submit"] { + -webkit-appearance: button; + cursor: pointer; +} +button[disabled], +html input[disabled] { + cursor: default; +} +button::-moz-focus-inner, +input::-moz-focus-inner { + border: 0; + padding: 0; +} +input { + line-height: normal; +} +input[type="checkbox"], +input[type="radio"] { + box-sizing: border-box; + padding: 0; +} +input[type="number"]::-webkit-inner-spin-button, +input[type="number"]::-webkit-outer-spin-button { + height: auto; +} +input[type="search"] { + -webkit-appearance: textfield; + -moz-box-sizing: content-box; + -webkit-box-sizing: content-box; + box-sizing: content-box; +} +input[type="search"]::-webkit-search-cancel-button, +input[type="search"]::-webkit-search-decoration { + -webkit-appearance: none; +} +fieldset { + border: 1px solid silver; + margin: 0 2px; + padding: 0.35em 0.625em 0.75em; +} +legend { + border: 0; + padding: 0; +} +textarea { + overflow: auto; +} +optgroup { + font-weight: 700; +} +table { + border-collapse: collapse; + border-spacing: 0; +} +td, +th { + padding: 0; +} +@media print { + * { + text-shadow: none !important; + color: #000 !important; + background: transparent !important; + box-shadow: none !important; + } + a, + a:visited { + text-decoration: underline; + } + a[href]:after { + content: " (" attr(href) ")"; + } + abbr[title]:after { + content: " (" attr(title) ")"; + } + a[href^="javascript:"]:after, + a[href^="#"]:after { + content: ""; + } + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + } + thead { + display: table-header-group; + 
} + tr, + img { + page-break-inside: avoid; + } + img { + max-width: 100% !important; + } + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + h2, + h3 { + page-break-after: avoid; + } + select { + background: #fff !important; + } + .navbar { + display: none; + } + .table td, + .table th { + background-color: #fff !important; + } + .btn > .caret, + .dropup > .btn > .caret { + border-top-color: #000 !important; + } + .label { + border: 1px solid #000; + } + .table { + border-collapse: collapse !important; + } + .table-bordered th, + .table-bordered td { + border: 1px solid #ddd !important; + } +} +* { + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} +:before, +:after { + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} +html { + font-size: 62.5%; + -webkit-tap-highlight-color: rgba(0, 0, 0, 0); +} +body { + font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; + font-size: 14px; + line-height: 1.428571429; + color: #333; + background-color: #fff; +} +input, +button, +select, +textarea { + font-family: inherit; + font-size: inherit; + line-height: inherit; +} +a { + color: #428bca; + text-decoration: none; +} +a:hover, +a:focus { + color: #2a6496; + text-decoration: underline; +} +a:focus { + outline: thin dotted; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} +figure { + margin: 0; +} +img { + vertical-align: middle; +} +.img-responsive { + display: block; + max-width: 100%; + height: auto; +} +.img-rounded { + border-radius: 6px; +} +.img-thumbnail { + padding: 4px; + line-height: 1.428571429; + background-color: #fff; + border: 1px solid #ddd; + border-radius: 4px; + -webkit-transition: all 0.2s ease-in-out; + transition: all 0.2s ease-in-out; + display: inline-block; + max-width: 100%; + height: auto; +} +.img-circle { + border-radius: 50%; +} +hr { + margin-top: 20px; + margin-bottom: 20px; + border: 0; + border-top: 1px solid #eee; +} +.sr-only { + position: absolute; + width: 1px; + height: 1px; + margin: -1px; + padding: 0; + overflow: hidden; + clip: rect(0, 0, 0, 0); + border: 0; +} +h1, +h2, +h3, +h4, +h5, +h6, +.h1, +.h2, +.h3, +.h4, +.h5, +.h6 { + font-family: inherit; + font-weight: 500; + line-height: 1.1; + color: inherit; +} +h1 small, +h2 small, +h3 small, +h4 small, +h5 small, +h6 small, +.h1 small, +.h2 small, +.h3 small, +.h4 small, +.h5 small, +.h6 small, +h1 .small, +h2 .small, +h3 .small, +h4 .small, +h5 .small, +h6 .small, +.h1 .small, +.h2 .small, +.h3 .small, +.h4 .small, +.h5 .small, +.h6 .small { + font-weight: 400; + line-height: 1; + color: #999; +} +h1, +.h1, +h2, +.h2, +h3, +.h3 { + margin-top: 20px; + margin-bottom: 10px; +} +h1 small, +.h1 small, +h2 small, +.h2 small, +h3 small, +.h3 small, +h1 .small, +.h1 .small, +h2 .small, +.h2 .small, +h3 .small, +.h3 .small { + font-size: 65%; +} +h4, +.h4, +h5, +.h5, +h6, +.h6 { + margin-top: 10px; + margin-bottom: 10px; +} +h4 small, +.h4 small, +h5 small, +.h5 small, +h6 small, +.h6 small, +h4 .small, +.h4 .small, +h5 .small, +.h5 .small, +h6 .small, +.h6 .small { + font-size: 75%; +} +h1, +.h1 { + font-size: 36px; +} +h2, +.h2 { + font-size: 30px; +} +h3, +.h3 { + font-size: 24px; +} +h4, +.h4 { + font-size: 18px; +} +h5, +.h5 { + font-size: 14px; +} +h6, +.h6 { + font-size: 12px; +} +p { + margin: 0 0 10px; +} +.lead { + margin-bottom: 20px; + font-size: 16px; + font-weight: 200; + line-height: 1.4; +} +@media (min-width: 768px) { + .lead { + font-size: 21px; + } +} +small, +.small { + 
font-size: 85%; +} +cite { + font-style: normal; +} +.text-left { + text-align: left; +} +.text-right { + text-align: right; +} +.text-center { + text-align: center; +} +.text-justify { + text-align: justify; +} +.text-muted { + color: #999; +} +.text-primary { + color: #428bca; +} +a.text-primary:hover { + color: #3071a9; +} +.text-success { + color: #3c763d; +} +a.text-success:hover { + color: #2b542c; +} +.text-info { + color: #31708f; +} +a.text-info:hover { + color: #245269; +} +.text-warning { + color: #8a6d3b; +} +a.text-warning:hover { + color: #66512c; +} +.text-danger { + color: #a94442; +} +a.text-danger:hover { + color: #843534; +} +.bg-primary { + color: #fff; + background-color: #428bca; +} +a.bg-primary:hover { + background-color: #3071a9; +} +.bg-success { + background-color: #dff0d8; +} +a.bg-success:hover { + background-color: #c1e2b3; +} +.bg-info { + background-color: #d9edf7; +} +a.bg-info:hover { + background-color: #afd9ee; +} +.bg-warning { + background-color: #fcf8e3; +} +a.bg-warning:hover { + background-color: #f7ecb5; +} +.bg-danger { + background-color: #f2dede; +} +a.bg-danger:hover { + background-color: #e4b9b9; +} +.page-header { + padding-bottom: 9px; + margin: 40px 0 20px; + border-bottom: 1px solid #eee; +} +ul, +ol { + margin-top: 0; + margin-bottom: 10px; +} +ul ul, +ol ul, +ul ol, +ol ol { + margin-bottom: 0; +} +.list-unstyled { + padding-left: 0; + list-style: none; +} +.list-inline { + padding-left: 0; + list-style: none; +} +.list-inline > li { + display: inline-block; + padding-left: 5px; + padding-right: 5px; +} +.list-inline > li:first-child { + padding-left: 0; +} +dl { + margin-top: 0; + margin-bottom: 20px; +} +dt, +dd { + line-height: 1.428571429; +} +dt { + font-weight: 700; +} +dd { + margin-left: 0; +} +@media (min-width: 768px) { + .dl-horizontal dt { + float: left; + width: 160px; + clear: left; + text-align: right; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + } + .dl-horizontal dd { + margin-left: 180px; + } +} +abbr[title], +abbr[data-original-title] { + cursor: help; + border-bottom: 1px dotted #999; +} +.initialism { + font-size: 90%; + text-transform: uppercase; +} +blockquote { + padding: 10px 20px; + margin: 0 0 20px; + font-size: 17.5px; + border-left: 5px solid #eee; +} +blockquote p:last-child, +blockquote ul:last-child, +blockquote ol:last-child { + margin-bottom: 0; +} +blockquote footer, +blockquote small, +blockquote .small { + display: block; + font-size: 80%; + line-height: 1.428571429; + color: #999; +} +blockquote footer:before, +blockquote small:before, +blockquote .small:before { + content: "\2014 \00A0"; +} +.blockquote-reverse, +blockquote.pull-right { + padding-right: 15px; + padding-left: 0; + border-right: 5px solid #eee; + border-left: 0; + text-align: right; +} +.blockquote-reverse footer:before, +blockquote.pull-right footer:before, +.blockquote-reverse small:before, +blockquote.pull-right small:before, +.blockquote-reverse .small:before, +blockquote.pull-right .small:before { + content: ""; +} +.blockquote-reverse footer:after, +blockquote.pull-right footer:after, +.blockquote-reverse small:after, +blockquote.pull-right small:after, +.blockquote-reverse .small:after, +blockquote.pull-right .small:after { + content: "\00A0 \2014"; +} +blockquote:before, +blockquote:after { + content: ""; +} +address { + margin-bottom: 20px; + font-style: normal; + line-height: 1.428571429; +} +code, +kbd, +pre, +samp { + font-family: Menlo, Monaco, Consolas, "Courier New", monospace; +} +code { + 
padding: 2px 4px; + font-size: 90%; + color: #c7254e; + background-color: #f9f2f4; + white-space: nowrap; + border-radius: 4px; +} +kbd { + padding: 2px 4px; + font-size: 90%; + color: #fff; + background-color: #333; + border-radius: 3px; + box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.25); +} +pre { + display: block; + padding: 9.5px; + margin: 0 0 10px; + font-size: 13px; + line-height: 1.428571429; + word-break: break-all; + word-wrap: break-word; + color: #333; + background-color: #f5f5f5; + border: 1px solid #ccc; + border-radius: 4px; +} +pre code { + padding: 0; + font-size: inherit; + color: inherit; + white-space: pre-wrap; + background-color: transparent; + border-radius: 0; +} +.pre-scrollable { + max-height: 340px; + overflow-y: scroll; +} +.container { + margin-right: auto; + margin-left: auto; + padding-left: 15px; + padding-right: 15px; +} +@media (min-width: 768px) { + .container { + width: 750px; + } +} +@media (min-width: 992px) { + .container { + width: 970px; + } +} +@media (min-width: 1200px) { + .container { + width: 1170px; + } +} +.container-fluid { + margin-right: auto; + margin-left: auto; + padding-left: 15px; + padding-right: 15px; +} +.row { + margin-left: -15px; + margin-right: -15px; +} +.col-xs-1, +.col-sm-1, +.col-md-1, +.col-lg-1, +.col-xs-2, +.col-sm-2, +.col-md-2, +.col-lg-2, +.col-xs-3, +.col-sm-3, +.col-md-3, +.col-lg-3, +.col-xs-4, +.col-sm-4, +.col-md-4, +.col-lg-4, +.col-xs-5, +.col-sm-5, +.col-md-5, +.col-lg-5, +.col-xs-6, +.col-sm-6, +.col-md-6, +.col-lg-6, +.col-xs-7, +.col-sm-7, +.col-md-7, +.col-lg-7, +.col-xs-8, +.col-sm-8, +.col-md-8, +.col-lg-8, +.col-xs-9, +.col-sm-9, +.col-md-9, +.col-lg-9, +.col-xs-10, +.col-sm-10, +.col-md-10, +.col-lg-10, +.col-xs-11, +.col-sm-11, +.col-md-11, +.col-lg-11, +.col-xs-12, +.col-sm-12, +.col-md-12, +.col-lg-12 { + position: relative; + min-height: 1px; + padding-left: 15px; + padding-right: 15px; +} +.col-xs-1, +.col-xs-2, +.col-xs-3, +.col-xs-4, +.col-xs-5, +.col-xs-6, +.col-xs-7, +.col-xs-8, +.col-xs-9, +.col-xs-10, +.col-xs-11, +.col-xs-12 { + float: left; +} +.col-xs-12 { + width: 100%; +} +.col-xs-11 { + width: 91.66666666666666%; +} +.col-xs-10 { + width: 83.33333333333334%; +} +.col-xs-9 { + width: 75%; +} +.col-xs-8 { + width: 66.66666666666666%; +} +.col-xs-7 { + width: 58.333333333333336%; +} +.col-xs-6 { + width: 50%; +} +.col-xs-5 { + width: 41.66666666666667%; +} +.col-xs-4 { + width: 33.33333333333333%; +} +.col-xs-3 { + width: 25%; +} +.col-xs-2 { + width: 16.666666666666664%; +} +.col-xs-1 { + width: 8.333333333333332%; +} +.col-xs-pull-12 { + right: 100%; +} +.col-xs-pull-11 { + right: 91.66666666666666%; +} +.col-xs-pull-10 { + right: 83.33333333333334%; +} +.col-xs-pull-9 { + right: 75%; +} +.col-xs-pull-8 { + right: 66.66666666666666%; +} +.col-xs-pull-7 { + right: 58.333333333333336%; +} +.col-xs-pull-6 { + right: 50%; +} +.col-xs-pull-5 { + right: 41.66666666666667%; +} +.col-xs-pull-4 { + right: 33.33333333333333%; +} +.col-xs-pull-3 { + right: 25%; +} +.col-xs-pull-2 { + right: 16.666666666666664%; +} +.col-xs-pull-1 { + right: 8.333333333333332%; +} +.col-xs-pull-0 { + right: 0; +} +.col-xs-push-12 { + left: 100%; +} +.col-xs-push-11 { + left: 91.66666666666666%; +} +.col-xs-push-10 { + left: 83.33333333333334%; +} +.col-xs-push-9 { + left: 75%; +} +.col-xs-push-8 { + left: 66.66666666666666%; +} +.col-xs-push-7 { + left: 58.333333333333336%; +} +.col-xs-push-6 { + left: 50%; +} +.col-xs-push-5 { + left: 41.66666666666667%; +} +.col-xs-push-4 { + left: 33.33333333333333%; +} 
+.col-xs-push-3 { + left: 25%; +} +.col-xs-push-2 { + left: 16.666666666666664%; +} +.col-xs-push-1 { + left: 8.333333333333332%; +} +.col-xs-push-0 { + left: 0; +} +.col-xs-offset-12 { + margin-left: 100%; +} +.col-xs-offset-11 { + margin-left: 91.66666666666666%; +} +.col-xs-offset-10 { + margin-left: 83.33333333333334%; +} +.col-xs-offset-9 { + margin-left: 75%; +} +.col-xs-offset-8 { + margin-left: 66.66666666666666%; +} +.col-xs-offset-7 { + margin-left: 58.333333333333336%; +} +.col-xs-offset-6 { + margin-left: 50%; +} +.col-xs-offset-5 { + margin-left: 41.66666666666667%; +} +.col-xs-offset-4 { + margin-left: 33.33333333333333%; +} +.col-xs-offset-3 { + margin-left: 25%; +} +.col-xs-offset-2 { + margin-left: 16.666666666666664%; +} +.col-xs-offset-1 { + margin-left: 8.333333333333332%; +} +.col-xs-offset-0 { + margin-left: 0; +} +@media (min-width: 768px) { + .col-sm-1, + .col-sm-2, + .col-sm-3, + .col-sm-4, + .col-sm-5, + .col-sm-6, + .col-sm-7, + .col-sm-8, + .col-sm-9, + .col-sm-10, + .col-sm-11, + .col-sm-12 { + float: left; + } + .col-sm-12 { + width: 100%; + } + .col-sm-11 { + width: 91.66666666666666%; + } + .col-sm-10 { + width: 83.33333333333334%; + } + .col-sm-9 { + width: 75%; + } + .col-sm-8 { + width: 66.66666666666666%; + } + .col-sm-7 { + width: 58.333333333333336%; + } + .col-sm-6 { + width: 50%; + } + .col-sm-5 { + width: 41.66666666666667%; + } + .col-sm-4 { + width: 33.33333333333333%; + } + .col-sm-3 { + width: 25%; + } + .col-sm-2 { + width: 16.666666666666664%; + } + .col-sm-1 { + width: 8.333333333333332%; + } + .col-sm-pull-12 { + right: 100%; + } + .col-sm-pull-11 { + right: 91.66666666666666%; + } + .col-sm-pull-10 { + right: 83.33333333333334%; + } + .col-sm-pull-9 { + right: 75%; + } + .col-sm-pull-8 { + right: 66.66666666666666%; + } + .col-sm-pull-7 { + right: 58.333333333333336%; + } + .col-sm-pull-6 { + right: 50%; + } + .col-sm-pull-5 { + right: 41.66666666666667%; + } + .col-sm-pull-4 { + right: 33.33333333333333%; + } + .col-sm-pull-3 { + right: 25%; + } + .col-sm-pull-2 { + right: 16.666666666666664%; + } + .col-sm-pull-1 { + right: 8.333333333333332%; + } + .col-sm-pull-0 { + right: 0; + } + .col-sm-push-12 { + left: 100%; + } + .col-sm-push-11 { + left: 91.66666666666666%; + } + .col-sm-push-10 { + left: 83.33333333333334%; + } + .col-sm-push-9 { + left: 75%; + } + .col-sm-push-8 { + left: 66.66666666666666%; + } + .col-sm-push-7 { + left: 58.333333333333336%; + } + .col-sm-push-6 { + left: 50%; + } + .col-sm-push-5 { + left: 41.66666666666667%; + } + .col-sm-push-4 { + left: 33.33333333333333%; + } + .col-sm-push-3 { + left: 25%; + } + .col-sm-push-2 { + left: 16.666666666666664%; + } + .col-sm-push-1 { + left: 8.333333333333332%; + } + .col-sm-push-0 { + left: 0; + } + .col-sm-offset-12 { + margin-left: 100%; + } + .col-sm-offset-11 { + margin-left: 91.66666666666666%; + } + .col-sm-offset-10 { + margin-left: 83.33333333333334%; + } + .col-sm-offset-9 { + margin-left: 75%; + } + .col-sm-offset-8 { + margin-left: 66.66666666666666%; + } + .col-sm-offset-7 { + margin-left: 58.333333333333336%; + } + .col-sm-offset-6 { + margin-left: 50%; + } + .col-sm-offset-5 { + margin-left: 41.66666666666667%; + } + .col-sm-offset-4 { + margin-left: 33.33333333333333%; + } + .col-sm-offset-3 { + margin-left: 25%; + } + .col-sm-offset-2 { + margin-left: 16.666666666666664%; + } + .col-sm-offset-1 { + margin-left: 8.333333333333332%; + } + .col-sm-offset-0 { + margin-left: 0; + } +} +@media (min-width: 992px) { + .col-md-1, + .col-md-2, + .col-md-3, + 
.col-md-4, + .col-md-5, + .col-md-6, + .col-md-7, + .col-md-8, + .col-md-9, + .col-md-10, + .col-md-11, + .col-md-12 { + float: left; + } + .col-md-12 { + width: 100%; + } + .col-md-11 { + width: 91.66666666666666%; + } + .col-md-10 { + width: 83.33333333333334%; + } + .col-md-9 { + width: 75%; + } + .col-md-8 { + width: 66.66666666666666%; + } + .col-md-7 { + width: 58.333333333333336%; + } + .col-md-6 { + width: 50%; + } + .col-md-5 { + width: 41.66666666666667%; + } + .col-md-4 { + width: 33.33333333333333%; + } + .col-md-3 { + width: 25%; + } + .col-md-2 { + width: 16.666666666666664%; + } + .col-md-1 { + width: 8.333333333333332%; + } + .col-md-pull-12 { + right: 100%; + } + .col-md-pull-11 { + right: 91.66666666666666%; + } + .col-md-pull-10 { + right: 83.33333333333334%; + } + .col-md-pull-9 { + right: 75%; + } + .col-md-pull-8 { + right: 66.66666666666666%; + } + .col-md-pull-7 { + right: 58.333333333333336%; + } + .col-md-pull-6 { + right: 50%; + } + .col-md-pull-5 { + right: 41.66666666666667%; + } + .col-md-pull-4 { + right: 33.33333333333333%; + } + .col-md-pull-3 { + right: 25%; + } + .col-md-pull-2 { + right: 16.666666666666664%; + } + .col-md-pull-1 { + right: 8.333333333333332%; + } + .col-md-pull-0 { + right: 0; + } + .col-md-push-12 { + left: 100%; + } + .col-md-push-11 { + left: 91.66666666666666%; + } + .col-md-push-10 { + left: 83.33333333333334%; + } + .col-md-push-9 { + left: 75%; + } + .col-md-push-8 { + left: 66.66666666666666%; + } + .col-md-push-7 { + left: 58.333333333333336%; + } + .col-md-push-6 { + left: 50%; + } + .col-md-push-5 { + left: 41.66666666666667%; + } + .col-md-push-4 { + left: 33.33333333333333%; + } + .col-md-push-3 { + left: 25%; + } + .col-md-push-2 { + left: 16.666666666666664%; + } + .col-md-push-1 { + left: 8.333333333333332%; + } + .col-md-push-0 { + left: 0; + } + .col-md-offset-12 { + margin-left: 100%; + } + .col-md-offset-11 { + margin-left: 91.66666666666666%; + } + .col-md-offset-10 { + margin-left: 83.33333333333334%; + } + .col-md-offset-9 { + margin-left: 75%; + } + .col-md-offset-8 { + margin-left: 66.66666666666666%; + } + .col-md-offset-7 { + margin-left: 58.333333333333336%; + } + .col-md-offset-6 { + margin-left: 50%; + } + .col-md-offset-5 { + margin-left: 41.66666666666667%; + } + .col-md-offset-4 { + margin-left: 33.33333333333333%; + } + .col-md-offset-3 { + margin-left: 25%; + } + .col-md-offset-2 { + margin-left: 16.666666666666664%; + } + .col-md-offset-1 { + margin-left: 8.333333333333332%; + } + .col-md-offset-0 { + margin-left: 0; + } +} +@media (min-width: 1200px) { + .col-lg-1, + .col-lg-2, + .col-lg-3, + .col-lg-4, + .col-lg-5, + .col-lg-6, + .col-lg-7, + .col-lg-8, + .col-lg-9, + .col-lg-10, + .col-lg-11, + .col-lg-12 { + float: left; + } + .col-lg-12 { + width: 100%; + } + .col-lg-11 { + width: 91.66666666666666%; + } + .col-lg-10 { + width: 83.33333333333334%; + } + .col-lg-9 { + width: 75%; + } + .col-lg-8 { + width: 66.66666666666666%; + } + .col-lg-7 { + width: 58.333333333333336%; + } + .col-lg-6 { + width: 50%; + } + .col-lg-5 { + width: 41.66666666666667%; + } + .col-lg-4 { + width: 33.33333333333333%; + } + .col-lg-3 { + width: 25%; + } + .col-lg-2 { + width: 16.666666666666664%; + } + .col-lg-1 { + width: 8.333333333333332%; + } + .col-lg-pull-12 { + right: 100%; + } + .col-lg-pull-11 { + right: 91.66666666666666%; + } + .col-lg-pull-10 { + right: 83.33333333333334%; + } + .col-lg-pull-9 { + right: 75%; + } + .col-lg-pull-8 { + right: 66.66666666666666%; + } + .col-lg-pull-7 { + right: 
58.333333333333336%; + } + .col-lg-pull-6 { + right: 50%; + } + .col-lg-pull-5 { + right: 41.66666666666667%; + } + .col-lg-pull-4 { + right: 33.33333333333333%; + } + .col-lg-pull-3 { + right: 25%; + } + .col-lg-pull-2 { + right: 16.666666666666664%; + } + .col-lg-pull-1 { + right: 8.333333333333332%; + } + .col-lg-pull-0 { + right: 0; + } + .col-lg-push-12 { + left: 100%; + } + .col-lg-push-11 { + left: 91.66666666666666%; + } + .col-lg-push-10 { + left: 83.33333333333334%; + } + .col-lg-push-9 { + left: 75%; + } + .col-lg-push-8 { + left: 66.66666666666666%; + } + .col-lg-push-7 { + left: 58.333333333333336%; + } + .col-lg-push-6 { + left: 50%; + } + .col-lg-push-5 { + left: 41.66666666666667%; + } + .col-lg-push-4 { + left: 33.33333333333333%; + } + .col-lg-push-3 { + left: 25%; + } + .col-lg-push-2 { + left: 16.666666666666664%; + } + .col-lg-push-1 { + left: 8.333333333333332%; + } + .col-lg-push-0 { + left: 0; + } + .col-lg-offset-12 { + margin-left: 100%; + } + .col-lg-offset-11 { + margin-left: 91.66666666666666%; + } + .col-lg-offset-10 { + margin-left: 83.33333333333334%; + } + .col-lg-offset-9 { + margin-left: 75%; + } + .col-lg-offset-8 { + margin-left: 66.66666666666666%; + } + .col-lg-offset-7 { + margin-left: 58.333333333333336%; + } + .col-lg-offset-6 { + margin-left: 50%; + } + .col-lg-offset-5 { + margin-left: 41.66666666666667%; + } + .col-lg-offset-4 { + margin-left: 33.33333333333333%; + } + .col-lg-offset-3 { + margin-left: 25%; + } + .col-lg-offset-2 { + margin-left: 16.666666666666664%; + } + .col-lg-offset-1 { + margin-left: 8.333333333333332%; + } + .col-lg-offset-0 { + margin-left: 0; + } +} +table { + max-width: 100%; + background-color: transparent; +} +th { + text-align: left; +} +.table { + width: 100%; + margin-bottom: 20px; +} +.table > thead > tr > th, +.table > tbody > tr > th, +.table > tfoot > tr > th, +.table > thead > tr > td, +.table > tbody > tr > td, +.table > tfoot > tr > td { + padding: 8px; + line-height: 1.428571429; + vertical-align: top; + border-top: 1px solid #ddd; +} +.table > thead > tr > th { + vertical-align: bottom; + border-bottom: 2px solid #ddd; +} +.table > caption + thead > tr:first-child > th, +.table > colgroup + thead > tr:first-child > th, +.table > thead:first-child > tr:first-child > th, +.table > caption + thead > tr:first-child > td, +.table > colgroup + thead > tr:first-child > td, +.table > thead:first-child > tr:first-child > td { + border-top: 0; +} +.table > tbody + tbody { + border-top: 2px solid #ddd; +} +.table .table { + background-color: #fff; +} +.table-condensed > thead > tr > th, +.table-condensed > tbody > tr > th, +.table-condensed > tfoot > tr > th, +.table-condensed > thead > tr > td, +.table-condensed > tbody > tr > td, +.table-condensed > tfoot > tr > td { + padding: 5px; +} +.table-bordered { + border: 1px solid #ddd; +} +.table-bordered > thead > tr > th, +.table-bordered > tbody > tr > th, +.table-bordered > tfoot > tr > th, +.table-bordered > thead > tr > td, +.table-bordered > tbody > tr > td, +.table-bordered > tfoot > tr > td { + border: 1px solid #ddd; +} +.table-bordered > thead > tr > th, +.table-bordered > thead > tr > td { + border-bottom-width: 2px; +} +.table-striped > tbody > tr:nth-child(odd) > td, +.table-striped > tbody > tr:nth-child(odd) > th { + background-color: #f9f9f9; +} +.table-hover > tbody > tr:hover > td, +.table-hover > tbody > tr:hover > th { + background-color: #f5f5f5; +} +table col[class*="col-"] { + position: static; + float: none; + display: table-column; +} +table 
td[class*="col-"], +table th[class*="col-"] { + position: static; + float: none; + display: table-cell; +} +.table > thead > tr > td.active, +.table > tbody > tr > td.active, +.table > tfoot > tr > td.active, +.table > thead > tr > th.active, +.table > tbody > tr > th.active, +.table > tfoot > tr > th.active, +.table > thead > tr.active > td, +.table > tbody > tr.active > td, +.table > tfoot > tr.active > td, +.table > thead > tr.active > th, +.table > tbody > tr.active > th, +.table > tfoot > tr.active > th { + background-color: #f5f5f5; +} +.table-hover > tbody > tr > td.active:hover, +.table-hover > tbody > tr > th.active:hover, +.table-hover > tbody > tr.active:hover > td, +.table-hover > tbody > tr.active:hover > th { + background-color: #e8e8e8; +} +.table > thead > tr > td.success, +.table > tbody > tr > td.success, +.table > tfoot > tr > td.success, +.table > thead > tr > th.success, +.table > tbody > tr > th.success, +.table > tfoot > tr > th.success, +.table > thead > tr.success > td, +.table > tbody > tr.success > td, +.table > tfoot > tr.success > td, +.table > thead > tr.success > th, +.table > tbody > tr.success > th, +.table > tfoot > tr.success > th { + background-color: #dff0d8; +} +.table-hover > tbody > tr > td.success:hover, +.table-hover > tbody > tr > th.success:hover, +.table-hover > tbody > tr.success:hover > td, +.table-hover > tbody > tr.success:hover > th { + background-color: #d0e9c6; +} +.table > thead > tr > td.info, +.table > tbody > tr > td.info, +.table > tfoot > tr > td.info, +.table > thead > tr > th.info, +.table > tbody > tr > th.info, +.table > tfoot > tr > th.info, +.table > thead > tr.info > td, +.table > tbody > tr.info > td, +.table > tfoot > tr.info > td, +.table > thead > tr.info > th, +.table > tbody > tr.info > th, +.table > tfoot > tr.info > th { + background-color: #d9edf7; +} +.table-hover > tbody > tr > td.info:hover, +.table-hover > tbody > tr > th.info:hover, +.table-hover > tbody > tr.info:hover > td, +.table-hover > tbody > tr.info:hover > th { + background-color: #c4e3f3; +} +.table > thead > tr > td.warning, +.table > tbody > tr > td.warning, +.table > tfoot > tr > td.warning, +.table > thead > tr > th.warning, +.table > tbody > tr > th.warning, +.table > tfoot > tr > th.warning, +.table > thead > tr.warning > td, +.table > tbody > tr.warning > td, +.table > tfoot > tr.warning > td, +.table > thead > tr.warning > th, +.table > tbody > tr.warning > th, +.table > tfoot > tr.warning > th { + background-color: #fcf8e3; +} +.table-hover > tbody > tr > td.warning:hover, +.table-hover > tbody > tr > th.warning:hover, +.table-hover > tbody > tr.warning:hover > td, +.table-hover > tbody > tr.warning:hover > th { + background-color: #faf2cc; +} +.table > thead > tr > td.danger, +.table > tbody > tr > td.danger, +.table > tfoot > tr > td.danger, +.table > thead > tr > th.danger, +.table > tbody > tr > th.danger, +.table > tfoot > tr > th.danger, +.table > thead > tr.danger > td, +.table > tbody > tr.danger > td, +.table > tfoot > tr.danger > td, +.table > thead > tr.danger > th, +.table > tbody > tr.danger > th, +.table > tfoot > tr.danger > th { + background-color: #f2dede; +} +.table-hover > tbody > tr > td.danger:hover, +.table-hover > tbody > tr > th.danger:hover, +.table-hover > tbody > tr.danger:hover > td, +.table-hover > tbody > tr.danger:hover > th { + background-color: #ebcccc; +} +@media (max-width: 767px) { + .table-responsive { + width: 100%; + margin-bottom: 15px; + overflow-y: hidden; + overflow-x: scroll; + -ms-overflow-style: 
-ms-autohiding-scrollbar; + border: 1px solid #ddd; + -webkit-overflow-scrolling: touch; + } + .table-responsive > .table { + margin-bottom: 0; + } + .table-responsive > .table > thead > tr > th, + .table-responsive > .table > tbody > tr > th, + .table-responsive > .table > tfoot > tr > th, + .table-responsive > .table > thead > tr > td, + .table-responsive > .table > tbody > tr > td, + .table-responsive > .table > tfoot > tr > td { + white-space: nowrap; + } + .table-responsive > .table-bordered { + border: 0; + } + .table-responsive > .table-bordered > thead > tr > th:first-child, + .table-responsive > .table-bordered > tbody > tr > th:first-child, + .table-responsive > .table-bordered > tfoot > tr > th:first-child, + .table-responsive > .table-bordered > thead > tr > td:first-child, + .table-responsive > .table-bordered > tbody > tr > td:first-child, + .table-responsive > .table-bordered > tfoot > tr > td:first-child { + border-left: 0; + } + .table-responsive > .table-bordered > thead > tr > th:last-child, + .table-responsive > .table-bordered > tbody > tr > th:last-child, + .table-responsive > .table-bordered > tfoot > tr > th:last-child, + .table-responsive > .table-bordered > thead > tr > td:last-child, + .table-responsive > .table-bordered > tbody > tr > td:last-child, + .table-responsive > .table-bordered > tfoot > tr > td:last-child { + border-right: 0; + } + .table-responsive > .table-bordered > tbody > tr:last-child > th, + .table-responsive > .table-bordered > tfoot > tr:last-child > th, + .table-responsive > .table-bordered > tbody > tr:last-child > td, + .table-responsive > .table-bordered > tfoot > tr:last-child > td { + border-bottom: 0; + } +} +fieldset { + padding: 0; + margin: 0; + border: 0; + min-width: 0; +} +legend { + display: block; + width: 100%; + padding: 0; + margin-bottom: 20px; + font-size: 21px; + line-height: inherit; + color: #333; + border: 0; + border-bottom: 1px solid #e5e5e5; +} +label { + display: inline-block; + margin-bottom: 5px; + font-weight: 700; +} +input[type="search"] { + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} +input[type="radio"], +input[type="checkbox"] { + margin: 4px 0 0; + margin-top: 1px \9; + line-height: normal; +} +input[type="file"] { + display: block; +} +input[type="range"] { + display: block; + width: 100%; +} +select[multiple], +select[size] { + height: auto; +} +input[type="file"]:focus, +input[type="radio"]:focus, +input[type="checkbox"]:focus { + outline: thin dotted; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} +output { + display: block; + padding-top: 7px; + font-size: 14px; + line-height: 1.428571429; + color: #555; +} +.form-control { + display: block; + width: 100%; + height: 34px; + padding: 6px 12px; + font-size: 14px; + line-height: 1.428571429; + color: #555; + background-color: #fff; + background-image: none; + border: 1px solid #ccc; + border-radius: 4px; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + -webkit-transition: + border-color ease-in-out 0.15s, + box-shadow ease-in-out 0.15s; + transition: + border-color ease-in-out 0.15s, + box-shadow ease-in-out 0.15s; +} +.form-control:focus { + border-color: #66afe9; + outline: 0; + -webkit-box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 8px rgba(102, 175, 233, 0.6); + box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 8px rgba(102, 175, 233, 0.6); +} +.form-control:-moz-placeholder { + color: #999; +} 
+.form-control::-moz-placeholder { + color: #999; + opacity: 1; +} +.form-control:-ms-input-placeholder { + color: #999; +} +.form-control::-webkit-input-placeholder { + color: #999; +} +.form-control[disabled], +.form-control[readonly], +fieldset[disabled] .form-control { + cursor: not-allowed; + background-color: #eee; + opacity: 1; +} +textarea.form-control { + height: auto; +} +input[type="date"] { + line-height: 34px; +} +.form-group { + margin-bottom: 15px; +} +.radio, +.checkbox { + display: block; + min-height: 20px; + margin-top: 10px; + margin-bottom: 10px; + padding-left: 20px; +} +.radio label, +.checkbox label { + display: inline; + font-weight: 400; + cursor: pointer; +} +.radio input[type="radio"], +.radio-inline input[type="radio"], +.checkbox input[type="checkbox"], +.checkbox-inline input[type="checkbox"] { + float: left; + margin-left: -20px; +} +.radio + .radio, +.checkbox + .checkbox { + margin-top: -5px; +} +.radio-inline, +.checkbox-inline { + display: inline-block; + padding-left: 20px; + margin-bottom: 0; + vertical-align: middle; + font-weight: 400; + cursor: pointer; +} +.radio-inline + .radio-inline, +.checkbox-inline + .checkbox-inline { + margin-top: 0; + margin-left: 10px; +} +input[type="radio"][disabled], +input[type="checkbox"][disabled], +.radio[disabled], +.radio-inline[disabled], +.checkbox[disabled], +.checkbox-inline[disabled], +fieldset[disabled] input[type="radio"], +fieldset[disabled] input[type="checkbox"], +fieldset[disabled] .radio, +fieldset[disabled] .radio-inline, +fieldset[disabled] .checkbox, +fieldset[disabled] .checkbox-inline { + cursor: not-allowed; +} +.input-sm { + height: 30px; + padding: 5px 10px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +select.input-sm { + height: 30px; + line-height: 30px; +} +textarea.input-sm, +select[multiple].input-sm { + height: auto; +} +.input-lg { + height: 46px; + padding: 10px 16px; + font-size: 18px; + line-height: 1.33; + border-radius: 6px; +} +select.input-lg { + height: 46px; + line-height: 46px; +} +textarea.input-lg, +select[multiple].input-lg { + height: auto; +} +.has-feedback { + position: relative; +} +.has-feedback .form-control { + padding-right: 42.5px; +} +.has-feedback .form-control-feedback { + position: absolute; + top: 25px; + right: 0; + display: block; + width: 34px; + height: 34px; + line-height: 34px; + text-align: center; +} +.has-success .help-block, +.has-success .control-label, +.has-success .radio, +.has-success .checkbox, +.has-success .radio-inline, +.has-success .checkbox-inline { + color: #3c763d; +} +.has-success .form-control { + border-color: #3c763d; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} +.has-success .form-control:focus { + border-color: #2b542c; + -webkit-box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #67b168; + box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #67b168; +} +.has-success .input-group-addon { + color: #3c763d; + border-color: #3c763d; + background-color: #dff0d8; +} +.has-success .form-control-feedback { + color: #3c763d; +} +.has-warning .help-block, +.has-warning .control-label, +.has-warning .radio, +.has-warning .checkbox, +.has-warning .radio-inline, +.has-warning .checkbox-inline { + color: #8a6d3b; +} +.has-warning .form-control { + border-color: #8a6d3b; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} +.has-warning .form-control:focus { + border-color: 
#66512c; + -webkit-box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #c0a16b; + box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #c0a16b; +} +.has-warning .input-group-addon { + color: #8a6d3b; + border-color: #8a6d3b; + background-color: #fcf8e3; +} +.has-warning .form-control-feedback { + color: #8a6d3b; +} +.has-error .help-block, +.has-error .control-label, +.has-error .radio, +.has-error .checkbox, +.has-error .radio-inline, +.has-error .checkbox-inline { + color: #a94442; +} +.has-error .form-control { + border-color: #a94442; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.075); +} +.has-error .form-control:focus { + border-color: #843534; + -webkit-box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #ce8483; + box-shadow: + inset 0 1px 1px rgba(0, 0, 0, 0.075), + 0 0 6px #ce8483; +} +.has-error .input-group-addon { + color: #a94442; + border-color: #a94442; + background-color: #f2dede; +} +.has-error .form-control-feedback { + color: #a94442; +} +.form-control-static { + margin-bottom: 0; +} +.help-block { + display: block; + margin-top: 5px; + margin-bottom: 10px; + color: #737373; +} +@media (min-width: 768px) { + .form-inline .form-group { + display: inline-block; + margin-bottom: 0; + vertical-align: middle; + } + .form-inline .form-control { + display: inline-block; + width: auto; + vertical-align: middle; + } + .form-inline .control-label { + margin-bottom: 0; + vertical-align: middle; + } + .form-inline .radio, + .form-inline .checkbox { + display: inline-block; + margin-top: 0; + margin-bottom: 0; + padding-left: 0; + vertical-align: middle; + } + .form-inline .radio input[type="radio"], + .form-inline .checkbox input[type="checkbox"] { + float: none; + margin-left: 0; + } + .form-inline .has-feedback .form-control-feedback { + top: 0; + } +} +.form-horizontal .control-label, +.form-horizontal .radio, +.form-horizontal .checkbox, +.form-horizontal .radio-inline, +.form-horizontal .checkbox-inline { + margin-top: 0; + margin-bottom: 0; + padding-top: 7px; +} +.form-horizontal .radio, +.form-horizontal .checkbox { + min-height: 27px; +} +.form-horizontal .form-group { + margin-left: -15px; + margin-right: -15px; +} +.form-horizontal .form-control-static { + padding-top: 7px; +} +@media (min-width: 768px) { + .form-horizontal .control-label { + text-align: right; + } +} +.form-horizontal .has-feedback .form-control-feedback { + top: 0; + right: 15px; +} +.btn { + display: inline-block; + margin-bottom: 0; + font-weight: 400; + text-align: center; + vertical-align: middle; + cursor: pointer; + background-image: none; + border: 1px solid transparent; + white-space: nowrap; + padding: 6px 12px; + font-size: 14px; + line-height: 1.428571429; + border-radius: 4px; + -webkit-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + -o-user-select: none; + user-select: none; +} +.btn:focus { + outline: thin dotted; + outline: 5px auto -webkit-focus-ring-color; + outline-offset: -2px; +} +.btn:hover, +.btn:focus { + color: #333; + text-decoration: none; +} +.btn:active, +.btn.active { + outline: 0; + background-image: none; + -webkit-box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); +} +.btn.disabled, +.btn[disabled], +fieldset[disabled] .btn { + cursor: not-allowed; + pointer-events: none; + opacity: 0.65; + filter: alpha(opacity=65); + -webkit-box-shadow: none; + box-shadow: none; +} +.btn-default { + color: #333; + 
background-color: #fff; + border-color: #ccc; +} +.btn-default:hover, +.btn-default:focus, +.btn-default:active, +.btn-default.active, +.open .dropdown-toggle.btn-default { + color: #333; + background-color: #ebebeb; + border-color: #adadad; +} +.btn-default:active, +.btn-default.active, +.open .dropdown-toggle.btn-default { + background-image: none; +} +.btn-default.disabled, +.btn-default[disabled], +fieldset[disabled] .btn-default, +.btn-default.disabled:hover, +.btn-default[disabled]:hover, +fieldset[disabled] .btn-default:hover, +.btn-default.disabled:focus, +.btn-default[disabled]:focus, +fieldset[disabled] .btn-default:focus, +.btn-default.disabled:active, +.btn-default[disabled]:active, +fieldset[disabled] .btn-default:active, +.btn-default.disabled.active, +.btn-default[disabled].active, +fieldset[disabled] .btn-default.active { + background-color: #fff; + border-color: #ccc; +} +.btn-default .badge { + color: #fff; + background-color: #333; +} +.btn-primary { + color: #fff; + background-color: #428bca; + border-color: #357ebd; +} +.btn-primary:hover, +.btn-primary:focus, +.btn-primary:active, +.btn-primary.active, +.open .dropdown-toggle.btn-primary { + color: #fff; + background-color: #3276b1; + border-color: #285e8e; +} +.btn-primary:active, +.btn-primary.active, +.open .dropdown-toggle.btn-primary { + background-image: none; +} +.btn-primary.disabled, +.btn-primary[disabled], +fieldset[disabled] .btn-primary, +.btn-primary.disabled:hover, +.btn-primary[disabled]:hover, +fieldset[disabled] .btn-primary:hover, +.btn-primary.disabled:focus, +.btn-primary[disabled]:focus, +fieldset[disabled] .btn-primary:focus, +.btn-primary.disabled:active, +.btn-primary[disabled]:active, +fieldset[disabled] .btn-primary:active, +.btn-primary.disabled.active, +.btn-primary[disabled].active, +fieldset[disabled] .btn-primary.active { + background-color: #428bca; + border-color: #357ebd; +} +.btn-primary .badge { + color: #428bca; + background-color: #fff; +} +.btn-success { + color: #fff; + background-color: #5cb85c; + border-color: #4cae4c; +} +.btn-success:hover, +.btn-success:focus, +.btn-success:active, +.btn-success.active, +.open .dropdown-toggle.btn-success { + color: #fff; + background-color: #47a447; + border-color: #398439; +} +.btn-success:active, +.btn-success.active, +.open .dropdown-toggle.btn-success { + background-image: none; +} +.btn-success.disabled, +.btn-success[disabled], +fieldset[disabled] .btn-success, +.btn-success.disabled:hover, +.btn-success[disabled]:hover, +fieldset[disabled] .btn-success:hover, +.btn-success.disabled:focus, +.btn-success[disabled]:focus, +fieldset[disabled] .btn-success:focus, +.btn-success.disabled:active, +.btn-success[disabled]:active, +fieldset[disabled] .btn-success:active, +.btn-success.disabled.active, +.btn-success[disabled].active, +fieldset[disabled] .btn-success.active { + background-color: #5cb85c; + border-color: #4cae4c; +} +.btn-success .badge { + color: #5cb85c; + background-color: #fff; +} +.btn-info { + color: #fff; + background-color: #5bc0de; + border-color: #46b8da; +} +.btn-info:hover, +.btn-info:focus, +.btn-info:active, +.btn-info.active, +.open .dropdown-toggle.btn-info { + color: #fff; + background-color: #39b3d7; + border-color: #269abc; +} +.btn-info:active, +.btn-info.active, +.open .dropdown-toggle.btn-info { + background-image: none; +} +.btn-info.disabled, +.btn-info[disabled], +fieldset[disabled] .btn-info, +.btn-info.disabled:hover, +.btn-info[disabled]:hover, +fieldset[disabled] .btn-info:hover, 
+.btn-info.disabled:focus, +.btn-info[disabled]:focus, +fieldset[disabled] .btn-info:focus, +.btn-info.disabled:active, +.btn-info[disabled]:active, +fieldset[disabled] .btn-info:active, +.btn-info.disabled.active, +.btn-info[disabled].active, +fieldset[disabled] .btn-info.active { + background-color: #5bc0de; + border-color: #46b8da; +} +.btn-info .badge { + color: #5bc0de; + background-color: #fff; +} +.btn-warning { + color: #fff; + background-color: #f0ad4e; + border-color: #eea236; +} +.btn-warning:hover, +.btn-warning:focus, +.btn-warning:active, +.btn-warning.active, +.open .dropdown-toggle.btn-warning { + color: #fff; + background-color: #ed9c28; + border-color: #d58512; +} +.btn-warning:active, +.btn-warning.active, +.open .dropdown-toggle.btn-warning { + background-image: none; +} +.btn-warning.disabled, +.btn-warning[disabled], +fieldset[disabled] .btn-warning, +.btn-warning.disabled:hover, +.btn-warning[disabled]:hover, +fieldset[disabled] .btn-warning:hover, +.btn-warning.disabled:focus, +.btn-warning[disabled]:focus, +fieldset[disabled] .btn-warning:focus, +.btn-warning.disabled:active, +.btn-warning[disabled]:active, +fieldset[disabled] .btn-warning:active, +.btn-warning.disabled.active, +.btn-warning[disabled].active, +fieldset[disabled] .btn-warning.active { + background-color: #f0ad4e; + border-color: #eea236; +} +.btn-warning .badge { + color: #f0ad4e; + background-color: #fff; +} +.btn-danger { + color: #fff; + background-color: #d9534f; + border-color: #d43f3a; +} +.btn-danger:hover, +.btn-danger:focus, +.btn-danger:active, +.btn-danger.active, +.open .dropdown-toggle.btn-danger { + color: #fff; + background-color: #d2322d; + border-color: #ac2925; +} +.btn-danger:active, +.btn-danger.active, +.open .dropdown-toggle.btn-danger { + background-image: none; +} +.btn-danger.disabled, +.btn-danger[disabled], +fieldset[disabled] .btn-danger, +.btn-danger.disabled:hover, +.btn-danger[disabled]:hover, +fieldset[disabled] .btn-danger:hover, +.btn-danger.disabled:focus, +.btn-danger[disabled]:focus, +fieldset[disabled] .btn-danger:focus, +.btn-danger.disabled:active, +.btn-danger[disabled]:active, +fieldset[disabled] .btn-danger:active, +.btn-danger.disabled.active, +.btn-danger[disabled].active, +fieldset[disabled] .btn-danger.active { + background-color: #d9534f; + border-color: #d43f3a; +} +.btn-danger .badge { + color: #d9534f; + background-color: #fff; +} +.btn-link { + color: #428bca; + font-weight: 400; + cursor: pointer; + border-radius: 0; +} +.btn-link, +.btn-link:active, +.btn-link[disabled], +fieldset[disabled] .btn-link { + background-color: transparent; + -webkit-box-shadow: none; + box-shadow: none; +} +.btn-link, +.btn-link:hover, +.btn-link:focus, +.btn-link:active { + border-color: transparent; +} +.btn-link:hover, +.btn-link:focus { + color: #2a6496; + text-decoration: underline; + background-color: transparent; +} +.btn-link[disabled]:hover, +fieldset[disabled] .btn-link:hover, +.btn-link[disabled]:focus, +fieldset[disabled] .btn-link:focus { + color: #999; + text-decoration: none; +} +.btn-lg { + padding: 10px 16px; + font-size: 18px; + line-height: 1.33; + border-radius: 6px; +} +.btn-sm { + padding: 5px 10px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +.btn-xs { + padding: 1px 5px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +.btn-block { + display: block; + width: 100%; + padding-left: 0; + padding-right: 0; +} +.btn-block + .btn-block { + margin-top: 5px; +} +input[type="submit"].btn-block, 
+input[type="reset"].btn-block, +input[type="button"].btn-block { + width: 100%; +} +.fade { + opacity: 0; + -webkit-transition: opacity 0.15s linear; + transition: opacity 0.15s linear; +} +.fade.in { + opacity: 1; +} +.collapse { + display: none; +} +.collapse.in { + display: block; +} +.collapsing { + position: relative; + height: 0; + overflow: hidden; + -webkit-transition: height 0.35s ease; + transition: height 0.35s ease; +} +@font-face { + font-family: "Glyphicons Halflings"; + src: url(../fonts/glyphicons-halflings-regular.eot); + src: + url(../fonts/glyphicons-halflings-regular.eot?#iefix) + format("embedded-opentype"), + url(../fonts/glyphicons-halflings-regular.woff) format("woff"), + url(../fonts/glyphicons-halflings-regular.ttf) format("truetype"), + url(../fonts/glyphicons-halflings-regular.svg#glyphicons_halflingsregular) + format("svg"); +} +.glyphicon { + position: relative; + top: 1px; + display: inline-block; + font-family: "Glyphicons Halflings"; + font-style: normal; + font-weight: 400; + line-height: 1; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} +.glyphicon-asterisk:before { + content: "\2a"; +} +.glyphicon-plus:before { + content: "\2b"; +} +.glyphicon-euro:before { + content: "\20ac"; +} +.glyphicon-minus:before { + content: "\2212"; +} +.glyphicon-cloud:before { + content: "\2601"; +} +.glyphicon-envelope:before { + content: "\2709"; +} +.glyphicon-pencil:before { + content: "\270f"; +} +.glyphicon-glass:before { + content: "\e001"; +} +.glyphicon-music:before { + content: "\e002"; +} +.glyphicon-search:before { + content: "\e003"; +} +.glyphicon-heart:before { + content: "\e005"; +} +.glyphicon-star:before { + content: "\e006"; +} +.glyphicon-star-empty:before { + content: "\e007"; +} +.glyphicon-user:before { + content: "\e008"; +} +.glyphicon-film:before { + content: "\e009"; +} +.glyphicon-th-large:before { + content: "\e010"; +} +.glyphicon-th:before { + content: "\e011"; +} +.glyphicon-th-list:before { + content: "\e012"; +} +.glyphicon-ok:before { + content: "\e013"; +} +.glyphicon-remove:before { + content: "\e014"; +} +.glyphicon-zoom-in:before { + content: "\e015"; +} +.glyphicon-zoom-out:before { + content: "\e016"; +} +.glyphicon-off:before { + content: "\e017"; +} +.glyphicon-signal:before { + content: "\e018"; +} +.glyphicon-cog:before { + content: "\e019"; +} +.glyphicon-trash:before { + content: "\e020"; +} +.glyphicon-home:before { + content: "\e021"; +} +.glyphicon-file:before { + content: "\e022"; +} +.glyphicon-time:before { + content: "\e023"; +} +.glyphicon-road:before { + content: "\e024"; +} +.glyphicon-download-alt:before { + content: "\e025"; +} +.glyphicon-download:before { + content: "\e026"; +} +.glyphicon-upload:before { + content: "\e027"; +} +.glyphicon-inbox:before { + content: "\e028"; +} +.glyphicon-play-circle:before { + content: "\e029"; +} +.glyphicon-repeat:before { + content: "\e030"; +} +.glyphicon-refresh:before { + content: "\e031"; +} +.glyphicon-list-alt:before { + content: "\e032"; +} +.glyphicon-lock:before { + content: "\e033"; +} +.glyphicon-flag:before { + content: "\e034"; +} +.glyphicon-headphones:before { + content: "\e035"; +} +.glyphicon-volume-off:before { + content: "\e036"; +} +.glyphicon-volume-down:before { + content: "\e037"; +} +.glyphicon-volume-up:before { + content: "\e038"; +} +.glyphicon-qrcode:before { + content: "\e039"; +} +.glyphicon-barcode:before { + content: "\e040"; +} +.glyphicon-tag:before { + content: "\e041"; +} +.glyphicon-tags:before { + content: 
"\e042"; +} +.glyphicon-book:before { + content: "\e043"; +} +.glyphicon-bookmark:before { + content: "\e044"; +} +.glyphicon-print:before { + content: "\e045"; +} +.glyphicon-camera:before { + content: "\e046"; +} +.glyphicon-font:before { + content: "\e047"; +} +.glyphicon-bold:before { + content: "\e048"; +} +.glyphicon-italic:before { + content: "\e049"; +} +.glyphicon-text-height:before { + content: "\e050"; +} +.glyphicon-text-width:before { + content: "\e051"; +} +.glyphicon-align-left:before { + content: "\e052"; +} +.glyphicon-align-center:before { + content: "\e053"; +} +.glyphicon-align-right:before { + content: "\e054"; +} +.glyphicon-align-justify:before { + content: "\e055"; +} +.glyphicon-list:before { + content: "\e056"; +} +.glyphicon-indent-left:before { + content: "\e057"; +} +.glyphicon-indent-right:before { + content: "\e058"; +} +.glyphicon-facetime-video:before { + content: "\e059"; +} +.glyphicon-picture:before { + content: "\e060"; +} +.glyphicon-map-marker:before { + content: "\e062"; +} +.glyphicon-adjust:before { + content: "\e063"; +} +.glyphicon-tint:before { + content: "\e064"; +} +.glyphicon-edit:before { + content: "\e065"; +} +.glyphicon-share:before { + content: "\e066"; +} +.glyphicon-check:before { + content: "\e067"; +} +.glyphicon-move:before { + content: "\e068"; +} +.glyphicon-step-backward:before { + content: "\e069"; +} +.glyphicon-fast-backward:before { + content: "\e070"; +} +.glyphicon-backward:before { + content: "\e071"; +} +.glyphicon-play:before { + content: "\e072"; +} +.glyphicon-pause:before { + content: "\e073"; +} +.glyphicon-stop:before { + content: "\e074"; +} +.glyphicon-forward:before { + content: "\e075"; +} +.glyphicon-fast-forward:before { + content: "\e076"; +} +.glyphicon-step-forward:before { + content: "\e077"; +} +.glyphicon-eject:before { + content: "\e078"; +} +.glyphicon-chevron-left:before { + content: "\e079"; +} +.glyphicon-chevron-right:before { + content: "\e080"; +} +.glyphicon-plus-sign:before { + content: "\e081"; +} +.glyphicon-minus-sign:before { + content: "\e082"; +} +.glyphicon-remove-sign:before { + content: "\e083"; +} +.glyphicon-ok-sign:before { + content: "\e084"; +} +.glyphicon-question-sign:before { + content: "\e085"; +} +.glyphicon-info-sign:before { + content: "\e086"; +} +.glyphicon-screenshot:before { + content: "\e087"; +} +.glyphicon-remove-circle:before { + content: "\e088"; +} +.glyphicon-ok-circle:before { + content: "\e089"; +} +.glyphicon-ban-circle:before { + content: "\e090"; +} +.glyphicon-arrow-left:before { + content: "\e091"; +} +.glyphicon-arrow-right:before { + content: "\e092"; +} +.glyphicon-arrow-up:before { + content: "\e093"; +} +.glyphicon-arrow-down:before { + content: "\e094"; +} +.glyphicon-share-alt:before { + content: "\e095"; +} +.glyphicon-resize-full:before { + content: "\e096"; +} +.glyphicon-resize-small:before { + content: "\e097"; +} +.glyphicon-exclamation-sign:before { + content: "\e101"; +} +.glyphicon-gift:before { + content: "\e102"; +} +.glyphicon-leaf:before { + content: "\e103"; +} +.glyphicon-fire:before { + content: "\e104"; +} +.glyphicon-eye-open:before { + content: "\e105"; +} +.glyphicon-eye-close:before { + content: "\e106"; +} +.glyphicon-warning-sign:before { + content: "\e107"; +} +.glyphicon-plane:before { + content: "\e108"; +} +.glyphicon-calendar:before { + content: "\e109"; +} +.glyphicon-random:before { + content: "\e110"; +} +.glyphicon-comment:before { + content: "\e111"; +} +.glyphicon-magnet:before { + content: "\e112"; +} 
+.glyphicon-chevron-up:before { + content: "\e113"; +} +.glyphicon-chevron-down:before { + content: "\e114"; +} +.glyphicon-retweet:before { + content: "\e115"; +} +.glyphicon-shopping-cart:before { + content: "\e116"; +} +.glyphicon-folder-close:before { + content: "\e117"; +} +.glyphicon-folder-open:before { + content: "\e118"; +} +.glyphicon-resize-vertical:before { + content: "\e119"; +} +.glyphicon-resize-horizontal:before { + content: "\e120"; +} +.glyphicon-hdd:before { + content: "\e121"; +} +.glyphicon-bullhorn:before { + content: "\e122"; +} +.glyphicon-bell:before { + content: "\e123"; +} +.glyphicon-certificate:before { + content: "\e124"; +} +.glyphicon-thumbs-up:before { + content: "\e125"; +} +.glyphicon-thumbs-down:before { + content: "\e126"; +} +.glyphicon-hand-right:before { + content: "\e127"; +} +.glyphicon-hand-left:before { + content: "\e128"; +} +.glyphicon-hand-up:before { + content: "\e129"; +} +.glyphicon-hand-down:before { + content: "\e130"; +} +.glyphicon-circle-arrow-right:before { + content: "\e131"; +} +.glyphicon-circle-arrow-left:before { + content: "\e132"; +} +.glyphicon-circle-arrow-up:before { + content: "\e133"; +} +.glyphicon-circle-arrow-down:before { + content: "\e134"; +} +.glyphicon-globe:before { + content: "\e135"; +} +.glyphicon-wrench:before { + content: "\e136"; +} +.glyphicon-tasks:before { + content: "\e137"; +} +.glyphicon-filter:before { + content: "\e138"; +} +.glyphicon-briefcase:before { + content: "\e139"; +} +.glyphicon-fullscreen:before { + content: "\e140"; +} +.glyphicon-dashboard:before { + content: "\e141"; +} +.glyphicon-paperclip:before { + content: "\e142"; +} +.glyphicon-heart-empty:before { + content: "\e143"; +} +.glyphicon-link:before { + content: "\e144"; +} +.glyphicon-phone:before { + content: "\e145"; +} +.glyphicon-pushpin:before { + content: "\e146"; +} +.glyphicon-usd:before { + content: "\e148"; +} +.glyphicon-gbp:before { + content: "\e149"; +} +.glyphicon-sort:before { + content: "\e150"; +} +.glyphicon-sort-by-alphabet:before { + content: "\e151"; +} +.glyphicon-sort-by-alphabet-alt:before { + content: "\e152"; +} +.glyphicon-sort-by-order:before { + content: "\e153"; +} +.glyphicon-sort-by-order-alt:before { + content: "\e154"; +} +.glyphicon-sort-by-attributes:before { + content: "\e155"; +} +.glyphicon-sort-by-attributes-alt:before { + content: "\e156"; +} +.glyphicon-unchecked:before { + content: "\e157"; +} +.glyphicon-expand:before { + content: "\e158"; +} +.glyphicon-collapse-down:before { + content: "\e159"; +} +.glyphicon-collapse-up:before { + content: "\e160"; +} +.glyphicon-log-in:before { + content: "\e161"; +} +.glyphicon-flash:before { + content: "\e162"; +} +.glyphicon-log-out:before { + content: "\e163"; +} +.glyphicon-new-window:before { + content: "\e164"; +} +.glyphicon-record:before { + content: "\e165"; +} +.glyphicon-save:before { + content: "\e166"; +} +.glyphicon-open:before { + content: "\e167"; +} +.glyphicon-saved:before { + content: "\e168"; +} +.glyphicon-import:before { + content: "\e169"; +} +.glyphicon-export:before { + content: "\e170"; +} +.glyphicon-send:before { + content: "\e171"; +} +.glyphicon-floppy-disk:before { + content: "\e172"; +} +.glyphicon-floppy-saved:before { + content: "\e173"; +} +.glyphicon-floppy-remove:before { + content: "\e174"; +} +.glyphicon-floppy-save:before { + content: "\e175"; +} +.glyphicon-floppy-open:before { + content: "\e176"; +} +.glyphicon-credit-card:before { + content: "\e177"; +} +.glyphicon-transfer:before { + content: "\e178"; +} 
+.glyphicon-cutlery:before { + content: "\e179"; +} +.glyphicon-header:before { + content: "\e180"; +} +.glyphicon-compressed:before { + content: "\e181"; +} +.glyphicon-earphone:before { + content: "\e182"; +} +.glyphicon-phone-alt:before { + content: "\e183"; +} +.glyphicon-tower:before { + content: "\e184"; +} +.glyphicon-stats:before { + content: "\e185"; +} +.glyphicon-sd-video:before { + content: "\e186"; +} +.glyphicon-hd-video:before { + content: "\e187"; +} +.glyphicon-subtitles:before { + content: "\e188"; +} +.glyphicon-sound-stereo:before { + content: "\e189"; +} +.glyphicon-sound-dolby:before { + content: "\e190"; +} +.glyphicon-sound-5-1:before { + content: "\e191"; +} +.glyphicon-sound-6-1:before { + content: "\e192"; +} +.glyphicon-sound-7-1:before { + content: "\e193"; +} +.glyphicon-copyright-mark:before { + content: "\e194"; +} +.glyphicon-registration-mark:before { + content: "\e195"; +} +.glyphicon-cloud-download:before { + content: "\e197"; +} +.glyphicon-cloud-upload:before { + content: "\e198"; +} +.glyphicon-tree-conifer:before { + content: "\e199"; +} +.glyphicon-tree-deciduous:before { + content: "\e200"; +} +.caret { + display: inline-block; + width: 0; + height: 0; + margin-left: 2px; + vertical-align: middle; + border-top: 4px solid; + border-right: 4px solid transparent; + border-left: 4px solid transparent; +} +.dropdown { + position: relative; +} +.dropdown-toggle:focus { + outline: 0; +} +.dropdown-menu { + position: absolute; + top: 100%; + left: 0; + z-index: 1000; + display: none; + float: left; + min-width: 160px; + padding: 5px 0; + margin: 2px 0 0; + list-style: none; + font-size: 14px; + background-color: #fff; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.15); + border-radius: 4px; + -webkit-box-shadow: 0 6px 12px rgba(0, 0, 0, 0.175); + box-shadow: 0 6px 12px rgba(0, 0, 0, 0.175); + background-clip: padding-box; +} +.dropdown-menu.pull-right { + right: 0; + left: auto; +} +.dropdown-menu .divider { + height: 1px; + margin: 9px 0; + overflow: hidden; + background-color: #e5e5e5; +} +.dropdown-menu > li > a { + display: block; + padding: 3px 20px; + clear: both; + font-weight: 400; + line-height: 1.428571429; + color: #333; + white-space: nowrap; +} +.dropdown-menu > li > a:hover, +.dropdown-menu > li > a:focus { + text-decoration: none; + color: #262626; + background-color: #f5f5f5; +} +.dropdown-menu > .active > a, +.dropdown-menu > .active > a:hover, +.dropdown-menu > .active > a:focus { + color: #fff; + text-decoration: none; + outline: 0; + background-color: #428bca; +} +.dropdown-menu > .disabled > a, +.dropdown-menu > .disabled > a:hover, +.dropdown-menu > .disabled > a:focus { + color: #999; +} +.dropdown-menu > .disabled > a:hover, +.dropdown-menu > .disabled > a:focus { + text-decoration: none; + background-color: transparent; + background-image: none; + filter: progid:DXImageTransform.Microsoft.gradient(enabled=false); + cursor: not-allowed; +} +.open > .dropdown-menu { + display: block; +} +.open > a { + outline: 0; +} +.dropdown-menu-right { + left: auto; + right: 0; +} +.dropdown-menu-left { + left: 0; + right: auto; +} +.dropdown-header { + display: block; + padding: 3px 20px; + font-size: 12px; + line-height: 1.428571429; + color: #999; +} +.dropdown-backdrop { + position: fixed; + left: 0; + right: 0; + bottom: 0; + top: 0; + z-index: 990; +} +.pull-right > .dropdown-menu { + right: 0; + left: auto; +} +.dropup .caret, +.navbar-fixed-bottom .dropdown .caret { + border-top: 0; + border-bottom: 4px solid; + content: 
""; +} +.dropup .dropdown-menu, +.navbar-fixed-bottom .dropdown .dropdown-menu { + top: auto; + bottom: 100%; + margin-bottom: 1px; +} +@media (min-width: 768px) { + .navbar-right .dropdown-menu { + left: auto; + right: 0; + } + .navbar-right .dropdown-menu-left { + left: 0; + right: auto; + } +} +.btn-group, +.btn-group-vertical { + position: relative; + display: inline-block; + vertical-align: middle; +} +.btn-group > .btn, +.btn-group-vertical > .btn { + position: relative; + float: left; +} +.btn-group > .btn:hover, +.btn-group-vertical > .btn:hover, +.btn-group > .btn:focus, +.btn-group-vertical > .btn:focus, +.btn-group > .btn:active, +.btn-group-vertical > .btn:active, +.btn-group > .btn.active, +.btn-group-vertical > .btn.active { + z-index: 2; +} +.btn-group > .btn:focus, +.btn-group-vertical > .btn:focus { + outline: 0; +} +.btn-group .btn + .btn, +.btn-group .btn + .btn-group, +.btn-group .btn-group + .btn, +.btn-group .btn-group + .btn-group { + margin-left: -1px; +} +.btn-toolbar { + margin-left: -5px; +} +.btn-toolbar .btn-group, +.btn-toolbar .input-group { + float: left; +} +.btn-toolbar > .btn, +.btn-toolbar > .btn-group, +.btn-toolbar > .input-group { + margin-left: 5px; +} +.btn-group > .btn:not(:first-child):not(:last-child):not(.dropdown-toggle) { + border-radius: 0; +} +.btn-group > .btn:first-child { + margin-left: 0; +} +.btn-group > .btn:first-child:not(:last-child):not(.dropdown-toggle) { + border-bottom-right-radius: 0; + border-top-right-radius: 0; +} +.btn-group > .btn:last-child:not(:first-child), +.btn-group > .dropdown-toggle:not(:first-child) { + border-bottom-left-radius: 0; + border-top-left-radius: 0; +} +.btn-group > .btn-group { + float: left; +} +.btn-group > .btn-group:not(:first-child):not(:last-child) > .btn { + border-radius: 0; +} +.btn-group > .btn-group:first-child > .btn:last-child, +.btn-group > .btn-group:first-child > .dropdown-toggle { + border-bottom-right-radius: 0; + border-top-right-radius: 0; +} +.btn-group > .btn-group:last-child > .btn:first-child { + border-bottom-left-radius: 0; + border-top-left-radius: 0; +} +.btn-group .dropdown-toggle:active, +.btn-group.open .dropdown-toggle { + outline: 0; +} +.btn-group-xs > .btn { + padding: 1px 5px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +.btn-group-sm > .btn { + padding: 5px 10px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +.btn-group-lg > .btn { + padding: 10px 16px; + font-size: 18px; + line-height: 1.33; + border-radius: 6px; +} +.btn-group > .btn + .dropdown-toggle { + padding-left: 8px; + padding-right: 8px; +} +.btn-group > .btn-lg + .dropdown-toggle { + padding-left: 12px; + padding-right: 12px; +} +.btn-group.open .dropdown-toggle { + -webkit-box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); + box-shadow: inset 0 3px 5px rgba(0, 0, 0, 0.125); +} +.btn-group.open .dropdown-toggle.btn-link { + -webkit-box-shadow: none; + box-shadow: none; +} +.btn .caret { + margin-left: 0; +} +.btn-lg .caret { + border-width: 5px 5px 0; + border-bottom-width: 0; +} +.dropup .btn-lg .caret { + border-width: 0 5px 5px; +} +.btn-group-vertical > .btn, +.btn-group-vertical > .btn-group, +.btn-group-vertical > .btn-group > .btn { + display: block; + float: none; + width: 100%; + max-width: 100%; +} +.btn-group-vertical > .btn-group > .btn { + float: none; +} +.btn-group-vertical > .btn + .btn, +.btn-group-vertical > .btn + .btn-group, +.btn-group-vertical > .btn-group + .btn, +.btn-group-vertical > .btn-group + .btn-group { + margin-top: -1px; + 
margin-left: 0; +} +.btn-group-vertical > .btn:not(:first-child):not(:last-child) { + border-radius: 0; +} +.btn-group-vertical > .btn:first-child:not(:last-child) { + border-top-right-radius: 4px; + border-bottom-right-radius: 0; + border-bottom-left-radius: 0; +} +.btn-group-vertical > .btn:last-child:not(:first-child) { + border-bottom-left-radius: 4px; + border-top-right-radius: 0; + border-top-left-radius: 0; +} +.btn-group-vertical > .btn-group:not(:first-child):not(:last-child) > .btn { + border-radius: 0; +} +.btn-group-vertical > .btn-group:first-child:not(:last-child) > .btn:last-child, +.btn-group-vertical + > .btn-group:first-child:not(:last-child) + > .dropdown-toggle { + border-bottom-right-radius: 0; + border-bottom-left-radius: 0; +} +.btn-group-vertical + > .btn-group:last-child:not(:first-child) + > .btn:first-child { + border-top-right-radius: 0; + border-top-left-radius: 0; +} +.btn-group-justified { + display: table; + width: 100%; + table-layout: fixed; + border-collapse: separate; +} +.btn-group-justified > .btn, +.btn-group-justified > .btn-group { + float: none; + display: table-cell; + width: 1%; +} +.btn-group-justified > .btn-group .btn { + width: 100%; +} +[data-toggle="buttons"] > .btn > input[type="radio"], +[data-toggle="buttons"] > .btn > input[type="checkbox"] { + display: none; +} +.input-group { + position: relative; + display: table; + border-collapse: separate; +} +.input-group[class*="col-"] { + float: none; + padding-left: 0; + padding-right: 0; +} +.input-group .form-control { + float: left; + width: 100%; + margin-bottom: 0; +} +.input-group-lg > .form-control, +.input-group-lg > .input-group-addon, +.input-group-lg > .input-group-btn > .btn { + height: 46px; + padding: 10px 16px; + font-size: 18px; + line-height: 1.33; + border-radius: 6px; +} +select.input-group-lg > .form-control, +select.input-group-lg > .input-group-addon, +select.input-group-lg > .input-group-btn > .btn { + height: 46px; + line-height: 46px; +} +textarea.input-group-lg > .form-control, +textarea.input-group-lg > .input-group-addon, +textarea.input-group-lg > .input-group-btn > .btn, +select[multiple].input-group-lg > .form-control, +select[multiple].input-group-lg > .input-group-addon, +select[multiple].input-group-lg > .input-group-btn > .btn { + height: auto; +} +.input-group-sm > .form-control, +.input-group-sm > .input-group-addon, +.input-group-sm > .input-group-btn > .btn { + height: 30px; + padding: 5px 10px; + font-size: 12px; + line-height: 1.5; + border-radius: 3px; +} +select.input-group-sm > .form-control, +select.input-group-sm > .input-group-addon, +select.input-group-sm > .input-group-btn > .btn { + height: 30px; + line-height: 30px; +} +textarea.input-group-sm > .form-control, +textarea.input-group-sm > .input-group-addon, +textarea.input-group-sm > .input-group-btn > .btn, +select[multiple].input-group-sm > .form-control, +select[multiple].input-group-sm > .input-group-addon, +select[multiple].input-group-sm > .input-group-btn > .btn { + height: auto; +} +.input-group-addon, +.input-group-btn, +.input-group .form-control { + display: table-cell; +} +.input-group-addon:not(:first-child):not(:last-child), +.input-group-btn:not(:first-child):not(:last-child), +.input-group .form-control:not(:first-child):not(:last-child) { + border-radius: 0; +} +.input-group-addon, +.input-group-btn { + width: 1%; + white-space: nowrap; + vertical-align: middle; +} +.input-group-addon { + padding: 6px 12px; + font-size: 14px; + font-weight: 400; + line-height: 1; + color: #555; 
+ text-align: center; + background-color: #eee; + border: 1px solid #ccc; + border-radius: 4px; +} +.input-group-addon.input-sm { + padding: 5px 10px; + font-size: 12px; + border-radius: 3px; +} +.input-group-addon.input-lg { + padding: 10px 16px; + font-size: 18px; + border-radius: 6px; +} +.input-group-addon input[type="radio"], +.input-group-addon input[type="checkbox"] { + margin-top: 0; +} +.input-group .form-control:first-child, +.input-group-addon:first-child, +.input-group-btn:first-child > .btn, +.input-group-btn:first-child > .btn-group > .btn, +.input-group-btn:first-child > .dropdown-toggle, +.input-group-btn:last-child > .btn:not(:last-child):not(.dropdown-toggle), +.input-group-btn:last-child > .btn-group:not(:last-child) > .btn { + border-bottom-right-radius: 0; + border-top-right-radius: 0; +} +.input-group-addon:first-child { + border-right: 0; +} +.input-group .form-control:last-child, +.input-group-addon:last-child, +.input-group-btn:last-child > .btn, +.input-group-btn:last-child > .btn-group > .btn, +.input-group-btn:last-child > .dropdown-toggle, +.input-group-btn:first-child > .btn:not(:first-child), +.input-group-btn:first-child > .btn-group:not(:first-child) > .btn { + border-bottom-left-radius: 0; + border-top-left-radius: 0; +} +.input-group-addon:last-child { + border-left: 0; +} +.input-group-btn { + position: relative; + font-size: 0; + white-space: nowrap; +} +.input-group-btn > .btn { + position: relative; +} +.input-group-btn > .btn + .btn { + margin-left: -1px; +} +.input-group-btn > .btn:hover, +.input-group-btn > .btn:focus, +.input-group-btn > .btn:active { + z-index: 2; +} +.input-group-btn:first-child > .btn, +.input-group-btn:first-child > .btn-group { + margin-right: -1px; +} +.input-group-btn:last-child > .btn, +.input-group-btn:last-child > .btn-group { + margin-left: -1px; +} +.nav { + margin-bottom: 0; + padding-left: 0; + list-style: none; +} +.nav > li { + position: relative; + display: block; +} +.nav > li > a { + position: relative; + display: block; + padding: 10px 15px; +} +.nav > li > a:hover, +.nav > li > a:focus { + text-decoration: none; + background-color: #eee; +} +.nav > li.disabled > a { + color: #999; +} +.nav > li.disabled > a:hover, +.nav > li.disabled > a:focus { + color: #999; + text-decoration: none; + background-color: transparent; + cursor: not-allowed; +} +.nav .open > a, +.nav .open > a:hover, +.nav .open > a:focus { + background-color: #eee; + border-color: #428bca; +} +.nav .nav-divider { + height: 1px; + margin: 9px 0; + overflow: hidden; + background-color: #e5e5e5; +} +.nav > li > a > img { + max-width: none; +} +.nav-tabs { + border-bottom: 1px solid #ddd; +} +.nav-tabs > li { + float: left; + margin-bottom: -1px; +} +.nav-tabs > li > a { + margin-right: 2px; + line-height: 1.428571429; + border: 1px solid transparent; + border-radius: 4px 4px 0 0; +} +.nav-tabs > li > a:hover { + border-color: #eee #eee #ddd; +} +.nav-tabs > li.active > a, +.nav-tabs > li.active > a:hover, +.nav-tabs > li.active > a:focus { + color: #555; + background-color: #fff; + border: 1px solid #ddd; + border-bottom-color: transparent; + cursor: default; +} +.nav-tabs.nav-justified { + width: 100%; + border-bottom: 0; +} +.nav-tabs.nav-justified > li { + float: none; +} +.nav-tabs.nav-justified > li > a { + text-align: center; + margin-bottom: 5px; +} +.nav-tabs.nav-justified > .dropdown .dropdown-menu { + top: auto; + left: auto; +} +@media (min-width: 768px) { + .nav-tabs.nav-justified > li { + display: table-cell; + width: 1%; + } + 
.nav-tabs.nav-justified > li > a { + margin-bottom: 0; + } +} +.nav-tabs.nav-justified > li > a { + margin-right: 0; + border-radius: 4px; +} +.nav-tabs.nav-justified > .active > a, +.nav-tabs.nav-justified > .active > a:hover, +.nav-tabs.nav-justified > .active > a:focus { + border: 1px solid #ddd; +} +@media (min-width: 768px) { + .nav-tabs.nav-justified > li > a { + border-bottom: 1px solid #ddd; + border-radius: 4px 4px 0 0; + } + .nav-tabs.nav-justified > .active > a, + .nav-tabs.nav-justified > .active > a:hover, + .nav-tabs.nav-justified > .active > a:focus { + border-bottom-color: #fff; + } +} +.nav-pills > li { + float: left; +} +.nav-pills > li > a { + border-radius: 4px; +} +.nav-pills > li + li { + margin-left: 2px; +} +.nav-pills > li.active > a, +.nav-pills > li.active > a:hover, +.nav-pills > li.active > a:focus { + color: #fff; + background-color: #428bca; +} +.nav-stacked > li { + float: none; +} +.nav-stacked > li + li { + margin-top: 2px; + margin-left: 0; +} +.nav-justified { + width: 100%; +} +.nav-justified > li { + float: none; +} +.nav-justified > li > a { + text-align: center; + margin-bottom: 5px; +} +.nav-justified > .dropdown .dropdown-menu { + top: auto; + left: auto; +} +@media (min-width: 768px) { + .nav-justified > li { + display: table-cell; + width: 1%; + } + .nav-justified > li > a { + margin-bottom: 0; + } +} +.nav-tabs-justified { + border-bottom: 0; +} +.nav-tabs-justified > li > a { + margin-right: 0; + border-radius: 4px; +} +.nav-tabs-justified > .active > a, +.nav-tabs-justified > .active > a:hover, +.nav-tabs-justified > .active > a:focus { + border: 1px solid #ddd; +} +@media (min-width: 768px) { + .nav-tabs-justified > li > a { + border-bottom: 1px solid #ddd; + border-radius: 4px 4px 0 0; + } + .nav-tabs-justified > .active > a, + .nav-tabs-justified > .active > a:hover, + .nav-tabs-justified > .active > a:focus { + border-bottom-color: #fff; + } +} +.tab-content > .tab-pane { + display: none; +} +.tab-content > .active { + display: block; +} +.nav-tabs .dropdown-menu { + margin-top: -1px; + border-top-right-radius: 0; + border-top-left-radius: 0; +} +.navbar { + position: relative; + min-height: 50px; + margin-bottom: 20px; + border: 1px solid transparent; +} +@media (min-width: 768px) { + .navbar { + border-radius: 4px; + } +} +@media (min-width: 768px) { + .navbar-header { + float: left; + } +} +.navbar-collapse { + max-height: 340px; + overflow-x: visible; + padding-right: 15px; + padding-left: 15px; + border-top: 1px solid transparent; + box-shadow: inset 0 1px 0 rgba(255, 255, 255, 0.1); + -webkit-overflow-scrolling: touch; +} +.navbar-collapse.in { + overflow-y: auto; +} +@media (min-width: 768px) { + .navbar-collapse { + width: auto; + border-top: 0; + box-shadow: none; + } + .navbar-collapse.collapse { + display: block !important; + height: auto !important; + padding-bottom: 0; + overflow: visible !important; + } + .navbar-collapse.in { + overflow-y: visible; + } + .navbar-fixed-top .navbar-collapse, + .navbar-static-top .navbar-collapse, + .navbar-fixed-bottom .navbar-collapse { + padding-left: 0; + padding-right: 0; + } +} +.container > .navbar-header, +.container-fluid > .navbar-header, +.container > .navbar-collapse, +.container-fluid > .navbar-collapse { + margin-right: -15px; + margin-left: -15px; +} +@media (min-width: 768px) { + .container > .navbar-header, + .container-fluid > .navbar-header, + .container > .navbar-collapse, + .container-fluid > .navbar-collapse { + margin-right: 0; + margin-left: 0; + } +} +.navbar-static-top 
{ + z-index: 1000; + border-width: 0 0 1px; +} +@media (min-width: 768px) { + .navbar-static-top { + border-radius: 0; + } +} +.navbar-fixed-top, +.navbar-fixed-bottom { + position: fixed; + right: 0; + left: 0; + z-index: 1030; +} +@media (min-width: 768px) { + .navbar-fixed-top, + .navbar-fixed-bottom { + border-radius: 0; + } +} +.navbar-fixed-top { + top: 0; + border-width: 0 0 1px; +} +.navbar-fixed-bottom { + bottom: 0; + margin-bottom: 0; + border-width: 1px 0 0; +} +.navbar-brand { + float: left; + padding: 15px; + font-size: 18px; + line-height: 20px; + height: 20px; +} +.navbar-brand:hover, +.navbar-brand:focus { + text-decoration: none; +} +@media (min-width: 768px) { + .navbar > .container .navbar-brand, + .navbar > .container-fluid .navbar-brand { + margin-left: -15px; + } +} +.navbar-toggle { + position: relative; + float: right; + margin-right: 15px; + padding: 9px 10px; + margin-top: 8px; + margin-bottom: 8px; + background-color: transparent; + background-image: none; + border: 1px solid transparent; + border-radius: 4px; +} +.navbar-toggle:focus { + outline: 0; +} +.navbar-toggle .icon-bar { + display: block; + width: 22px; + height: 2px; + border-radius: 1px; +} +.navbar-toggle .icon-bar + .icon-bar { + margin-top: 4px; +} +@media (min-width: 768px) { + .navbar-toggle { + display: none; + } +} +.navbar-nav { + margin: 7.5px -15px; +} +.navbar-nav > li > a { + padding-top: 10px; + padding-bottom: 10px; + line-height: 20px; +} +@media (max-width: 767px) { + .navbar-nav .open .dropdown-menu { + position: static; + float: none; + width: auto; + margin-top: 0; + background-color: transparent; + border: 0; + box-shadow: none; + } + .navbar-nav .open .dropdown-menu > li > a, + .navbar-nav .open .dropdown-menu .dropdown-header { + padding: 5px 15px 5px 25px; + } + .navbar-nav .open .dropdown-menu > li > a { + line-height: 20px; + } + .navbar-nav .open .dropdown-menu > li > a:hover, + .navbar-nav .open .dropdown-menu > li > a:focus { + background-image: none; + } +} +@media (min-width: 768px) { + .navbar-nav { + float: left; + margin: 0; + } + .navbar-nav > li { + float: left; + } + .navbar-nav > li > a { + padding-top: 15px; + padding-bottom: 15px; + } + .navbar-nav.navbar-right:last-child { + margin-right: -15px; + } +} +@media (min-width: 768px) { + .navbar-left { + float: left !important; + } + .navbar-right { + float: right !important; + } +} +.navbar-form { + margin-left: -15px; + margin-right: -15px; + padding: 10px 15px; + border-top: 1px solid transparent; + border-bottom: 1px solid transparent; + -webkit-box-shadow: + inset 0 1px 0 rgba(255, 255, 255, 0.1), + 0 1px 0 rgba(255, 255, 255, 0.1); + box-shadow: + inset 0 1px 0 rgba(255, 255, 255, 0.1), + 0 1px 0 rgba(255, 255, 255, 0.1); + margin-top: 8px; + margin-bottom: 8px; +} +@media (min-width: 768px) { + .navbar-form .form-group { + display: inline-block; + margin-bottom: 0; + vertical-align: middle; + } + .navbar-form .form-control { + display: inline-block; + width: auto; + vertical-align: middle; + } + .navbar-form .control-label { + margin-bottom: 0; + vertical-align: middle; + } + .navbar-form .radio, + .navbar-form .checkbox { + display: inline-block; + margin-top: 0; + margin-bottom: 0; + padding-left: 0; + vertical-align: middle; + } + .navbar-form .radio input[type="radio"], + .navbar-form .checkbox input[type="checkbox"] { + float: none; + margin-left: 0; + } + .navbar-form .has-feedback .form-control-feedback { + top: 0; + } +} +@media (max-width: 767px) { + .navbar-form .form-group { + margin-bottom: 5px; + 
} +} +@media (min-width: 768px) { + .navbar-form { + width: auto; + border: 0; + margin-left: 0; + margin-right: 0; + padding-top: 0; + padding-bottom: 0; + -webkit-box-shadow: none; + box-shadow: none; + } + .navbar-form.navbar-right:last-child { + margin-right: -15px; + } +} +.navbar-nav > li > .dropdown-menu { + margin-top: 0; + border-top-right-radius: 0; + border-top-left-radius: 0; +} +.navbar-fixed-bottom .navbar-nav > li > .dropdown-menu { + border-bottom-right-radius: 0; + border-bottom-left-radius: 0; +} +.navbar-btn { + margin-top: 8px; + margin-bottom: 8px; +} +.navbar-btn.btn-sm { + margin-top: 10px; + margin-bottom: 10px; +} +.navbar-btn.btn-xs { + margin-top: 14px; + margin-bottom: 14px; +} +.navbar-text { + margin-top: 15px; + margin-bottom: 15px; +} +@media (min-width: 768px) { + .navbar-text { + float: left; + margin-left: 15px; + margin-right: 15px; + } + .navbar-text.navbar-right:last-child { + margin-right: 0; + } +} +.navbar-default { + background-color: #f8f8f8; + border-color: #e7e7e7; +} +.navbar-default .navbar-brand { + color: #777; +} +.navbar-default .navbar-brand:hover, +.navbar-default .navbar-brand:focus { + color: #5e5e5e; + background-color: transparent; +} +.navbar-default .navbar-text { + color: #777; +} +.navbar-default .navbar-nav > li > a { + color: #777; +} +.navbar-default .navbar-nav > li > a:hover, +.navbar-default .navbar-nav > li > a:focus { + color: #333; + background-color: transparent; +} +.navbar-default .navbar-nav > .active > a, +.navbar-default .navbar-nav > .active > a:hover, +.navbar-default .navbar-nav > .active > a:focus { + color: #555; + background-color: #e7e7e7; +} +.navbar-default .navbar-nav > .disabled > a, +.navbar-default .navbar-nav > .disabled > a:hover, +.navbar-default .navbar-nav > .disabled > a:focus { + color: #ccc; + background-color: transparent; +} +.navbar-default .navbar-toggle { + border-color: #ddd; +} +.navbar-default .navbar-toggle:hover, +.navbar-default .navbar-toggle:focus { + background-color: #ddd; +} +.navbar-default .navbar-toggle .icon-bar { + background-color: #888; +} +.navbar-default .navbar-collapse, +.navbar-default .navbar-form { + border-color: #e7e7e7; +} +.navbar-default .navbar-nav > .open > a, +.navbar-default .navbar-nav > .open > a:hover, +.navbar-default .navbar-nav > .open > a:focus { + background-color: #e7e7e7; + color: #555; +} +@media (max-width: 767px) { + .navbar-default .navbar-nav .open .dropdown-menu > li > a { + color: #777; + } + .navbar-default .navbar-nav .open .dropdown-menu > li > a:hover, + .navbar-default .navbar-nav .open .dropdown-menu > li > a:focus { + color: #333; + background-color: transparent; + } + .navbar-default .navbar-nav .open .dropdown-menu > .active > a, + .navbar-default .navbar-nav .open .dropdown-menu > .active > a:hover, + .navbar-default .navbar-nav .open .dropdown-menu > .active > a:focus { + color: #555; + background-color: #e7e7e7; + } + .navbar-default .navbar-nav .open .dropdown-menu > .disabled > a, + .navbar-default .navbar-nav .open .dropdown-menu > .disabled > a:hover, + .navbar-default .navbar-nav .open .dropdown-menu > .disabled > a:focus { + color: #ccc; + background-color: transparent; + } +} +.navbar-default .navbar-link { + color: #777; +} +.navbar-default .navbar-link:hover { + color: #333; +} +.navbar-inverse { + background-color: #222; + border-color: #080808; +} +.navbar-inverse .navbar-brand { + color: #999; +} +.navbar-inverse .navbar-brand:hover, +.navbar-inverse .navbar-brand:focus { + color: #fff; + background-color: 
transparent; +} +.navbar-inverse .navbar-text { + color: #999; +} +.navbar-inverse .navbar-nav > li > a { + color: #999; +} +.navbar-inverse .navbar-nav > li > a:hover, +.navbar-inverse .navbar-nav > li > a:focus { + color: #fff; + background-color: transparent; +} +.navbar-inverse .navbar-nav > .active > a, +.navbar-inverse .navbar-nav > .active > a:hover, +.navbar-inverse .navbar-nav > .active > a:focus { + color: #fff; + background-color: #080808; +} +.navbar-inverse .navbar-nav > .disabled > a, +.navbar-inverse .navbar-nav > .disabled > a:hover, +.navbar-inverse .navbar-nav > .disabled > a:focus { + color: #444; + background-color: transparent; +} +.navbar-inverse .navbar-toggle { + border-color: #333; +} +.navbar-inverse .navbar-toggle:hover, +.navbar-inverse .navbar-toggle:focus { + background-color: #333; +} +.navbar-inverse .navbar-toggle .icon-bar { + background-color: #fff; +} +.navbar-inverse .navbar-collapse, +.navbar-inverse .navbar-form { + border-color: #101010; +} +.navbar-inverse .navbar-nav > .open > a, +.navbar-inverse .navbar-nav > .open > a:hover, +.navbar-inverse .navbar-nav > .open > a:focus { + background-color: #080808; + color: #fff; +} +@media (max-width: 767px) { + .navbar-inverse .navbar-nav .open .dropdown-menu > .dropdown-header { + border-color: #080808; + } + .navbar-inverse .navbar-nav .open .dropdown-menu .divider { + background-color: #080808; + } + .navbar-inverse .navbar-nav .open .dropdown-menu > li > a { + color: #999; + } + .navbar-inverse .navbar-nav .open .dropdown-menu > li > a:hover, + .navbar-inverse .navbar-nav .open .dropdown-menu > li > a:focus { + color: #fff; + background-color: transparent; + } + .navbar-inverse .navbar-nav .open .dropdown-menu > .active > a, + .navbar-inverse .navbar-nav .open .dropdown-menu > .active > a:hover, + .navbar-inverse .navbar-nav .open .dropdown-menu > .active > a:focus { + color: #fff; + background-color: #080808; + } + .navbar-inverse .navbar-nav .open .dropdown-menu > .disabled > a, + .navbar-inverse .navbar-nav .open .dropdown-menu > .disabled > a:hover, + .navbar-inverse .navbar-nav .open .dropdown-menu > .disabled > a:focus { + color: #444; + background-color: transparent; + } +} +.navbar-inverse .navbar-link { + color: #999; +} +.navbar-inverse .navbar-link:hover { + color: #fff; +} +.breadcrumb { + padding: 8px 15px; + margin-bottom: 20px; + list-style: none; + background-color: #f5f5f5; + border-radius: 4px; +} +.breadcrumb > li { + display: inline-block; +} +.breadcrumb > li + li:before { + content: "/\00a0"; + padding: 0 5px; + color: #ccc; +} +.breadcrumb > .active { + color: #999; +} +.pagination { + display: inline-block; + padding-left: 0; + margin: 20px 0; + border-radius: 4px; +} +.pagination > li { + display: inline; +} +.pagination > li > a, +.pagination > li > span { + position: relative; + float: left; + padding: 6px 12px; + line-height: 1.428571429; + text-decoration: none; + color: #428bca; + background-color: #fff; + border: 1px solid #ddd; + margin-left: -1px; +} +.pagination > li:first-child > a, +.pagination > li:first-child > span { + margin-left: 0; + border-bottom-left-radius: 4px; + border-top-left-radius: 4px; +} +.pagination > li:last-child > a, +.pagination > li:last-child > span { + border-bottom-right-radius: 4px; + border-top-right-radius: 4px; +} +.pagination > li > a:hover, +.pagination > li > span:hover, +.pagination > li > a:focus, +.pagination > li > span:focus { + color: #2a6496; + background-color: #eee; + border-color: #ddd; +} +.pagination > .active > a, 
+.pagination > .active > span, +.pagination > .active > a:hover, +.pagination > .active > span:hover, +.pagination > .active > a:focus, +.pagination > .active > span:focus { + z-index: 2; + color: #fff; + background-color: #428bca; + border-color: #428bca; + cursor: default; +} +.pagination > .disabled > span, +.pagination > .disabled > span:hover, +.pagination > .disabled > span:focus, +.pagination > .disabled > a, +.pagination > .disabled > a:hover, +.pagination > .disabled > a:focus { + color: #999; + background-color: #fff; + border-color: #ddd; + cursor: not-allowed; +} +.pagination-lg > li > a, +.pagination-lg > li > span { + padding: 10px 16px; + font-size: 18px; +} +.pagination-lg > li:first-child > a, +.pagination-lg > li:first-child > span { + border-bottom-left-radius: 6px; + border-top-left-radius: 6px; +} +.pagination-lg > li:last-child > a, +.pagination-lg > li:last-child > span { + border-bottom-right-radius: 6px; + border-top-right-radius: 6px; +} +.pagination-sm > li > a, +.pagination-sm > li > span { + padding: 5px 10px; + font-size: 12px; +} +.pagination-sm > li:first-child > a, +.pagination-sm > li:first-child > span { + border-bottom-left-radius: 3px; + border-top-left-radius: 3px; +} +.pagination-sm > li:last-child > a, +.pagination-sm > li:last-child > span { + border-bottom-right-radius: 3px; + border-top-right-radius: 3px; +} +.pager { + padding-left: 0; + margin: 20px 0; + list-style: none; + text-align: center; +} +.pager li { + display: inline; +} +.pager li > a, +.pager li > span { + display: inline-block; + padding: 5px 14px; + background-color: #fff; + border: 1px solid #ddd; + border-radius: 15px; +} +.pager li > a:hover, +.pager li > a:focus { + text-decoration: none; + background-color: #eee; +} +.pager .next > a, +.pager .next > span { + float: right; +} +.pager .previous > a, +.pager .previous > span { + float: left; +} +.pager .disabled > a, +.pager .disabled > a:hover, +.pager .disabled > a:focus, +.pager .disabled > span { + color: #999; + background-color: #fff; + cursor: not-allowed; +} +.label { + display: inline; + padding: 0.2em 0.6em 0.3em; + font-size: 75%; + font-weight: 700; + line-height: 1; + color: #fff; + text-align: center; + white-space: nowrap; + vertical-align: baseline; + border-radius: 0.25em; +} +.label[href]:hover, +.label[href]:focus { + color: #fff; + text-decoration: none; + cursor: pointer; +} +.label:empty { + display: none; +} +.btn .label { + position: relative; + top: -1px; +} +.label-default { + background-color: #999; +} +.label-default[href]:hover, +.label-default[href]:focus { + background-color: gray; +} +.label-primary { + background-color: #428bca; +} +.label-primary[href]:hover, +.label-primary[href]:focus { + background-color: #3071a9; +} +.label-success { + background-color: #5cb85c; +} +.label-success[href]:hover, +.label-success[href]:focus { + background-color: #449d44; +} +.label-info { + background-color: #5bc0de; +} +.label-info[href]:hover, +.label-info[href]:focus { + background-color: #31b0d5; +} +.label-warning { + background-color: #f0ad4e; +} +.label-warning[href]:hover, +.label-warning[href]:focus { + background-color: #ec971f; +} +.label-danger { + background-color: #d9534f; +} +.label-danger[href]:hover, +.label-danger[href]:focus { + background-color: #c9302c; +} +.badge { + display: inline-block; + min-width: 10px; + padding: 3px 7px; + font-size: 12px; + font-weight: 700; + color: #fff; + line-height: 1; + vertical-align: baseline; + white-space: nowrap; + text-align: center; + background-color: 
#999; + border-radius: 10px; +} +.badge:empty { + display: none; +} +.btn .badge { + position: relative; + top: -1px; +} +.btn-xs .badge { + top: 0; + padding: 1px 5px; +} +a.badge:hover, +a.badge:focus { + color: #fff; + text-decoration: none; + cursor: pointer; +} +a.list-group-item.active > .badge, +.nav-pills > .active > a > .badge { + color: #428bca; + background-color: #fff; +} +.nav-pills > li > a > .badge { + margin-left: 3px; +} +.jumbotron { + padding: 30px; + margin-bottom: 30px; + color: inherit; + background-color: #eee; +} +.jumbotron h1, +.jumbotron .h1 { + color: inherit; +} +.jumbotron p { + margin-bottom: 15px; + font-size: 21px; + font-weight: 200; +} +.container .jumbotron { + border-radius: 6px; +} +.jumbotron .container { + max-width: 100%; +} +@media screen and (min-width: 768px) { + .jumbotron { + padding-top: 48px; + padding-bottom: 48px; + } + .container .jumbotron { + padding-left: 60px; + padding-right: 60px; + } + .jumbotron h1, + .jumbotron .h1 { + font-size: 63px; + } +} +.thumbnail { + display: block; + padding: 4px; + margin-bottom: 20px; + line-height: 1.428571429; + background-color: #fff; + border: 1px solid #ddd; + border-radius: 4px; + -webkit-transition: all 0.2s ease-in-out; + transition: all 0.2s ease-in-out; +} +.thumbnail > img, +.thumbnail a > img { + display: block; + max-width: 100%; + height: auto; + margin-left: auto; + margin-right: auto; +} +a.thumbnail:hover, +a.thumbnail:focus, +a.thumbnail.active { + border-color: #428bca; +} +.thumbnail .caption { + padding: 9px; + color: #333; +} +.alert { + padding: 15px; + margin-bottom: 20px; + border: 1px solid transparent; + border-radius: 4px; +} +.alert h4 { + margin-top: 0; + color: inherit; +} +.alert .alert-link { + font-weight: 700; +} +.alert > p, +.alert > ul { + margin-bottom: 0; +} +.alert > p + p { + margin-top: 5px; +} +.alert-dismissable { + padding-right: 35px; +} +.alert-dismissable .close { + position: relative; + top: -2px; + right: -21px; + color: inherit; +} +.alert-success { + background-color: #dff0d8; + border-color: #d6e9c6; + color: #3c763d; +} +.alert-success hr { + border-top-color: #c9e2b3; +} +.alert-success .alert-link { + color: #2b542c; +} +.alert-info { + background-color: #d9edf7; + border-color: #bce8f1; + color: #31708f; +} +.alert-info hr { + border-top-color: #a6e1ec; +} +.alert-info .alert-link { + color: #245269; +} +.alert-warning { + background-color: #fcf8e3; + border-color: #faebcc; + color: #8a6d3b; +} +.alert-warning hr { + border-top-color: #f7e1b5; +} +.alert-warning .alert-link { + color: #66512c; +} +.alert-danger { + background-color: #f2dede; + border-color: #ebccd1; + color: #a94442; +} +.alert-danger hr { + border-top-color: #e4b9c0; +} +.alert-danger .alert-link { + color: #843534; +} +@-webkit-keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} +@keyframes progress-bar-stripes { + from { + background-position: 40px 0; + } + to { + background-position: 0 0; + } +} +.progress { + overflow: hidden; + height: 20px; + margin-bottom: 20px; + background-color: #f5f5f5; + border-radius: 4px; + -webkit-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); +} +.progress-bar { + float: left; + width: 0; + height: 100%; + font-size: 12px; + line-height: 20px; + color: #fff; + text-align: center; + background-color: #428bca; + -webkit-box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.15); + box-shadow: inset 0 -1px 0 rgba(0, 0, 0, 0.15); + 
-webkit-transition: width 0.6s ease; + transition: width 0.6s ease; +} +.progress-striped .progress-bar { + background-image: -webkit-linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-image: linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-size: 40px 40px; +} +.progress.active .progress-bar { + -webkit-animation: progress-bar-stripes 2s linear infinite; + animation: progress-bar-stripes 2s linear infinite; +} +.progress-bar-success { + background-color: #5cb85c; +} +.progress-striped .progress-bar-success { + background-image: -webkit-linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-image: linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); +} +.progress-bar-info { + background-color: #5bc0de; +} +.progress-striped .progress-bar-info { + background-image: -webkit-linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-image: linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); +} +.progress-bar-warning { + background-color: #f0ad4e; +} +.progress-striped .progress-bar-warning { + background-image: -webkit-linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-image: linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); +} +.progress-bar-danger { + background-color: #d9534f; +} +.progress-striped .progress-bar-danger { + background-image: -webkit-linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); + background-image: linear-gradient( + 45deg, + rgba(255, 255, 255, 0.15) 25%, + transparent 25%, + transparent 50%, + rgba(255, 255, 255, 0.15) 50%, + rgba(255, 255, 255, 0.15) 75%, + transparent 75%, + transparent + ); +} +.media, +.media-body { + overflow: hidden; + zoom: 1; +} +.media, +.media .media { + margin-top: 15px; +} +.media:first-child { + margin-top: 0; +} +.media-object { + display: block; +} +.media-heading { + margin: 0 0 5px; +} +.media > .pull-left { + margin-right: 10px; +} +.media > .pull-right { + margin-left: 10px; +} +.media-list { + padding-left: 0; + list-style: none; +} +.list-group { + margin-bottom: 20px; + padding-left: 0; +} +.list-group-item { + position: relative; + display: block; + padding: 10px 15px; + margin-bottom: -1px; + background-color: #fff; + border: 1px solid #ddd; +} 
+.list-group-item:first-child { + border-top-right-radius: 4px; + border-top-left-radius: 4px; +} +.list-group-item:last-child { + margin-bottom: 0; + border-bottom-right-radius: 4px; + border-bottom-left-radius: 4px; +} +.list-group-item > .badge { + float: right; +} +.list-group-item > .badge + .badge { + margin-right: 5px; +} +a.list-group-item { + color: #555; +} +a.list-group-item .list-group-item-heading { + color: #333; +} +a.list-group-item:hover, +a.list-group-item:focus { + text-decoration: none; + background-color: #f5f5f5; +} +a.list-group-item.active, +a.list-group-item.active:hover, +a.list-group-item.active:focus { + z-index: 2; + color: #fff; + background-color: #428bca; + border-color: #428bca; +} +a.list-group-item.active .list-group-item-heading, +a.list-group-item.active:hover .list-group-item-heading, +a.list-group-item.active:focus .list-group-item-heading { + color: inherit; +} +a.list-group-item.active .list-group-item-text, +a.list-group-item.active:hover .list-group-item-text, +a.list-group-item.active:focus .list-group-item-text { + color: #e1edf7; +} +.list-group-item-success { + color: #3c763d; + background-color: #dff0d8; +} +a.list-group-item-success { + color: #3c763d; +} +a.list-group-item-success .list-group-item-heading { + color: inherit; +} +a.list-group-item-success:hover, +a.list-group-item-success:focus { + color: #3c763d; + background-color: #d0e9c6; +} +a.list-group-item-success.active, +a.list-group-item-success.active:hover, +a.list-group-item-success.active:focus { + color: #fff; + background-color: #3c763d; + border-color: #3c763d; +} +.list-group-item-info { + color: #31708f; + background-color: #d9edf7; +} +a.list-group-item-info { + color: #31708f; +} +a.list-group-item-info .list-group-item-heading { + color: inherit; +} +a.list-group-item-info:hover, +a.list-group-item-info:focus { + color: #31708f; + background-color: #c4e3f3; +} +a.list-group-item-info.active, +a.list-group-item-info.active:hover, +a.list-group-item-info.active:focus { + color: #fff; + background-color: #31708f; + border-color: #31708f; +} +.list-group-item-warning { + color: #8a6d3b; + background-color: #fcf8e3; +} +a.list-group-item-warning { + color: #8a6d3b; +} +a.list-group-item-warning .list-group-item-heading { + color: inherit; +} +a.list-group-item-warning:hover, +a.list-group-item-warning:focus { + color: #8a6d3b; + background-color: #faf2cc; +} +a.list-group-item-warning.active, +a.list-group-item-warning.active:hover, +a.list-group-item-warning.active:focus { + color: #fff; + background-color: #8a6d3b; + border-color: #8a6d3b; +} +.list-group-item-danger { + color: #a94442; + background-color: #f2dede; +} +a.list-group-item-danger { + color: #a94442; +} +a.list-group-item-danger .list-group-item-heading { + color: inherit; +} +a.list-group-item-danger:hover, +a.list-group-item-danger:focus { + color: #a94442; + background-color: #ebcccc; +} +a.list-group-item-danger.active, +a.list-group-item-danger.active:hover, +a.list-group-item-danger.active:focus { + color: #fff; + background-color: #a94442; + border-color: #a94442; +} +.list-group-item-heading { + margin-top: 0; + margin-bottom: 5px; +} +.list-group-item-text { + margin-bottom: 0; + line-height: 1.3; +} +.panel { + margin-bottom: 20px; + background-color: #fff; + border: 1px solid transparent; + border-radius: 4px; + -webkit-box-shadow: 0 1px 1px rgba(0, 0, 0, 0.05); + box-shadow: 0 1px 1px rgba(0, 0, 0, 0.05); +} +.panel-body { + padding: 15px; +} +.panel > .list-group { + margin-bottom: 0; +} +.panel 
> .list-group .list-group-item { + border-width: 1px 0; + border-radius: 0; +} +.panel > .list-group .list-group-item:first-child { + border-top: 0; +} +.panel > .list-group .list-group-item:last-child { + border-bottom: 0; +} +.panel > .list-group:first-child .list-group-item:first-child { + border-top-right-radius: 3px; + border-top-left-radius: 3px; +} +.panel > .list-group:last-child .list-group-item:last-child { + border-bottom-right-radius: 3px; + border-bottom-left-radius: 3px; +} +.panel-heading + .list-group .list-group-item:first-child { + border-top-width: 0; +} +.panel > .table, +.panel > .table-responsive > .table { + margin-bottom: 0; +} +.panel > .table:first-child > thead:first-child > tr:first-child td:first-child, +.panel + > .table-responsive:first-child + > .table:first-child + > thead:first-child + > tr:first-child + td:first-child, +.panel > .table:first-child > tbody:first-child > tr:first-child td:first-child, +.panel + > .table-responsive:first-child + > .table:first-child + > tbody:first-child + > tr:first-child + td:first-child, +.panel > .table:first-child > thead:first-child > tr:first-child th:first-child, +.panel + > .table-responsive:first-child + > .table:first-child + > thead:first-child + > tr:first-child + th:first-child, +.panel > .table:first-child > tbody:first-child > tr:first-child th:first-child, +.panel + > .table-responsive:first-child + > .table:first-child + > tbody:first-child + > tr:first-child + th:first-child { + border-top-left-radius: 3px; +} +.panel > .table:first-child > thead:first-child > tr:first-child td:last-child, +.panel + > .table-responsive:first-child + > .table:first-child + > thead:first-child + > tr:first-child + td:last-child, +.panel > .table:first-child > tbody:first-child > tr:first-child td:last-child, +.panel + > .table-responsive:first-child + > .table:first-child + > tbody:first-child + > tr:first-child + td:last-child, +.panel > .table:first-child > thead:first-child > tr:first-child th:last-child, +.panel + > .table-responsive:first-child + > .table:first-child + > thead:first-child + > tr:first-child + th:last-child, +.panel > .table:first-child > tbody:first-child > tr:first-child th:last-child, +.panel + > .table-responsive:first-child + > .table:first-child + > tbody:first-child + > tr:first-child + th:last-child { + border-top-right-radius: 3px; +} +.panel > .table:last-child > tbody:last-child > tr:last-child td:first-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tbody:last-child + > tr:last-child + td:first-child, +.panel > .table:last-child > tfoot:last-child > tr:last-child td:first-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tfoot:last-child + > tr:last-child + td:first-child, +.panel > .table:last-child > tbody:last-child > tr:last-child th:first-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tbody:last-child + > tr:last-child + th:first-child, +.panel > .table:last-child > tfoot:last-child > tr:last-child th:first-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tfoot:last-child + > tr:last-child + th:first-child { + border-bottom-left-radius: 3px; +} +.panel > .table:last-child > tbody:last-child > tr:last-child td:last-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tbody:last-child + > tr:last-child + td:last-child, +.panel > .table:last-child > tfoot:last-child > tr:last-child td:last-child, +.panel + > .table-responsive:last-child + > .table:last-child + > 
tfoot:last-child + > tr:last-child + td:last-child, +.panel > .table:last-child > tbody:last-child > tr:last-child th:last-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tbody:last-child + > tr:last-child + th:last-child, +.panel > .table:last-child > tfoot:last-child > tr:last-child th:last-child, +.panel + > .table-responsive:last-child + > .table:last-child + > tfoot:last-child + > tr:last-child + th:last-child { + border-bottom-right-radius: 3px; +} +.panel > .panel-body + .table, +.panel > .panel-body + .table-responsive { + border-top: 1px solid #ddd; +} +.panel > .table > tbody:first-child > tr:first-child th, +.panel > .table > tbody:first-child > tr:first-child td { + border-top: 0; +} +.panel > .table-bordered, +.panel > .table-responsive > .table-bordered { + border: 0; +} +.panel > .table-bordered > thead > tr > th:first-child, +.panel > .table-responsive > .table-bordered > thead > tr > th:first-child, +.panel > .table-bordered > tbody > tr > th:first-child, +.panel > .table-responsive > .table-bordered > tbody > tr > th:first-child, +.panel > .table-bordered > tfoot > tr > th:first-child, +.panel > .table-responsive > .table-bordered > tfoot > tr > th:first-child, +.panel > .table-bordered > thead > tr > td:first-child, +.panel > .table-responsive > .table-bordered > thead > tr > td:first-child, +.panel > .table-bordered > tbody > tr > td:first-child, +.panel > .table-responsive > .table-bordered > tbody > tr > td:first-child, +.panel > .table-bordered > tfoot > tr > td:first-child, +.panel > .table-responsive > .table-bordered > tfoot > tr > td:first-child { + border-left: 0; +} +.panel > .table-bordered > thead > tr > th:last-child, +.panel > .table-responsive > .table-bordered > thead > tr > th:last-child, +.panel > .table-bordered > tbody > tr > th:last-child, +.panel > .table-responsive > .table-bordered > tbody > tr > th:last-child, +.panel > .table-bordered > tfoot > tr > th:last-child, +.panel > .table-responsive > .table-bordered > tfoot > tr > th:last-child, +.panel > .table-bordered > thead > tr > td:last-child, +.panel > .table-responsive > .table-bordered > thead > tr > td:last-child, +.panel > .table-bordered > tbody > tr > td:last-child, +.panel > .table-responsive > .table-bordered > tbody > tr > td:last-child, +.panel > .table-bordered > tfoot > tr > td:last-child, +.panel > .table-responsive > .table-bordered > tfoot > tr > td:last-child { + border-right: 0; +} +.panel > .table-bordered > thead > tr:first-child > th, +.panel > .table-responsive > .table-bordered > thead > tr:first-child > th, +.panel > .table-bordered > tbody > tr:first-child > th, +.panel > .table-responsive > .table-bordered > tbody > tr:first-child > th, +.panel > .table-bordered > tfoot > tr:first-child > th, +.panel > .table-responsive > .table-bordered > tfoot > tr:first-child > th, +.panel > .table-bordered > thead > tr:first-child > td, +.panel > .table-responsive > .table-bordered > thead > tr:first-child > td, +.panel > .table-bordered > tbody > tr:first-child > td, +.panel > .table-responsive > .table-bordered > tbody > tr:first-child > td, +.panel > .table-bordered > tfoot > tr:first-child > td, +.panel > .table-responsive > .table-bordered > tfoot > tr:first-child > td { + border-top: 0; +} +.panel > .table-bordered > thead > tr:last-child > th, +.panel > .table-responsive > .table-bordered > thead > tr:last-child > th, +.panel > .table-bordered > tbody > tr:last-child > th, +.panel > .table-responsive > .table-bordered > tbody > tr:last-child > th, 
+.panel > .table-bordered > tfoot > tr:last-child > th, +.panel > .table-responsive > .table-bordered > tfoot > tr:last-child > th, +.panel > .table-bordered > thead > tr:last-child > td, +.panel > .table-responsive > .table-bordered > thead > tr:last-child > td, +.panel > .table-bordered > tbody > tr:last-child > td, +.panel > .table-responsive > .table-bordered > tbody > tr:last-child > td, +.panel > .table-bordered > tfoot > tr:last-child > td, +.panel > .table-responsive > .table-bordered > tfoot > tr:last-child > td { + border-bottom: 0; +} +.panel > .table-responsive { + border: 0; + margin-bottom: 0; +} +.panel-heading { + padding: 10px 15px; + border-bottom: 1px solid transparent; + border-top-right-radius: 3px; + border-top-left-radius: 3px; +} +.panel-heading > .dropdown .dropdown-toggle { + color: inherit; +} +.panel-title { + margin-top: 0; + margin-bottom: 0; + font-size: 16px; + color: inherit; +} +.panel-title > a { + color: inherit; +} +.panel-footer { + padding: 10px 15px; + background-color: #f5f5f5; + border-top: 1px solid #ddd; + border-bottom-right-radius: 3px; + border-bottom-left-radius: 3px; +} +.panel-group { + margin-bottom: 20px; +} +.panel-group .panel { + margin-bottom: 0; + border-radius: 4px; + overflow: hidden; +} +.panel-group .panel + .panel { + margin-top: 5px; +} +.panel-group .panel-heading { + border-bottom: 0; +} +.panel-group .panel-heading + .panel-collapse .panel-body { + border-top: 1px solid #ddd; +} +.panel-group .panel-footer { + border-top: 0; +} +.panel-group .panel-footer + .panel-collapse .panel-body { + border-bottom: 1px solid #ddd; +} +.panel-default { + border-color: #ddd; +} +.panel-default > .panel-heading { + color: #333; + background-color: #f5f5f5; + border-color: #ddd; +} +.panel-default > .panel-heading + .panel-collapse .panel-body { + border-top-color: #ddd; +} +.panel-default > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #ddd; +} +.panel-primary { + border-color: #428bca; +} +.panel-primary > .panel-heading { + color: #fff; + background-color: #428bca; + border-color: #428bca; +} +.panel-primary > .panel-heading + .panel-collapse .panel-body { + border-top-color: #428bca; +} +.panel-primary > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #428bca; +} +.panel-success { + border-color: #d6e9c6; +} +.panel-success > .panel-heading { + color: #3c763d; + background-color: #dff0d8; + border-color: #d6e9c6; +} +.panel-success > .panel-heading + .panel-collapse .panel-body { + border-top-color: #d6e9c6; +} +.panel-success > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #d6e9c6; +} +.panel-info { + border-color: #bce8f1; +} +.panel-info > .panel-heading { + color: #31708f; + background-color: #d9edf7; + border-color: #bce8f1; +} +.panel-info > .panel-heading + .panel-collapse .panel-body { + border-top-color: #bce8f1; +} +.panel-info > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #bce8f1; +} +.panel-warning { + border-color: #faebcc; +} +.panel-warning > .panel-heading { + color: #8a6d3b; + background-color: #fcf8e3; + border-color: #faebcc; +} +.panel-warning > .panel-heading + .panel-collapse .panel-body { + border-top-color: #faebcc; +} +.panel-warning > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #faebcc; +} +.panel-danger { + border-color: #ebccd1; +} +.panel-danger > .panel-heading { + color: #a94442; + background-color: #f2dede; + border-color: #ebccd1; +} +.panel-danger > .panel-heading + .panel-collapse 
.panel-body { + border-top-color: #ebccd1; +} +.panel-danger > .panel-footer + .panel-collapse .panel-body { + border-bottom-color: #ebccd1; +} +.well { + min-height: 20px; + padding: 19px; + margin-bottom: 20px; + background-color: #f5f5f5; + border: 1px solid #e3e3e3; + border-radius: 4px; + -webkit-box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.05); + box-shadow: inset 0 1px 1px rgba(0, 0, 0, 0.05); +} +.well blockquote { + border-color: #ddd; + border-color: rgba(0, 0, 0, 0.15); +} +.well-lg { + padding: 24px; + border-radius: 6px; +} +.well-sm { + padding: 9px; + border-radius: 3px; +} +.close { + float: right; + font-size: 21px; + font-weight: 700; + line-height: 1; + color: #000; + text-shadow: 0 1px 0 #fff; + opacity: 0.2; + filter: alpha(opacity=20); +} +.close:hover, +.close:focus { + color: #000; + text-decoration: none; + cursor: pointer; + opacity: 0.5; + filter: alpha(opacity=50); +} +button.close { + padding: 0; + cursor: pointer; + background: 0 0; + border: 0; + -webkit-appearance: none; +} +.modal-open { + overflow: hidden; +} +.modal { + display: none; + overflow: auto; + overflow-y: scroll; + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 1050; + -webkit-overflow-scrolling: touch; + outline: 0; +} +.modal.fade .modal-dialog { + -webkit-transform: translate(0, -25%); + -ms-transform: translate(0, -25%); + transform: translate(0, -25%); + -webkit-transition: -webkit-transform 0.3s ease-out; + -moz-transition: -moz-transform 0.3s ease-out; + -o-transition: -o-transform 0.3s ease-out; + transition: transform 0.3s ease-out; +} +.modal.in .modal-dialog { + -webkit-transform: translate(0, 0); + -ms-transform: translate(0, 0); + transform: translate(0, 0); +} +.modal-dialog { + position: relative; + width: auto; + margin: 10px; +} +.modal-content { + position: relative; + background-color: #fff; + border: 1px solid #999; + border: 1px solid rgba(0, 0, 0, 0.2); + border-radius: 6px; + -webkit-box-shadow: 0 3px 9px rgba(0, 0, 0, 0.5); + box-shadow: 0 3px 9px rgba(0, 0, 0, 0.5); + background-clip: padding-box; + outline: 0; +} +.modal-backdrop { + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 1040; + background-color: #000; +} +.modal-backdrop.fade { + opacity: 0; + filter: alpha(opacity=0); +} +.modal-backdrop.in { + opacity: 0.5; + filter: alpha(opacity=50); +} +.modal-header { + padding: 15px; + border-bottom: 1px solid #e5e5e5; + min-height: 16.428571429px; +} +.modal-header .close { + margin-top: -2px; +} +.modal-title { + margin: 0; + line-height: 1.428571429; +} +.modal-body { + position: relative; + padding: 20px; +} +.modal-footer { + margin-top: 15px; + padding: 19px 20px 20px; + text-align: right; + border-top: 1px solid #e5e5e5; +} +.modal-footer .btn + .btn { + margin-left: 5px; + margin-bottom: 0; +} +.modal-footer .btn-group .btn + .btn { + margin-left: -1px; +} +.modal-footer .btn-block + .btn-block { + margin-left: 0; +} +@media (min-width: 768px) { + .modal-dialog { + width: 600px; + margin: 30px auto; + } + .modal-content { + -webkit-box-shadow: 0 5px 15px rgba(0, 0, 0, 0.5); + box-shadow: 0 5px 15px rgba(0, 0, 0, 0.5); + } + .modal-sm { + width: 300px; + } + .modal-lg { + width: 900px; + } +} +.tooltip { + position: absolute; + z-index: 1030; + display: block; + visibility: visible; + font-size: 12px; + line-height: 1.4; + opacity: 0; + filter: alpha(opacity=0); +} +.tooltip.in { + opacity: 0.9; + filter: alpha(opacity=90); +} +.tooltip.top { + margin-top: -3px; + padding: 5px 0; +} +.tooltip.right { + 
margin-left: 3px; + padding: 0 5px; +} +.tooltip.bottom { + margin-top: 3px; + padding: 5px 0; +} +.tooltip.left { + margin-left: -3px; + padding: 0 5px; +} +.tooltip-inner { + max-width: 200px; + padding: 3px 8px; + color: #fff; + text-align: center; + text-decoration: none; + background-color: #000; + border-radius: 4px; +} +.tooltip-arrow { + position: absolute; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; +} +.tooltip.top .tooltip-arrow { + bottom: 0; + left: 50%; + margin-left: -5px; + border-width: 5px 5px 0; + border-top-color: #000; +} +.tooltip.top-left .tooltip-arrow { + bottom: 0; + left: 5px; + border-width: 5px 5px 0; + border-top-color: #000; +} +.tooltip.top-right .tooltip-arrow { + bottom: 0; + right: 5px; + border-width: 5px 5px 0; + border-top-color: #000; +} +.tooltip.right .tooltip-arrow { + top: 50%; + left: 0; + margin-top: -5px; + border-width: 5px 5px 5px 0; + border-right-color: #000; +} +.tooltip.left .tooltip-arrow { + top: 50%; + right: 0; + margin-top: -5px; + border-width: 5px 0 5px 5px; + border-left-color: #000; +} +.tooltip.bottom .tooltip-arrow { + top: 0; + left: 50%; + margin-left: -5px; + border-width: 0 5px 5px; + border-bottom-color: #000; +} +.tooltip.bottom-left .tooltip-arrow { + top: 0; + left: 5px; + border-width: 0 5px 5px; + border-bottom-color: #000; +} +.tooltip.bottom-right .tooltip-arrow { + top: 0; + right: 5px; + border-width: 0 5px 5px; + border-bottom-color: #000; +} +.popover { + position: absolute; + top: 0; + left: 0; + z-index: 1010; + display: none; + max-width: 276px; + padding: 1px; + text-align: left; + background-color: #fff; + background-clip: padding-box; + border: 1px solid #ccc; + border: 1px solid rgba(0, 0, 0, 0.2); + border-radius: 6px; + -webkit-box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + white-space: normal; +} +.popover.top { + margin-top: -10px; +} +.popover.right { + margin-left: 10px; +} +.popover.bottom { + margin-top: 10px; +} +.popover.left { + margin-left: -10px; +} +.popover-title { + margin: 0; + padding: 8px 14px; + font-size: 14px; + font-weight: 400; + line-height: 18px; + background-color: #f7f7f7; + border-bottom: 1px solid #ebebeb; + border-radius: 5px 5px 0 0; +} +.popover-content { + padding: 9px 14px; +} +.popover .arrow, +.popover .arrow:after { + position: absolute; + display: block; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; +} +.popover .arrow { + border-width: 11px; +} +.popover .arrow:after { + border-width: 10px; + content: ""; +} +.popover.top .arrow { + left: 50%; + margin-left: -11px; + border-bottom-width: 0; + border-top-color: #999; + border-top-color: rgba(0, 0, 0, 0.25); + bottom: -11px; +} +.popover.top .arrow:after { + content: " "; + bottom: 1px; + margin-left: -10px; + border-bottom-width: 0; + border-top-color: #fff; +} +.popover.right .arrow { + top: 50%; + left: -11px; + margin-top: -11px; + border-left-width: 0; + border-right-color: #999; + border-right-color: rgba(0, 0, 0, 0.25); +} +.popover.right .arrow:after { + content: " "; + left: 1px; + bottom: -10px; + border-left-width: 0; + border-right-color: #fff; +} +.popover.bottom .arrow { + left: 50%; + margin-left: -11px; + border-top-width: 0; + border-bottom-color: #999; + border-bottom-color: rgba(0, 0, 0, 0.25); + top: -11px; +} +.popover.bottom .arrow:after { + content: " "; + top: 1px; + margin-left: -10px; + border-top-width: 0; + border-bottom-color: #fff; +} +.popover.left .arrow { + top: 50%; + right: 
-11px; + margin-top: -11px; + border-right-width: 0; + border-left-color: #999; + border-left-color: rgba(0, 0, 0, 0.25); +} +.popover.left .arrow:after { + content: " "; + right: 1px; + border-right-width: 0; + border-left-color: #fff; + bottom: -10px; +} +.carousel { + position: relative; +} +.carousel-inner { + position: relative; + overflow: hidden; + width: 100%; +} +.carousel-inner > .item { + display: none; + position: relative; + -webkit-transition: 0.6s ease-in-out left; + transition: 0.6s ease-in-out left; +} +.carousel-inner > .item > img, +.carousel-inner > .item > a > img { + display: block; + max-width: 100%; + height: auto; + line-height: 1; +} +.carousel-inner > .active, +.carousel-inner > .next, +.carousel-inner > .prev { + display: block; +} +.carousel-inner > .active { + left: 0; +} +.carousel-inner > .next, +.carousel-inner > .prev { + position: absolute; + top: 0; + width: 100%; +} +.carousel-inner > .next { + left: 100%; +} +.carousel-inner > .prev { + left: -100%; +} +.carousel-inner > .next.left, +.carousel-inner > .prev.right { + left: 0; +} +.carousel-inner > .active.left { + left: -100%; +} +.carousel-inner > .active.right { + left: 100%; +} +.carousel-control { + position: absolute; + top: 0; + left: 0; + bottom: 0; + width: 15%; + opacity: 0.5; + filter: alpha(opacity=50); + font-size: 20px; + color: #fff; + text-align: center; + text-shadow: 0 1px 2px rgba(0, 0, 0, 0.6); +} +.carousel-control.left { + background-image: -webkit-linear-gradient( + left, + color-stop(rgba(0, 0, 0, 0.5) 0), + color-stop(rgba(0, 0, 0, 0.0001) 100%) + ); + background-image: linear-gradient( + to right, + rgba(0, 0, 0, 0.5) 0, + rgba(0, 0, 0, 0.0001) 100% + ); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#80000000', endColorstr='#00000000', GradientType=1); +} +.carousel-control.right { + left: auto; + right: 0; + background-image: -webkit-linear-gradient( + left, + color-stop(rgba(0, 0, 0, 0.0001) 0), + color-stop(rgba(0, 0, 0, 0.5) 100%) + ); + background-image: linear-gradient( + to right, + rgba(0, 0, 0, 0.0001) 0, + rgba(0, 0, 0, 0.5) 100% + ); + background-repeat: repeat-x; + filter: progid:DXImageTransform.Microsoft.gradient(startColorstr='#00000000', endColorstr='#80000000', GradientType=1); +} +.carousel-control:hover, +.carousel-control:focus { + outline: 0; + color: #fff; + text-decoration: none; + opacity: 0.9; + filter: alpha(opacity=90); +} +.carousel-control .icon-prev, +.carousel-control .icon-next, +.carousel-control .glyphicon-chevron-left, +.carousel-control .glyphicon-chevron-right { + position: absolute; + top: 50%; + z-index: 5; + display: inline-block; +} +.carousel-control .icon-prev, +.carousel-control .glyphicon-chevron-left { + left: 50%; +} +.carousel-control .icon-next, +.carousel-control .glyphicon-chevron-right { + right: 50%; +} +.carousel-control .icon-prev, +.carousel-control .icon-next { + width: 20px; + height: 20px; + margin-top: -10px; + margin-left: -10px; + font-family: serif; +} +.carousel-control .icon-prev:before { + content: "\2039"; +} +.carousel-control .icon-next:before { + content: "\203a"; +} +.carousel-indicators { + position: absolute; + bottom: 10px; + left: 50%; + z-index: 15; + width: 60%; + margin-left: -30%; + padding-left: 0; + list-style: none; + text-align: center; +} +.carousel-indicators li { + display: inline-block; + width: 10px; + height: 10px; + margin: 1px; + text-indent: -999px; + border: 1px solid #fff; + border-radius: 10px; + cursor: pointer; + background-color: 
#000 \9; + background-color: rgba(0, 0, 0, 0); +} +.carousel-indicators .active { + margin: 0; + width: 12px; + height: 12px; + background-color: #fff; +} +.carousel-caption { + position: absolute; + left: 15%; + right: 15%; + bottom: 20px; + z-index: 10; + padding-top: 20px; + padding-bottom: 20px; + color: #fff; + text-align: center; + text-shadow: 0 1px 2px rgba(0, 0, 0, 0.6); +} +.carousel-caption .btn { + text-shadow: none; +} +@media screen and (min-width: 768px) { + .carousel-control .glyphicons-chevron-left, + .carousel-control .glyphicons-chevron-right, + .carousel-control .icon-prev, + .carousel-control .icon-next { + width: 30px; + height: 30px; + margin-top: -15px; + margin-left: -15px; + font-size: 30px; + } + .carousel-caption { + left: 20%; + right: 20%; + padding-bottom: 30px; + } + .carousel-indicators { + bottom: 20px; + } +} +.clearfix:before, +.clearfix:after, +.container:before, +.container:after, +.container-fluid:before, +.container-fluid:after, +.row:before, +.row:after, +.form-horizontal .form-group:before, +.form-horizontal .form-group:after, +.btn-toolbar:before, +.btn-toolbar:after, +.btn-group-vertical > .btn-group:before, +.btn-group-vertical > .btn-group:after, +.nav:before, +.nav:after, +.navbar:before, +.navbar:after, +.navbar-header:before, +.navbar-header:after, +.navbar-collapse:before, +.navbar-collapse:after, +.pager:before, +.pager:after, +.panel-body:before, +.panel-body:after, +.modal-footer:before, +.modal-footer:after { + content: " "; + display: table; +} +.clearfix:after, +.container:after, +.container-fluid:after, +.row:after, +.form-horizontal .form-group:after, +.btn-toolbar:after, +.btn-group-vertical > .btn-group:after, +.nav:after, +.navbar:after, +.navbar-header:after, +.navbar-collapse:after, +.pager:after, +.panel-body:after, +.modal-footer:after { + clear: both; +} +.center-block { + display: block; + margin-left: auto; + margin-right: auto; +} +.pull-right { + float: right !important; +} +.pull-left { + float: left !important; +} +.hide { + display: none !important; +} +.show { + display: block !important; +} +.invisible { + visibility: hidden; +} +.text-hide { + font: 0/0 a; + color: transparent; + text-shadow: none; + background-color: transparent; + border: 0; +} +.hidden { + display: none !important; + visibility: hidden !important; +} +.affix { + position: fixed; +} +@-ms-viewport { + width: device-width; +} +.visible-xs, +tr.visible-xs, +th.visible-xs, +td.visible-xs { + display: none !important; +} +@media (max-width: 767px) { + .visible-xs { + display: block !important; + } + table.visible-xs { + display: table; + } + tr.visible-xs { + display: table-row !important; + } + th.visible-xs, + td.visible-xs { + display: table-cell !important; + } +} +.visible-sm, +tr.visible-sm, +th.visible-sm, +td.visible-sm { + display: none !important; +} +@media (min-width: 768px) and (max-width: 991px) { + .visible-sm { + display: block !important; + } + table.visible-sm { + display: table; + } + tr.visible-sm { + display: table-row !important; + } + th.visible-sm, + td.visible-sm { + display: table-cell !important; + } +} +.visible-md, +tr.visible-md, +th.visible-md, +td.visible-md { + display: none !important; +} +@media (min-width: 992px) and (max-width: 1199px) { + .visible-md { + display: block !important; + } + table.visible-md { + display: table; + } + tr.visible-md { + display: table-row !important; + } + th.visible-md, + td.visible-md { + display: table-cell !important; + } +} +.visible-lg, +tr.visible-lg, +th.visible-lg, 
+td.visible-lg { + display: none !important; +} +@media (min-width: 1200px) { + .visible-lg { + display: block !important; + } + table.visible-lg { + display: table; + } + tr.visible-lg { + display: table-row !important; + } + th.visible-lg, + td.visible-lg { + display: table-cell !important; + } +} +@media (max-width: 767px) { + .hidden-xs, + tr.hidden-xs, + th.hidden-xs, + td.hidden-xs { + display: none !important; + } +} +@media (min-width: 768px) and (max-width: 991px) { + .hidden-sm, + tr.hidden-sm, + th.hidden-sm, + td.hidden-sm { + display: none !important; + } +} +@media (min-width: 992px) and (max-width: 1199px) { + .hidden-md, + tr.hidden-md, + th.hidden-md, + td.hidden-md { + display: none !important; + } +} +@media (min-width: 1200px) { + .hidden-lg, + tr.hidden-lg, + th.hidden-lg, + td.hidden-lg { + display: none !important; + } +} +.visible-print, +tr.visible-print, +th.visible-print, +td.visible-print { + display: none !important; +} +@media print { + .visible-print { + display: block !important; + } + table.visible-print { + display: table; + } + tr.visible-print { + display: table-row !important; + } + th.visible-print, + td.visible-print { + display: table-cell !important; + } +} +@media print { + .hidden-print, + tr.hidden-print, + th.hidden-print, + td.hidden-print { + display: none !important; + } +} diff --git a/backend/tests/integration/tests/pruning/website/css/custom-fonts.css b/backend/tests/integration/tests/pruning/website/css/custom-fonts.css new file mode 100644 index 00000000000..f3b62deb077 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/custom-fonts.css @@ -0,0 +1,1028 @@ +/* ================================================== +Font-Face Icons +================================================== */ + +@font-face { + font-family: "Icons"; + src: url("../fonts/customicon/Icons.eot"); + src: + url("../fonts/customicon/Icons.eot?#iefix") format("embedded-opentype"), + url("../fonts/customicon/Icons.woff") format("woff"), + url("../fonts/customicon/Icons.ttf") format("truetype"), + url("../fonts/customicon/Icons.svg#Icons") format("svg"); + font-weight: normal; + font-style: normal; +} + +/* Use the following CSS code if you want to use data attributes for inserting your icons */ +[data-icon]:before { + font-family: "Icons"; + content: attr(data-icon); + speak: none; + font-weight: normal; + font-variant: normal; + text-transform: none; + line-height: 1; + -webkit-font-smoothing: antialiased; +} + +[class^="font-"]:before, +[class*=" font-"]:before { + font-family: "Icons"; + speak: none; + font-style: normal; + font-weight: normal; + font-variant: normal; + text-transform: none; + -webkit-font-smoothing: antialiased; +} + +[class^="font-"], +[class*=" font-"] { + display: inline-block; + line-height: 1em; +} + +/* Use the following CSS code if you want to have a class per icon */ +/* +Instead of a list of all class selectors, +you can use the generic selector below, but it's slower: +[class*="font-icon-"] { +*/ +.font-icon-zoom-out, +.font-icon-zoom-in, +.font-icon-wrench, +.font-icon-waves, +.font-icon-warning, +.font-icon-volume-up, +.font-icon-volume-off, +.font-icon-volume-down, +.font-icon-viewport, +.font-icon-user, +.font-icon-user-border, +.font-icon-upload, +.font-icon-upload-2, +.font-icon-unlock, +.font-icon-underline, +.font-icon-tint, +.font-icon-time, +.font-icon-text, +.font-icon-text-width, +.font-icon-text-height, +.font-icon-tags, +.font-icon-tag, +.font-icon-table, +.font-icon-strikethrough, +.font-icon-stop, 
+.font-icon-step-forward, +.font-icon-step-backward, +.font-icon-stars, +.font-icon-star, +.font-icon-star-line, +.font-icon-star-half, +.font-icon-sort, +.font-icon-sort-up, +.font-icon-sort-down, +.font-icon-social-zerply, +.font-icon-social-youtube, +.font-icon-social-yelp, +.font-icon-social-yahoo, +.font-icon-social-wordpress, +.font-icon-social-virb, +.font-icon-social-vimeo, +.font-icon-social-viddler, +.font-icon-social-twitter, +.font-icon-social-tumblr, +.font-icon-social-stumbleupon, +.font-icon-social-soundcloud, +.font-icon-social-skype, +.font-icon-social-share-this, +.font-icon-social-quora, +.font-icon-social-pinterest, +.font-icon-social-photobucket, +.font-icon-social-paypal, +.font-icon-social-myspace, +.font-icon-social-linkedin, +.font-icon-social-last-fm, +.font-icon-social-grooveshark, +.font-icon-social-google-plus, +.font-icon-social-github, +.font-icon-social-forrst, +.font-icon-social-flickr, +.font-icon-social-facebook, +.font-icon-social-evernote, +.font-icon-social-envato, +.font-icon-social-email, +.font-icon-social-dribbble, +.font-icon-social-digg, +.font-icon-social-deviant-art, +.font-icon-social-blogger, +.font-icon-social-behance, +.font-icon-social-bebo, +.font-icon-social-addthis, +.font-icon-social-500px, +.font-icon-sitemap, +.font-icon-signout, +.font-icon-signin, +.font-icon-signal, +.font-icon-shopping-cart, +.font-icon-search, +.font-icon-rss, +.font-icon-road, +.font-icon-retweet, +.font-icon-resize-vertical, +.font-icon-resize-vertical-2, +.font-icon-resize-small, +.font-icon-resize-horizontal, +.font-icon-resize-horizontal-2, +.font-icon-resize-fullscreen, +.font-icon-resize-full, +.font-icon-repeat, +.font-icon-reorder, +.font-icon-remove, +.font-icon-remove-sign, +.font-icon-remove-circle, +.font-icon-read-more, +.font-icon-random, +.font-icon-question-sign, +.font-icon-pushpin, +.font-icon-pushpin-2, +.font-icon-print, +.font-icon-plus, +.font-icon-plus-sign, +.font-icon-play, +.font-icon-picture, +.font-icon-phone, +.font-icon-phone-sign, +.font-icon-phone-boxed, +.font-icon-pause, +.font-icon-paste, +.font-icon-paper-clip, +.font-icon-ok, +.font-icon-ok-sign, +.font-icon-ok-circle, +.font-icon-music, +.font-icon-move, +.font-icon-money, +.font-icon-minus, +.font-icon-minus-sign, +.font-icon-map, +.font-icon-map-marker, +.font-icon-map-marker-2, +.font-icon-magnet, +.font-icon-magic, +.font-icon-lock, +.font-icon-list, +.font-icon-list-3, +.font-icon-list-2, +.font-icon-link, +.font-icon-layer, +.font-icon-key, +.font-icon-italic, +.font-icon-info, +.font-icon-indent-right, +.font-icon-indent-left, +.font-icon-inbox, +.font-icon-inbox-empty, +.font-icon-home, +.font-icon-heart, +.font-icon-heart-line, +.font-icon-headphones, +.font-icon-headphones-line, +.font-icon-headphones-line-2, +.font-icon-headphones-2, +.font-icon-hdd, +.font-icon-group, +.font-icon-grid, +.font-icon-grid-large, +.font-icon-globe_line, +.font-icon-glass, +.font-icon-glass_2, +.font-icon-gift, +.font-icon-forward, +.font-icon-font, +.font-icon-folder-open, +.font-icon-folder-close, +.font-icon-flag, +.font-icon-fire, +.font-icon-film, +.font-icon-file, +.font-icon-file-empty, +.font-icon-fast-forward, +.font-icon-fast-backward, +.font-icon-facetime, +.font-icon-eye, +.font-icon-eye_disable, +.font-icon-expand-view, +.font-icon-expand-view-3, +.font-icon-expand-view-2, +.font-icon-expand-vertical, +.font-icon-expand-horizontal, +.font-icon-exclamation, +.font-icon-email, +.font-icon-email_2, +.font-icon-eject, +.font-icon-edit, +.font-icon-edit-check, 
+.font-icon-download, +.font-icon-download_2, +.font-icon-dashboard, +.font-icon-credit-card, +.font-icon-copy, +.font-icon-comments, +.font-icon-comments-line, +.font-icon-comment, +.font-icon-comment-line, +.font-icon-columns, +.font-icon-columns-2, +.font-icon-cogs, +.font-icon-cog, +.font-icon-cloud, +.font-icon-check, +.font-icon-check-empty, +.font-icon-certificate, +.font-icon-camera, +.font-icon-calendar, +.font-icon-bullhorn, +.font-icon-briefcase, +.font-icon-bookmark, +.font-icon-book, +.font-icon-bolt, +.font-icon-bold, +.font-icon-blockquote, +.font-icon-bell, +.font-icon-beaker, +.font-icon-barcode, +.font-icon-ban-circle, +.font-icon-ban-chart, +.font-icon-ban-chart-2, +.font-icon-backward, +.font-icon-asterisk, +.font-icon-arrow-simple-up, +.font-icon-arrow-simple-up-circle, +.font-icon-arrow-simple-right, +.font-icon-arrow-simple-right-circle, +.font-icon-arrow-simple-left, +.font-icon-arrow-simple-left-circle, +.font-icon-arrow-simple-down, +.font-icon-arrow-simple-down-circle, +.font-icon-arrow-round-up, +.font-icon-arrow-round-up-circle, +.font-icon-arrow-round-right, +.font-icon-arrow-round-right-circle, +.font-icon-arrow-round-left, +.font-icon-arrow-round-left-circle, +.font-icon-arrow-round-down, +.font-icon-arrow-round-down-circle, +.font-icon-arrow-light-up, +.font-icon-arrow-light-round-up, +.font-icon-arrow-light-round-up-circle, +.font-icon-arrow-light-round-right, +.font-icon-arrow-light-round-right-circle, +.font-icon-arrow-light-round-left, +.font-icon-arrow-light-round-left-circle, +.font-icon-arrow-light-round-down, +.font-icon-arrow-light-round-down-circle, +.font-icon-arrow-light-right, +.font-icon-arrow-light-left, +.font-icon-arrow-light-down, +.font-icon-align-right, +.font-icon-align-left, +.font-icon-align-justify, +.font-icon-align-center, +.font-icon-adjust { + font-family: "Icons"; + speak: none; + font-style: normal; + font-weight: normal; + font-variant: normal; + text-transform: none; + line-height: 1; + -webkit-font-smoothing: antialiased; +} +.font-icon-zoom-out:before { + content: "\e000"; +} +.font-icon-zoom-in:before { + content: "\e001"; +} +.font-icon-wrench:before { + content: "\e002"; +} +.font-icon-waves:before { + content: "\e003"; +} +.font-icon-warning:before { + content: "\e004"; +} +.font-icon-volume-up:before { + content: "\e005"; +} +.font-icon-volume-off:before { + content: "\e006"; +} +.font-icon-volume-down:before { + content: "\e007"; +} +.font-icon-viewport:before { + content: "\e008"; +} +.font-icon-user:before { + content: "\e009"; +} +.font-icon-user-border:before { + content: "\e00a"; +} +.font-icon-upload:before { + content: "\e00b"; +} +.font-icon-upload-2:before { + content: "\e00c"; +} +.font-icon-unlock:before { + content: "\e00d"; +} +.font-icon-underline:before { + content: "\e00e"; +} +.font-icon-tint:before { + content: "\e00f"; +} +.font-icon-time:before { + content: "\e010"; +} +.font-icon-text:before { + content: "\e011"; +} +.font-icon-text-width:before { + content: "\e012"; +} +.font-icon-text-height:before { + content: "\e013"; +} +.font-icon-tags:before { + content: "\e014"; +} +.font-icon-tag:before { + content: "\e015"; +} +.font-icon-table:before { + content: "\e016"; +} +.font-icon-strikethrough:before { + content: "\e017"; +} +.font-icon-stop:before { + content: "\e018"; +} +.font-icon-step-forward:before { + content: "\e019"; +} +.font-icon-step-backward:before { + content: "\e01a"; +} +.font-icon-stars:before { + content: "\e01b"; +} +.font-icon-star:before { + content: "\e01c"; +} 
+.font-icon-star-line:before { + content: "\e01d"; +} +.font-icon-star-half:before { + content: "\e01e"; +} +.font-icon-sort:before { + content: "\e01f"; +} +.font-icon-sort-up:before { + content: "\e020"; +} +.font-icon-sort-down:before { + content: "\e021"; +} +.font-icon-social-zerply:before { + content: "\e022"; +} +.font-icon-social-youtube:before { + content: "\e023"; +} +.font-icon-social-yelp:before { + content: "\e024"; +} +.font-icon-social-yahoo:before { + content: "\e025"; +} +.font-icon-social-wordpress:before { + content: "\e026"; +} +.font-icon-social-virb:before { + content: "\e027"; +} +.font-icon-social-vimeo:before { + content: "\e028"; +} +.font-icon-social-viddler:before { + content: "\e029"; +} +.font-icon-social-twitter:before { + content: "\e02a"; +} +.font-icon-social-tumblr:before { + content: "\e02b"; +} +.font-icon-social-stumbleupon:before { + content: "\e02c"; +} +.font-icon-social-soundcloud:before { + content: "\e02d"; +} +.font-icon-social-skype:before { + content: "\e02e"; +} +.font-icon-social-share-this:before { + content: "\e02f"; +} +.font-icon-social-quora:before { + content: "\e030"; +} +.font-icon-social-pinterest:before { + content: "\e031"; +} +.font-icon-social-photobucket:before { + content: "\e032"; +} +.font-icon-social-paypal:before { + content: "\e033"; +} +.font-icon-social-myspace:before { + content: "\e034"; +} +.font-icon-social-linkedin:before { + content: "\e035"; +} +.font-icon-social-last-fm:before { + content: "\e036"; +} +.font-icon-social-grooveshark:before { + content: "\e037"; +} +.font-icon-social-google-plus:before { + content: "\e038"; +} +.font-icon-social-github:before { + content: "\e039"; +} +.font-icon-social-forrst:before { + content: "\e03a"; +} +.font-icon-social-flickr:before { + content: "\e03b"; +} +.font-icon-social-facebook:before { + content: "\e03c"; +} +.font-icon-social-evernote:before { + content: "\e03d"; +} +.font-icon-social-envato:before { + content: "\e03e"; +} +.font-icon-social-email:before { + content: "\e03f"; +} +.font-icon-social-dribbble:before { + content: "\e040"; +} +.font-icon-social-digg:before { + content: "\e041"; +} +.font-icon-social-deviant-art:before { + content: "\e042"; +} +.font-icon-social-blogger:before { + content: "\e043"; +} +.font-icon-social-behance:before { + content: "\e044"; +} +.font-icon-social-bebo:before { + content: "\e045"; +} +.font-icon-social-addthis:before { + content: "\e046"; +} +.font-icon-social-500px:before { + content: "\e047"; +} +.font-icon-sitemap:before { + content: "\e048"; +} +.font-icon-signout:before { + content: "\e049"; +} +.font-icon-signin:before { + content: "\e04a"; +} +.font-icon-signal:before { + content: "\e04b"; +} +.font-icon-shopping-cart:before { + content: "\e04c"; +} +.font-icon-search:before { + content: "\e04d"; +} +.font-icon-rss:before { + content: "\e04e"; +} +.font-icon-road:before { + content: "\e04f"; +} +.font-icon-retweet:before { + content: "\e050"; +} +.font-icon-resize-vertical:before { + content: "\e051"; +} +.font-icon-resize-vertical-2:before { + content: "\e052"; +} +.font-icon-resize-small:before { + content: "\e053"; +} +.font-icon-resize-horizontal:before { + content: "\e054"; +} +.font-icon-resize-horizontal-2:before { + content: "\e055"; +} +.font-icon-resize-fullscreen:before { + content: "\e056"; +} +.font-icon-resize-full:before { + content: "\e057"; +} +.font-icon-repeat:before { + content: "\e058"; +} +.font-icon-reorder:before { + content: "\e059"; +} +.font-icon-remove:before { + content: "\e05a"; +} 
+.font-icon-remove-sign:before { + content: "\e05b"; +} +.font-icon-remove-circle:before { + content: "\e05c"; +} +.font-icon-read-more:before { + content: "\e05d"; +} +.font-icon-random:before { + content: "\e05e"; +} +.font-icon-question-sign:before { + content: "\e05f"; +} +.font-icon-pushpin:before { + content: "\e060"; +} +.font-icon-pushpin-2:before { + content: "\e061"; +} +.font-icon-print:before { + content: "\e062"; +} +.font-icon-plus:before { + content: "\e063"; +} +.font-icon-plus-sign:before { + content: "\e064"; +} +.font-icon-play:before { + content: "\e065"; +} +.font-icon-picture:before { + content: "\e066"; +} +.font-icon-phone:before { + content: "\e067"; +} +.font-icon-phone-sign:before { + content: "\e068"; +} +.font-icon-phone-boxed:before { + content: "\e069"; +} +.font-icon-pause:before { + content: "\e06a"; +} +.font-icon-paste:before { + content: "\e06b"; +} +.font-icon-paper-clip:before { + content: "\e06c"; +} +.font-icon-ok:before { + content: "\e06d"; +} +.font-icon-ok-sign:before { + content: "\e06e"; +} +.font-icon-ok-circle:before { + content: "\e06f"; +} +.font-icon-music:before { + content: "\e070"; +} +.font-icon-move:before { + content: "\e071"; +} +.font-icon-money:before { + content: "\e072"; +} +.font-icon-minus:before { + content: "\e073"; +} +.font-icon-minus-sign:before { + content: "\e074"; +} +.font-icon-map:before { + content: "\e075"; +} +.font-icon-map-marker:before { + content: "\e076"; +} +.font-icon-map-marker-2:before { + content: "\e077"; +} +.font-icon-magnet:before { + content: "\e078"; +} +.font-icon-magic:before { + content: "\e079"; +} +.font-icon-lock:before { + content: "\e07a"; +} +.font-icon-list:before { + content: "\e07b"; +} +.font-icon-list-3:before { + content: "\e07c"; +} +.font-icon-list-2:before { + content: "\e07d"; +} +.font-icon-link:before { + content: "\e07e"; +} +.font-icon-layer:before { + content: "\e07f"; +} +.font-icon-key:before { + content: "\e080"; +} +.font-icon-italic:before { + content: "\e081"; +} +.font-icon-info:before { + content: "\e082"; +} +.font-icon-indent-right:before { + content: "\e083"; +} +.font-icon-indent-left:before { + content: "\e084"; +} +.font-icon-inbox:before { + content: "\e085"; +} +.font-icon-inbox-empty:before { + content: "\e086"; +} +.font-icon-home:before { + content: "\e087"; +} +.font-icon-heart:before { + content: "\e088"; +} +.font-icon-heart-line:before { + content: "\e089"; +} +.font-icon-headphones:before { + content: "\e08a"; +} +.font-icon-headphones-line:before { + content: "\e08b"; +} +.font-icon-headphones-line-2:before { + content: "\e08c"; +} +.font-icon-headphones-2:before { + content: "\e08d"; +} +.font-icon-hdd:before { + content: "\e08e"; +} +.font-icon-group:before { + content: "\e08f"; +} +.font-icon-grid:before { + content: "\e090"; +} +.font-icon-grid-large:before { + content: "\e091"; +} +.font-icon-globe_line:before { + content: "\e092"; +} +.font-icon-glass:before { + content: "\e093"; +} +.font-icon-glass_2:before { + content: "\e094"; +} +.font-icon-gift:before { + content: "\e095"; +} +.font-icon-forward:before { + content: "\e096"; +} +.font-icon-font:before { + content: "\e097"; +} +.font-icon-folder-open:before { + content: "\e098"; +} +.font-icon-folder-close:before { + content: "\e099"; +} +.font-icon-flag:before { + content: "\e09a"; +} +.font-icon-fire:before { + content: "\e09b"; +} +.font-icon-film:before { + content: "\e09c"; +} +.font-icon-file:before { + content: "\e09d"; +} +.font-icon-file-empty:before { + content: "\e09e"; +} 
+.font-icon-fast-forward:before { + content: "\e09f"; +} +.font-icon-fast-backward:before { + content: "\e0a0"; +} +.font-icon-facetime:before { + content: "\e0a1"; +} +.font-icon-eye:before { + content: "\e0a2"; +} +.font-icon-eye_disable:before { + content: "\e0a3"; +} +.font-icon-expand-view:before { + content: "\e0a4"; +} +.font-icon-expand-view-3:before { + content: "\e0a5"; +} +.font-icon-expand-view-2:before { + content: "\e0a6"; +} +.font-icon-expand-vertical:before { + content: "\e0a7"; +} +.font-icon-expand-horizontal:before { + content: "\e0a8"; +} +.font-icon-exclamation:before { + content: "\e0a9"; +} +.font-icon-email:before { + content: "\e0aa"; +} +.font-icon-email_2:before { + content: "\e0ab"; +} +.font-icon-eject:before { + content: "\e0ac"; +} +.font-icon-edit:before { + content: "\e0ad"; +} +.font-icon-edit-check:before { + content: "\e0ae"; +} +.font-icon-download:before { + content: "\e0af"; +} +.font-icon-download_2:before { + content: "\e0b0"; +} +.font-icon-dashboard:before { + content: "\e0b1"; +} +.font-icon-credit-card:before { + content: "\e0b2"; +} +.font-icon-copy:before { + content: "\e0b3"; +} +.font-icon-comments:before { + content: "\e0b4"; +} +.font-icon-comments-line:before { + content: "\e0b5"; +} +.font-icon-comment:before { + content: "\e0b6"; +} +.font-icon-comment-line:before { + content: "\e0b7"; +} +.font-icon-columns:before { + content: "\e0b8"; +} +.font-icon-columns-2:before { + content: "\e0b9"; +} +.font-icon-cogs:before { + content: "\e0ba"; +} +.font-icon-cog:before { + content: "\e0bb"; +} +.font-icon-cloud:before { + content: "\e0bc"; +} +.font-icon-check:before { + content: "\e0bd"; +} +.font-icon-check-empty:before { + content: "\e0be"; +} +.font-icon-certificate:before { + content: "\e0bf"; +} +.font-icon-camera:before { + content: "\e0c0"; +} +.font-icon-calendar:before { + content: "\e0c1"; +} +.font-icon-bullhorn:before { + content: "\e0c2"; +} +.font-icon-briefcase:before { + content: "\e0c3"; +} +.font-icon-bookmark:before { + content: "\e0c4"; +} +.font-icon-book:before { + content: "\e0c5"; +} +.font-icon-bolt:before { + content: "\e0c6"; +} +.font-icon-bold:before { + content: "\e0c7"; +} +.font-icon-blockquote:before { + content: "\e0c8"; +} +.font-icon-bell:before { + content: "\e0c9"; +} +.font-icon-beaker:before { + content: "\e0ca"; +} +.font-icon-barcode:before { + content: "\e0cb"; +} +.font-icon-ban-circle:before { + content: "\e0cc"; +} +.font-icon-ban-chart:before { + content: "\e0cd"; +} +.font-icon-ban-chart-2:before { + content: "\e0ce"; +} +.font-icon-backward:before { + content: "\e0cf"; +} +.font-icon-asterisk:before { + content: "\e0d0"; +} +.font-icon-arrow-simple-up:before { + content: "\e0d1"; +} +.font-icon-arrow-simple-up-circle:before { + content: "\e0d2"; +} +.font-icon-arrow-simple-right:before { + content: "\e0d3"; +} +.font-icon-arrow-simple-right-circle:before { + content: "\e0d4"; +} +.font-icon-arrow-simple-left:before { + content: "\e0d5"; +} +.font-icon-arrow-simple-left-circle:before { + content: "\e0d6"; +} +.font-icon-arrow-simple-down:before { + content: "\e0d7"; +} +.font-icon-arrow-simple-down-circle:before { + content: "\e0d8"; +} +.font-icon-arrow-round-up:before { + content: "\e0d9"; +} +.font-icon-arrow-round-up-circle:before { + content: "\e0da"; +} +.font-icon-arrow-round-right:before { + content: "\e0db"; +} +.font-icon-arrow-round-right-circle:before { + content: "\e0dc"; +} +.font-icon-arrow-round-left:before { + content: "\e0dd"; +} +.font-icon-arrow-round-left-circle:before { + 
content: "\e0de"; +} +.font-icon-arrow-round-down:before { + content: "\e0df"; +} +.font-icon-arrow-round-down-circle:before { + content: "\e0e0"; +} +.font-icon-arrow-light-up:before { + content: "\e0e1"; +} +.font-icon-arrow-light-round-up:before { + content: "\e0e2"; +} +.font-icon-arrow-light-round-up-circle:before { + content: "\e0e3"; +} +.font-icon-arrow-light-round-right:before { + content: "\e0e4"; +} +.font-icon-arrow-light-round-right-circle:before { + content: "\e0e5"; +} +.font-icon-arrow-light-round-left:before { + content: "\e0e6"; +} +.font-icon-arrow-light-round-left-circle:before { + content: "\e0e7"; +} +.font-icon-arrow-light-round-down:before { + content: "\e0e8"; +} +.font-icon-arrow-light-round-down-circle:before { + content: "\e0e9"; +} +.font-icon-arrow-light-right:before { + content: "\e0ea"; +} +.font-icon-arrow-light-left:before { + content: "\e0eb"; +} +.font-icon-arrow-light-down:before { + content: "\e0ec"; +} +.font-icon-align-right:before { + content: "\e0ed"; +} +.font-icon-align-left:before { + content: "\e0ee"; +} +.font-icon-align-justify:before { + content: "\e0ef"; +} +.font-icon-align-center:before { + content: "\e0f0"; +} +.font-icon-adjust:before { + content: "\e0f1"; +} diff --git a/backend/tests/integration/tests/pruning/website/css/fancybox/blank.gif b/backend/tests/integration/tests/pruning/website/css/fancybox/blank.gif new file mode 100644 index 00000000000..35d42e808f0 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/css/fancybox/blank.gif differ diff --git a/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_loading.gif b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_loading.gif new file mode 100644 index 00000000000..01586176d79 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_loading.gif differ diff --git a/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_overlay.png b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_overlay.png new file mode 100644 index 00000000000..a4391396a9d Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_overlay.png differ diff --git a/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_sprite.png b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_sprite.png new file mode 100644 index 00000000000..fd8d5ca566d Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/css/fancybox/fancybox_sprite.png differ diff --git a/backend/tests/integration/tests/pruning/website/css/fancybox/jquery.fancybox.css b/backend/tests/integration/tests/pruning/website/css/fancybox/jquery.fancybox.css new file mode 100644 index 00000000000..a20015ff683 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/fancybox/jquery.fancybox.css @@ -0,0 +1,349 @@ +/*! 
fancyBox v2.1.4 fancyapps.com | fancyapps.com/fancybox/#license */ +.fancybox-wrap, +.fancybox-skin, +.fancybox-outer, +.fancybox-inner, +.fancybox-image, +.fancybox-wrap iframe, +.fancybox-wrap object, +.fancybox-nav, +.fancybox-nav span, +.fancybox-tmp { + padding: 0; + margin: 0; + border: 0; + outline: none; + vertical-align: top; +} + +.fancybox-wrap { + position: absolute; + top: 0; + left: 0; + z-index: 8020; +} + +.fancybox-skin { + position: relative; + background: #2f3238; + color: #565656; + text-shadow: none; + -webkit-border-radius: 0; + -moz-border-radius: 0; + border-radius: 0; +} + +.fancybox-opened { + z-index: 8030; +} + +.fancybox-opened .fancybox-skin { + -webkit-box-shadow: none; + -moz-box-shadow: none; + box-shadow: none; +} + +.fancybox-outer, +.fancybox-inner { + position: relative; +} + +.fancybox-inner { + overflow: hidden; +} + +.fancybox-type-iframe .fancybox-inner { + -webkit-overflow-scrolling: touch; +} + +.fancybox-error { + color: #444; + font-size: 14px; + line-height: 20px; + margin: 0; + padding: 15px; + white-space: nowrap; +} + +.fancybox-image, +.fancybox-iframe { + display: block; + width: 100%; + height: 100%; +} + +.fancybox-image { + max-width: 100%; + max-height: 100%; +} + +#fancybox-loading, +.fancybox-close, +.fancybox-prev span, +.fancybox-next span { + background-image: url("fancybox_sprite.png") !important; +} + +#fancybox-loading { + position: fixed; + top: 50%; + left: 50%; + margin-top: -22px; + margin-left: -22px; + background-position: 0 -108px; + opacity: 0.8; + cursor: pointer; + z-index: 8060; +} + +#fancybox-loading div { + width: 44px; + height: 44px; + background: url("fancybox_loading.gif") center center no-repeat; +} + +.fancybox-close { + position: absolute; + right: 0; + top: 0; + width: 40px; + height: 38px; + cursor: pointer; + z-index: 9000; + background-image: none; + + opacity: 0.5; + + -webkit-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + -moz-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + -o-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; +} + +.fancybox-close i { + left: 50%; + top: 50%; + margin: -11px 0 0 -11px; + font-size: 22px; + line-height: 1em; + position: absolute; + color: #ffffff; +} + +.fancybox-close:hover { + opacity: 1; +} + +.fancybox-nav { + position: absolute; + top: 0; + height: 100%; + cursor: pointer; + text-decoration: none; + background: transparent url("blank.gif"); /* helps IE */ + -webkit-tap-highlight-color: rgba(0, 0, 0, 0); + z-index: 8040; +} + +.fancybox-prev, +.fancybox-prev span { + left: 0; +} + +.fancybox-next, +.fancybox-next span { + right: 0; +} + +.fancybox-nav span { + position: absolute; + top: 50%; + width: 44px; + height: 32px; + margin-top: -25px; + cursor: pointer; + z-index: 8040; + background-image: none; + background-color: #26292e; + background-position-y: -38px; + opacity: 0.5; + + -webkit-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + -moz-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + -o-transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; + transition: + background 0.1s linear 0s, + opacity 0.1s linear 0s; +} +.fancybox-next span { + background-position-y: -72px; +} +.fancybox-prev span i { + left: 50%; + top: 50%; + margin: -15px 0 0 -17px; + font-size: 30px; + line-height: 1em; + position: absolute; + color: #ffffff; +} + +.fancybox-next span i { + left: 50%; + top: 50%; + 
margin: -15px 0 0 -15px; + font-size: 30px; + line-height: 1em; + position: absolute; + color: #ffffff; +} + +.fancybox-nav:hover span { + opacity: 1; +} + +.fancybox-tmp { + position: absolute; + top: -99999px; + left: -99999px; + visibility: hidden; + max-width: 99999px; + max-height: 99999px; + overflow: visible !important; +} + +/* Overlay helper */ + +.fancybox-lock { + margin: 0 !important; +} + +.fancybox-overlay { + position: absolute; + top: 0; + left: 0; + overflow: hidden !important; + display: none; + z-index: 8010; + background: url("fancybox_overlay.png"); +} + +.fancybox-overlay-fixed { + position: fixed; + bottom: 0; + right: 0; +} + +.fancybox-lock .fancybox-overlay { + overflow: auto; + overflow-y: scroll; +} + +/* Title helper */ + +.fancybox-title { + visibility: hidden; + position: relative; + text-shadow: none; + z-index: 8050; +} + +.fancybox-opened .fancybox-title { + visibility: visible; +} + +.fancybox-opened .fancybox-title h4 { + font-size: 24px; + color: #fff; + font-weight: 300; + margin-bottom: 10px; +} + +.fancybox-opened .fancybox-title p { + font-size: 16px; + font-weight: 300; + color: #bbb; + line-height: 1.6em; + margin-bottom: 0; +} + +.fancybox-title-float-wrap { + position: absolute; + bottom: 0; + right: 50%; + margin-bottom: -35px; + z-index: 8050; + text-align: center; +} + +.fancybox-title-float-wrap .child { + display: inline-block; + margin-right: -100%; + padding: 2px 20px; + background: transparent; /* Fallback for web browsers that doesn't support RGBa */ + background: rgba(0, 0, 0, 0.8); + -webkit-border-radius: 15px; + -moz-border-radius: 15px; + border-radius: 15px; + text-shadow: 0 1px 2px #222; + color: #fff; + font-weight: bold; + line-height: 24px; + white-space: nowrap; +} + +.fancybox-title-outside-wrap { + position: relative; + margin-top: 10px; + color: #fff; +} + +.fancybox-title-inside-wrap { + padding: 3px 30px 6px; + background: #61b331; +} + +.fancybox-title-over-wrap { + position: absolute; + bottom: 0; + left: 0; + color: #fff; + padding: 10px; + background: #000; + background: rgba(0, 0, 0, 0.8); +} + +@media (max-width: 480px) { + .fancybox-nav span, + .fancybox-nav:hover span, + .fancybox-close, + .fancybox-close:hover { + background: transparent; + } + + .fancybox-close i { + left: 70px; + top: 10px; + } +} + +@media (max-width: 320px) { + .fancybox-close i { + left: 30px; + top: 20px; + } +} diff --git a/backend/tests/integration/tests/pruning/website/css/font-awesome.css b/backend/tests/integration/tests/pruning/website/css/font-awesome.css new file mode 100644 index 00000000000..49a13c9a58d --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/font-awesome.css @@ -0,0 +1,1344 @@ +/*! 
+ * Font Awesome 4.0.3 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */ +/* FONT PATH + * -------------------------- */ +@font-face { + font-family: "FontAwesome"; + src: url("../fonts/fontawesome-webfont.eot?v=4.0.3"); + src: + url("../fonts/fontawesome-webfont.eot?#iefix&v=4.0.3") + format("embedded-opentype"), + url("../fonts/fontawesome-webfont.woff?v=4.0.3") format("woff"), + url("../fonts/fontawesome-webfont.ttf?v=4.0.3") format("truetype"), + url("../fonts/fontawesome-webfont.svg?v=4.0.3#fontawesomeregular") + format("svg"); + font-weight: normal; + font-style: normal; +} +.fa { + display: inline-block; + font-family: FontAwesome; + font-style: normal; + font-weight: normal; + line-height: 1; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} +/* makes the font 33% larger relative to the icon container */ +.fa-lg { + font-size: 1.3333333333333333em; + line-height: 0.75em; + vertical-align: -15%; +} +.fa-2x { + font-size: 2em; +} +.fa-3x { + font-size: 3em; +} +.fa-4x { + font-size: 4em; +} +.fa-5x { + font-size: 5em; +} +.fa-fw { + width: 1.2857142857142858em; + text-align: center; +} +.fa-ul { + padding-left: 0; + margin-left: 2.142857142857143em; + list-style-type: none; +} +.fa-ul > li { + position: relative; +} +.fa-li { + position: absolute; + left: -2.142857142857143em; + width: 2.142857142857143em; + top: 0.14285714285714285em; + text-align: center; +} +.fa-li.fa-lg { + left: -1.8571428571428572em; +} +.fa-border { + padding: 0.2em 0.25em 0.15em; + border: solid 0.08em #eeeeee; + border-radius: 0.1em; +} +.pull-right { + float: right; +} +.pull-left { + float: left; +} +.fa.pull-left { + margin-right: 0.3em; +} +.fa.pull-right { + margin-left: 0.3em; +} +.fa-spin { + -webkit-animation: spin 2s infinite linear; + -moz-animation: spin 2s infinite linear; + -o-animation: spin 2s infinite linear; + animation: spin 2s infinite linear; +} +@-moz-keyframes spin { + 0% { + -moz-transform: rotate(0deg); + } + 100% { + -moz-transform: rotate(359deg); + } +} +@-webkit-keyframes spin { + 0% { + -webkit-transform: rotate(0deg); + } + 100% { + -webkit-transform: rotate(359deg); + } +} +@-o-keyframes spin { + 0% { + -o-transform: rotate(0deg); + } + 100% { + -o-transform: rotate(359deg); + } +} +@-ms-keyframes spin { + 0% { + -ms-transform: rotate(0deg); + } + 100% { + -ms-transform: rotate(359deg); + } +} +@keyframes spin { + 0% { + transform: rotate(0deg); + } + 100% { + transform: rotate(359deg); + } +} +.fa-rotate-90 { + filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=1); + -webkit-transform: rotate(90deg); + -moz-transform: rotate(90deg); + -ms-transform: rotate(90deg); + -o-transform: rotate(90deg); + transform: rotate(90deg); +} +.fa-rotate-180 { + filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=2); + -webkit-transform: rotate(180deg); + -moz-transform: rotate(180deg); + -ms-transform: rotate(180deg); + -o-transform: rotate(180deg); + transform: rotate(180deg); +} +.fa-rotate-270 { + filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=3); + -webkit-transform: rotate(270deg); + -moz-transform: rotate(270deg); + -ms-transform: rotate(270deg); + -o-transform: rotate(270deg); + transform: rotate(270deg); +} +.fa-flip-horizontal { + filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1); + -webkit-transform: scale(-1, 1); + -moz-transform: scale(-1, 1); + -ms-transform: scale(-1, 1); + -o-transform: scale(-1, 1); 
+ transform: scale(-1, 1); +} +.fa-flip-vertical { + filter: progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1); + -webkit-transform: scale(1, -1); + -moz-transform: scale(1, -1); + -ms-transform: scale(1, -1); + -o-transform: scale(1, -1); + transform: scale(1, -1); +} +.fa-stack { + position: relative; + display: inline-block; + width: 2em; + height: 2em; + line-height: 2em; + vertical-align: middle; +} +.fa-stack-1x, +.fa-stack-2x { + position: absolute; + left: 0; + width: 100%; + text-align: center; +} +.fa-stack-1x { + line-height: inherit; +} +.fa-stack-2x { + font-size: 2em; +} +.fa-inverse { + color: #ffffff; +} +/* Font Awesome uses the Unicode Private Use Area (PUA) to ensure screen + readers do not read off random characters that represent icons */ +.fa-glass:before { + content: "\f000"; +} +.fa-music:before { + content: "\f001"; +} +.fa-search:before { + content: "\f002"; +} +.fa-envelope-o:before { + content: "\f003"; +} +.fa-heart:before { + content: "\f004"; +} +.fa-star:before { + content: "\f005"; +} +.fa-star-o:before { + content: "\f006"; +} +.fa-user:before { + content: "\f007"; +} +.fa-film:before { + content: "\f008"; +} +.fa-th-large:before { + content: "\f009"; +} +.fa-th:before { + content: "\f00a"; +} +.fa-th-list:before { + content: "\f00b"; +} +.fa-check:before { + content: "\f00c"; +} +.fa-times:before { + content: "\f00d"; +} +.fa-search-plus:before { + content: "\f00e"; +} +.fa-search-minus:before { + content: "\f010"; +} +.fa-power-off:before { + content: "\f011"; +} +.fa-signal:before { + content: "\f012"; +} +.fa-gear:before, +.fa-cog:before { + content: "\f013"; +} +.fa-trash-o:before { + content: "\f014"; +} +.fa-home:before { + content: "\f015"; +} +.fa-file-o:before { + content: "\f016"; +} +.fa-clock-o:before { + content: "\f017"; +} +.fa-road:before { + content: "\f018"; +} +.fa-download:before { + content: "\f019"; +} +.fa-arrow-circle-o-down:before { + content: "\f01a"; +} +.fa-arrow-circle-o-up:before { + content: "\f01b"; +} +.fa-inbox:before { + content: "\f01c"; +} +.fa-play-circle-o:before { + content: "\f01d"; +} +.fa-rotate-right:before, +.fa-repeat:before { + content: "\f01e"; +} +.fa-refresh:before { + content: "\f021"; +} +.fa-list-alt:before { + content: "\f022"; +} +.fa-lock:before { + content: "\f023"; +} +.fa-flag:before { + content: "\f024"; +} +.fa-headphones:before { + content: "\f025"; +} +.fa-volume-off:before { + content: "\f026"; +} +.fa-volume-down:before { + content: "\f027"; +} +.fa-volume-up:before { + content: "\f028"; +} +.fa-qrcode:before { + content: "\f029"; +} +.fa-barcode:before { + content: "\f02a"; +} +.fa-tag:before { + content: "\f02b"; +} +.fa-tags:before { + content: "\f02c"; +} +.fa-book:before { + content: "\f02d"; +} +.fa-bookmark:before { + content: "\f02e"; +} +.fa-print:before { + content: "\f02f"; +} +.fa-camera:before { + content: "\f030"; +} +.fa-font:before { + content: "\f031"; +} +.fa-bold:before { + content: "\f032"; +} +.fa-italic:before { + content: "\f033"; +} +.fa-text-height:before { + content: "\f034"; +} +.fa-text-width:before { + content: "\f035"; +} +.fa-align-left:before { + content: "\f036"; +} +.fa-align-center:before { + content: "\f037"; +} +.fa-align-right:before { + content: "\f038"; +} +.fa-align-justify:before { + content: "\f039"; +} +.fa-list:before { + content: "\f03a"; +} +.fa-dedent:before, +.fa-outdent:before { + content: "\f03b"; +} +.fa-indent:before { + content: "\f03c"; +} +.fa-video-camera:before { + content: "\f03d"; +} +.fa-picture-o:before { + 
content: "\f03e"; +} +.fa-pencil:before { + content: "\f040"; +} +.fa-map-marker:before { + content: "\f041"; +} +.fa-adjust:before { + content: "\f042"; +} +.fa-tint:before { + content: "\f043"; +} +.fa-edit:before, +.fa-pencil-square-o:before { + content: "\f044"; +} +.fa-share-square-o:before { + content: "\f045"; +} +.fa-check-square-o:before { + content: "\f046"; +} +.fa-arrows:before { + content: "\f047"; +} +.fa-step-backward:before { + content: "\f048"; +} +.fa-fast-backward:before { + content: "\f049"; +} +.fa-backward:before { + content: "\f04a"; +} +.fa-play:before { + content: "\f04b"; +} +.fa-pause:before { + content: "\f04c"; +} +.fa-stop:before { + content: "\f04d"; +} +.fa-forward:before { + content: "\f04e"; +} +.fa-fast-forward:before { + content: "\f050"; +} +.fa-step-forward:before { + content: "\f051"; +} +.fa-eject:before { + content: "\f052"; +} +.fa-chevron-left:before { + content: "\f053"; +} +.fa-chevron-right:before { + content: "\f054"; +} +.fa-plus-circle:before { + content: "\f055"; +} +.fa-minus-circle:before { + content: "\f056"; +} +.fa-times-circle:before { + content: "\f057"; +} +.fa-check-circle:before { + content: "\f058"; +} +.fa-question-circle:before { + content: "\f059"; +} +.fa-info-circle:before { + content: "\f05a"; +} +.fa-crosshairs:before { + content: "\f05b"; +} +.fa-times-circle-o:before { + content: "\f05c"; +} +.fa-check-circle-o:before { + content: "\f05d"; +} +.fa-ban:before { + content: "\f05e"; +} +.fa-arrow-left:before { + content: "\f060"; +} +.fa-arrow-right:before { + content: "\f061"; +} +.fa-arrow-up:before { + content: "\f062"; +} +.fa-arrow-down:before { + content: "\f063"; +} +.fa-mail-forward:before, +.fa-share:before { + content: "\f064"; +} +.fa-expand:before { + content: "\f065"; +} +.fa-compress:before { + content: "\f066"; +} +.fa-plus:before { + content: "\f067"; +} +.fa-minus:before { + content: "\f068"; +} +.fa-asterisk:before { + content: "\f069"; +} +.fa-exclamation-circle:before { + content: "\f06a"; +} +.fa-gift:before { + content: "\f06b"; +} +.fa-leaf:before { + content: "\f06c"; +} +.fa-fire:before { + content: "\f06d"; +} +.fa-eye:before { + content: "\f06e"; +} +.fa-eye-slash:before { + content: "\f070"; +} +.fa-warning:before, +.fa-exclamation-triangle:before { + content: "\f071"; +} +.fa-plane:before { + content: "\f072"; +} +.fa-calendar:before { + content: "\f073"; +} +.fa-random:before { + content: "\f074"; +} +.fa-comment:before { + content: "\f075"; +} +.fa-magnet:before { + content: "\f076"; +} +.fa-chevron-up:before { + content: "\f077"; +} +.fa-chevron-down:before { + content: "\f078"; +} +.fa-retweet:before { + content: "\f079"; +} +.fa-shopping-cart:before { + content: "\f07a"; +} +.fa-folder:before { + content: "\f07b"; +} +.fa-folder-open:before { + content: "\f07c"; +} +.fa-arrows-v:before { + content: "\f07d"; +} +.fa-arrows-h:before { + content: "\f07e"; +} +.fa-bar-chart-o:before { + content: "\f080"; +} +.fa-twitter-square:before { + content: "\f081"; +} +.fa-facebook-square:before { + content: "\f082"; +} +.fa-camera-retro:before { + content: "\f083"; +} +.fa-key:before { + content: "\f084"; +} +.fa-gears:before, +.fa-cogs:before { + content: "\f085"; +} +.fa-comments:before { + content: "\f086"; +} +.fa-thumbs-o-up:before { + content: "\f087"; +} +.fa-thumbs-o-down:before { + content: "\f088"; +} +.fa-star-half:before { + content: "\f089"; +} +.fa-heart-o:before { + content: "\f08a"; +} +.fa-sign-out:before { + content: "\f08b"; +} +.fa-linkedin-square:before { + content: "\f08c"; +} 
+.fa-thumb-tack:before { + content: "\f08d"; +} +.fa-external-link:before { + content: "\f08e"; +} +.fa-sign-in:before { + content: "\f090"; +} +.fa-trophy:before { + content: "\f091"; +} +.fa-github-square:before { + content: "\f092"; +} +.fa-upload:before { + content: "\f093"; +} +.fa-lemon-o:before { + content: "\f094"; +} +.fa-phone:before { + content: "\f095"; +} +.fa-square-o:before { + content: "\f096"; +} +.fa-bookmark-o:before { + content: "\f097"; +} +.fa-phone-square:before { + content: "\f098"; +} +.fa-twitter:before { + content: "\f099"; +} +.fa-facebook:before { + content: "\f09a"; +} +.fa-github:before { + content: "\f09b"; +} +.fa-unlock:before { + content: "\f09c"; +} +.fa-credit-card:before { + content: "\f09d"; +} +.fa-rss:before { + content: "\f09e"; +} +.fa-hdd-o:before { + content: "\f0a0"; +} +.fa-bullhorn:before { + content: "\f0a1"; +} +.fa-bell:before { + content: "\f0f3"; +} +.fa-certificate:before { + content: "\f0a3"; +} +.fa-hand-o-right:before { + content: "\f0a4"; +} +.fa-hand-o-left:before { + content: "\f0a5"; +} +.fa-hand-o-up:before { + content: "\f0a6"; +} +.fa-hand-o-down:before { + content: "\f0a7"; +} +.fa-arrow-circle-left:before { + content: "\f0a8"; +} +.fa-arrow-circle-right:before { + content: "\f0a9"; +} +.fa-arrow-circle-up:before { + content: "\f0aa"; +} +.fa-arrow-circle-down:before { + content: "\f0ab"; +} +.fa-globe:before { + content: "\f0ac"; +} +.fa-wrench:before { + content: "\f0ad"; +} +.fa-tasks:before { + content: "\f0ae"; +} +.fa-filter:before { + content: "\f0b0"; +} +.fa-briefcase:before { + content: "\f0b1"; +} +.fa-arrows-alt:before { + content: "\f0b2"; +} +.fa-group:before, +.fa-users:before { + content: "\f0c0"; +} +.fa-chain:before, +.fa-link:before { + content: "\f0c1"; +} +.fa-cloud:before { + content: "\f0c2"; +} +.fa-flask:before { + content: "\f0c3"; +} +.fa-cut:before, +.fa-scissors:before { + content: "\f0c4"; +} +.fa-copy:before, +.fa-files-o:before { + content: "\f0c5"; +} +.fa-paperclip:before { + content: "\f0c6"; +} +.fa-save:before, +.fa-floppy-o:before { + content: "\f0c7"; +} +.fa-square:before { + content: "\f0c8"; +} +.fa-bars:before { + content: "\f0c9"; +} +.fa-list-ul:before { + content: "\f0ca"; +} +.fa-list-ol:before { + content: "\f0cb"; +} +.fa-strikethrough:before { + content: "\f0cc"; +} +.fa-underline:before { + content: "\f0cd"; +} +.fa-table:before { + content: "\f0ce"; +} +.fa-magic:before { + content: "\f0d0"; +} +.fa-truck:before { + content: "\f0d1"; +} +.fa-pinterest:before { + content: "\f0d2"; +} +.fa-pinterest-square:before { + content: "\f0d3"; +} +.fa-google-plus-square:before { + content: "\f0d4"; +} +.fa-google-plus:before { + content: "\f0d5"; +} +.fa-money:before { + content: "\f0d6"; +} +.fa-caret-down:before { + content: "\f0d7"; +} +.fa-caret-up:before { + content: "\f0d8"; +} +.fa-caret-left:before { + content: "\f0d9"; +} +.fa-caret-right:before { + content: "\f0da"; +} +.fa-columns:before { + content: "\f0db"; +} +.fa-unsorted:before, +.fa-sort:before { + content: "\f0dc"; +} +.fa-sort-down:before, +.fa-sort-asc:before { + content: "\f0dd"; +} +.fa-sort-up:before, +.fa-sort-desc:before { + content: "\f0de"; +} +.fa-envelope:before { + content: "\f0e0"; +} +.fa-linkedin:before { + content: "\f0e1"; +} +.fa-rotate-left:before, +.fa-undo:before { + content: "\f0e2"; +} +.fa-legal:before, +.fa-gavel:before { + content: "\f0e3"; +} +.fa-dashboard:before, +.fa-tachometer:before { + content: "\f0e4"; +} +.fa-comment-o:before { + content: "\f0e5"; +} +.fa-comments-o:before { + 
content: "\f0e6"; +} +.fa-flash:before, +.fa-bolt:before { + content: "\f0e7"; +} +.fa-sitemap:before { + content: "\f0e8"; +} +.fa-umbrella:before { + content: "\f0e9"; +} +.fa-paste:before, +.fa-clipboard:before { + content: "\f0ea"; +} +.fa-lightbulb-o:before { + content: "\f0eb"; +} +.fa-exchange:before { + content: "\f0ec"; +} +.fa-cloud-download:before { + content: "\f0ed"; +} +.fa-cloud-upload:before { + content: "\f0ee"; +} +.fa-user-md:before { + content: "\f0f0"; +} +.fa-stethoscope:before { + content: "\f0f1"; +} +.fa-suitcase:before { + content: "\f0f2"; +} +.fa-bell-o:before { + content: "\f0a2"; +} +.fa-coffee:before { + content: "\f0f4"; +} +.fa-cutlery:before { + content: "\f0f5"; +} +.fa-file-text-o:before { + content: "\f0f6"; +} +.fa-building-o:before { + content: "\f0f7"; +} +.fa-hospital-o:before { + content: "\f0f8"; +} +.fa-ambulance:before { + content: "\f0f9"; +} +.fa-medkit:before { + content: "\f0fa"; +} +.fa-fighter-jet:before { + content: "\f0fb"; +} +.fa-beer:before { + content: "\f0fc"; +} +.fa-h-square:before { + content: "\f0fd"; +} +.fa-plus-square:before { + content: "\f0fe"; +} +.fa-angle-double-left:before { + content: "\f100"; +} +.fa-angle-double-right:before { + content: "\f101"; +} +.fa-angle-double-up:before { + content: "\f102"; +} +.fa-angle-double-down:before { + content: "\f103"; +} +.fa-angle-left:before { + content: "\f104"; +} +.fa-angle-right:before { + content: "\f105"; +} +.fa-angle-up:before { + content: "\f106"; +} +.fa-angle-down:before { + content: "\f107"; +} +.fa-desktop:before { + content: "\f108"; +} +.fa-laptop:before { + content: "\f109"; +} +.fa-tablet:before { + content: "\f10a"; +} +.fa-mobile-phone:before, +.fa-mobile:before { + content: "\f10b"; +} +.fa-circle-o:before { + content: "\f10c"; +} +.fa-quote-left:before { + content: "\f10d"; +} +.fa-quote-right:before { + content: "\f10e"; +} +.fa-spinner:before { + content: "\f110"; +} +.fa-circle:before { + content: "\f111"; +} +.fa-mail-reply:before, +.fa-reply:before { + content: "\f112"; +} +.fa-github-alt:before { + content: "\f113"; +} +.fa-folder-o:before { + content: "\f114"; +} +.fa-folder-open-o:before { + content: "\f115"; +} +.fa-smile-o:before { + content: "\f118"; +} +.fa-frown-o:before { + content: "\f119"; +} +.fa-meh-o:before { + content: "\f11a"; +} +.fa-gamepad:before { + content: "\f11b"; +} +.fa-keyboard-o:before { + content: "\f11c"; +} +.fa-flag-o:before { + content: "\f11d"; +} +.fa-flag-checkered:before { + content: "\f11e"; +} +.fa-terminal:before { + content: "\f120"; +} +.fa-code:before { + content: "\f121"; +} +.fa-reply-all:before { + content: "\f122"; +} +.fa-mail-reply-all:before { + content: "\f122"; +} +.fa-star-half-empty:before, +.fa-star-half-full:before, +.fa-star-half-o:before { + content: "\f123"; +} +.fa-location-arrow:before { + content: "\f124"; +} +.fa-crop:before { + content: "\f125"; +} +.fa-code-fork:before { + content: "\f126"; +} +.fa-unlink:before, +.fa-chain-broken:before { + content: "\f127"; +} +.fa-question:before { + content: "\f128"; +} +.fa-info:before { + content: "\f129"; +} +.fa-exclamation:before { + content: "\f12a"; +} +.fa-superscript:before { + content: "\f12b"; +} +.fa-subscript:before { + content: "\f12c"; +} +.fa-eraser:before { + content: "\f12d"; +} +.fa-puzzle-piece:before { + content: "\f12e"; +} +.fa-microphone:before { + content: "\f130"; +} +.fa-microphone-slash:before { + content: "\f131"; +} +.fa-shield:before { + content: "\f132"; +} +.fa-calendar-o:before { + content: "\f133"; +} 
+.fa-fire-extinguisher:before { + content: "\f134"; +} +.fa-rocket:before { + content: "\f135"; +} +.fa-maxcdn:before { + content: "\f136"; +} +.fa-chevron-circle-left:before { + content: "\f137"; +} +.fa-chevron-circle-right:before { + content: "\f138"; +} +.fa-chevron-circle-up:before { + content: "\f139"; +} +.fa-chevron-circle-down:before { + content: "\f13a"; +} +.fa-html5:before { + content: "\f13b"; +} +.fa-css3:before { + content: "\f13c"; +} +.fa-anchor:before { + content: "\f13d"; +} +.fa-unlock-alt:before { + content: "\f13e"; +} +.fa-bullseye:before { + content: "\f140"; +} +.fa-ellipsis-h:before { + content: "\f141"; +} +.fa-ellipsis-v:before { + content: "\f142"; +} +.fa-rss-square:before { + content: "\f143"; +} +.fa-play-circle:before { + content: "\f144"; +} +.fa-ticket:before { + content: "\f145"; +} +.fa-minus-square:before { + content: "\f146"; +} +.fa-minus-square-o:before { + content: "\f147"; +} +.fa-level-up:before { + content: "\f148"; +} +.fa-level-down:before { + content: "\f149"; +} +.fa-check-square:before { + content: "\f14a"; +} +.fa-pencil-square:before { + content: "\f14b"; +} +.fa-external-link-square:before { + content: "\f14c"; +} +.fa-share-square:before { + content: "\f14d"; +} +.fa-compass:before { + content: "\f14e"; +} +.fa-toggle-down:before, +.fa-caret-square-o-down:before { + content: "\f150"; +} +.fa-toggle-up:before, +.fa-caret-square-o-up:before { + content: "\f151"; +} +.fa-toggle-right:before, +.fa-caret-square-o-right:before { + content: "\f152"; +} +.fa-euro:before, +.fa-eur:before { + content: "\f153"; +} +.fa-gbp:before { + content: "\f154"; +} +.fa-dollar:before, +.fa-usd:before { + content: "\f155"; +} +.fa-rupee:before, +.fa-inr:before { + content: "\f156"; +} +.fa-cny:before, +.fa-rmb:before, +.fa-yen:before, +.fa-jpy:before { + content: "\f157"; +} +.fa-ruble:before, +.fa-rouble:before, +.fa-rub:before { + content: "\f158"; +} +.fa-won:before, +.fa-krw:before { + content: "\f159"; +} +.fa-bitcoin:before, +.fa-btc:before { + content: "\f15a"; +} +.fa-file:before { + content: "\f15b"; +} +.fa-file-text:before { + content: "\f15c"; +} +.fa-sort-alpha-asc:before { + content: "\f15d"; +} +.fa-sort-alpha-desc:before { + content: "\f15e"; +} +.fa-sort-amount-asc:before { + content: "\f160"; +} +.fa-sort-amount-desc:before { + content: "\f161"; +} +.fa-sort-numeric-asc:before { + content: "\f162"; +} +.fa-sort-numeric-desc:before { + content: "\f163"; +} +.fa-thumbs-up:before { + content: "\f164"; +} +.fa-thumbs-down:before { + content: "\f165"; +} +.fa-youtube-square:before { + content: "\f166"; +} +.fa-youtube:before { + content: "\f167"; +} +.fa-xing:before { + content: "\f168"; +} +.fa-xing-square:before { + content: "\f169"; +} +.fa-youtube-play:before { + content: "\f16a"; +} +.fa-dropbox:before { + content: "\f16b"; +} +.fa-stack-overflow:before { + content: "\f16c"; +} +.fa-instagram:before { + content: "\f16d"; +} +.fa-flickr:before { + content: "\f16e"; +} +.fa-adn:before { + content: "\f170"; +} +.fa-bitbucket:before { + content: "\f171"; +} +.fa-bitbucket-square:before { + content: "\f172"; +} +.fa-tumblr:before { + content: "\f173"; +} +.fa-tumblr-square:before { + content: "\f174"; +} +.fa-long-arrow-down:before { + content: "\f175"; +} +.fa-long-arrow-up:before { + content: "\f176"; +} +.fa-long-arrow-left:before { + content: "\f177"; +} +.fa-long-arrow-right:before { + content: "\f178"; +} +.fa-apple:before { + content: "\f179"; +} +.fa-windows:before { + content: "\f17a"; +} +.fa-android:before { + content: "\f17b"; +} 
+.fa-linux:before { + content: "\f17c"; +} +.fa-dribbble:before { + content: "\f17d"; +} +.fa-skype:before { + content: "\f17e"; +} +.fa-foursquare:before { + content: "\f180"; +} +.fa-trello:before { + content: "\f181"; +} +.fa-female:before { + content: "\f182"; +} +.fa-male:before { + content: "\f183"; +} +.fa-gittip:before { + content: "\f184"; +} +.fa-sun-o:before { + content: "\f185"; +} +.fa-moon-o:before { + content: "\f186"; +} +.fa-archive:before { + content: "\f187"; +} +.fa-bug:before { + content: "\f188"; +} +.fa-vk:before { + content: "\f189"; +} +.fa-weibo:before { + content: "\f18a"; +} +.fa-renren:before { + content: "\f18b"; +} +.fa-pagelines:before { + content: "\f18c"; +} +.fa-stack-exchange:before { + content: "\f18d"; +} +.fa-arrow-circle-o-right:before { + content: "\f18e"; +} +.fa-arrow-circle-o-left:before { + content: "\f190"; +} +.fa-toggle-left:before, +.fa-caret-square-o-left:before { + content: "\f191"; +} +.fa-dot-circle-o:before { + content: "\f192"; +} +.fa-wheelchair:before { + content: "\f193"; +} +.fa-vimeo-square:before { + content: "\f194"; +} +.fa-turkish-lira:before, +.fa-try:before { + content: "\f195"; +} +.fa-plus-square-o:before { + content: "\f196"; +} diff --git a/backend/tests/integration/tests/pruning/website/css/style.css b/backend/tests/integration/tests/pruning/website/css/style.css new file mode 100644 index 00000000000..970a9e89e90 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/css/style.css @@ -0,0 +1,1779 @@ +/* +Author URI: http://webthemez.com/ +Note: +Licence under Creative Commons Attribution 3.0 +Do not remove the back-link in this web template +-------------------------------------------------------*/ + +@import url("http://fonts.googleapis.com/css?family=Noto+Serif:400,400italic,700|Open+Sans:400,600,700"); +@import url("font-awesome.css"); +@import url("animate.css"); + +body { + font-family: "Open Sans", Arial, sans-serif; + font-size: 14px; + font-weight: 300; + line-height: 1.6em; + color: #656565; +} + +a:active { + outline: 0; +} + +.clear { + clear: both; +} + +h1, +h2, +h3, +h4, +h5, +h6 { + font-family: "Open Sans", Arial, sans-serif; + font-weight: 700; + line-height: 1.1em; + color: #333; + margin-bottom: 20px; +} + +.container { + padding: 0 20px 0 20px; + position: relative; +} + +#wrapper { + width: 100%; + margin: 0; + padding: 0; +} + +.row, +.row-fluid { + margin-bottom: 30px; +} + +.row .row, +.row-fluid .row-fluid { + margin-bottom: 30px; +} + +.row.nomargin, +.row-fluid.nomargin { + margin-bottom: 0; +} + +img.img-polaroid { + margin: 0 0 20px 0; +} +.img-box { + max-width: 100%; +} +/* Header +==================================== */ + +header .navbar { + margin-bottom: 0; +} + +.navbar-default { + border: none; +} + +.navbar-brand { + color: #222; + text-transform: uppercase; + font-size: 24px; + font-weight: 700; + line-height: 1em; + letter-spacing: -1px; + margin-top: 13px; + padding: 0 0 0 15px; +} +.navbar-default .navbar-brand { + color: #61b331; +} + +header .navbar-collapse ul.navbar-nav { + float: right; + margin-right: 0; +} + +header .navbar-default { + background-color: #ffffff; +} + +header .nav li a:hover, +header .nav li a:focus, +header .nav li.active a, +header .nav li.active a:hover, +header .nav li a.dropdown-toggle:hover, +header .nav li a.dropdown-toggle:focus, +header .nav li.active ul.dropdown-menu li a:hover, +header .nav li.active ul.dropdown-menu li.active a { + -webkit-transition: all 0.3s ease; + -moz-transition: all 0.3s ease; + -ms-transition: all 0.3s ease; 
+ -o-transition: all 0.3s ease; + transition: all 0.3s ease; +} + +header .navbar-default .navbar-nav > .open > a, +header .navbar-default .navbar-nav > .open > a:hover, +header .navbar-default .navbar-nav > .open > a:focus { + -webkit-transition: all 0.3s ease; + -moz-transition: all 0.3s ease; + -ms-transition: all 0.3s ease; + -o-transition: all 0.3s ease; + transition: all 0.3s ease; +} + +header .navbar { + min-height: 70px; + padding: 18px 0; +} + +header .navbar-nav > li { + padding-bottom: 12px; + padding-top: 12px; +} + +header .navbar-nav > li > a { + padding-bottom: 6px; + padding-top: 5px; + margin-left: 2px; + line-height: 30px; + font-weight: 700; + -webkit-transition: all 0.3s ease; + -moz-transition: all 0.3s ease; + -ms-transition: all 0.3s ease; + -o-transition: all 0.3s ease; + transition: all 0.3s ease; +} + +.dropdown-menu li a:hover { + color: #fff !important; +} + +header .nav .caret { + border-bottom-color: #f5f5f5; + border-top-color: #f5f5f5; +} +.navbar-default .navbar-nav > .active > a, +.navbar-default .navbar-nav > .active > a:hover, +.navbar-default .navbar-nav > .active > a:focus { + background-color: #fff; +} +.navbar-default .navbar-nav > .open > a, +.navbar-default .navbar-nav > .open > a:hover, +.navbar-default .navbar-nav > .open > a:focus { + background-color: #fff; +} + +.dropdown-menu { + box-shadow: none; + border-radius: 0; + border: none; +} + +.dropdown-menu li:last-child { + padding-bottom: 0 !important; + margin-bottom: 0; +} + +header .nav li .dropdown-menu { + padding: 0; +} + +header .nav li .dropdown-menu li a { + line-height: 28px; + padding: 3px 12px; +} +.item-thumbs img { + margin-bottom: 15px; +} +.flex-control-paging li a.flex-active { + background: #000; + background: rgb(255, 255, 255); + cursor: default; +} +.flex-control-paging li a { + width: 30px; + height: 11px; + display: block; + background: #666; + background: rgba(0, 0, 0, 0.5); + cursor: pointer; + text-indent: -9999px; + -webkit-border-radius: 20px; + -moz-border-radius: 20px; + -o-border-radius: 20px; + border-radius: 20px; + box-shadow: inset 0 0 3px rgba(0, 0, 0, 0.3); +} +.panel-title > a { + color: inherit; + color: #fff; +} +.panel-group .panel-heading + .panel-collapse .panel-body { + border-top: 1px solid #ddd; + color: #fff; + background-color: #9c9c9c; +} +/* --- menu --- */ + +header .navigation { + float: right; +} + +header ul.nav li { + border: none; + margin: 0; +} + +header ul.nav li a { + font-size: 12px; + border: none; + font-weight: 700; + text-transform: uppercase; +} + +header ul.nav li ul li a { + font-size: 12px; + border: none; + font-weight: 300; + text-transform: uppercase; +} + +.navbar .nav > li > a { + color: #848484; + text-shadow: none; + border: 1px solid rgba(255, 255, 255, 0) !important; +} + +.navbar .nav a:hover { + background: none; + color: #14a085 !important; +} + +.navbar .nav > .active > a, +.navbar .nav > .active > a:hover { + background: none; + font-weight: 700; +} + +.navbar .nav > .active > a:active, +.navbar .nav > .active > a:focus { + background: none; + outline: 0; + font-weight: 700; +} + +.navbar .nav li .dropdown-menu { + z-index: 2000; +} + +header ul.nav li ul { + margin-top: 1px; +} +header ul.nav li ul li ul { + margin: 1px 0 0 1px; +} +.dropdown-menu .dropdown i { + position: absolute; + right: 0; + margin-top: 3px; + padding-left: 20px; +} + +.navbar .nav > li > .dropdown-menu:before { + display: inline-block; + border-right: none; + border-bottom: none; + border-left: none; + border-bottom-color: none; + 
content: none; +} +.navbar-default .navbar-nav > .active > a, +.navbar-default .navbar-nav > .active > a:hover, +.navbar-default .navbar-nav > .active > a:focus { + color: #14a085; +} + +ul.nav li.dropdown a { + z-index: 1000; + display: block; +} + +select.selectmenu { + display: none; +} +.pageTitle { + color: #fff; + margin: 30px 0 3px; + display: inline-block; +} + +#featured { + width: 100%; + background: #000; + position: relative; + margin: 0; + padding: 0; +} + +/* Sliders +==================================== */ +/* --- flexslider --- */ + +#featured .flexslider { + padding: 0; + background: #fff; + position: relative; + zoom: 1; +} +.flex-direction-nav .flex-prev { + left: 0px; +} +.flex-direction-nav .flex-next { + right: 0px; +} +.flex-caption { + zoom: 0; + color: #1c1d21; + margin: 0 auto; + padding: 1px; + position: absolute; + vertical-align: bottom; + text-align: center; + background-color: rgba(255, 255, 255, 0.26); + bottom: 5%; + display: block; + left: 0; + right: 0; +} +.flex-caption h3 { + color: #fff; + letter-spacing: 1px; + margin-bottom: 8px; + text-transform: uppercase; +} +.flex-caption p { + margin: 0 0 15px; +} +.skill-home { + margin-bottom: 50px; +} +.c1 { + border: #ed5441 1px solid; + background: #ed5441; +} +.c2 { + border: #d867b2 1px solid; + background: #d867b2; +} +.c3 { + border: #61b331 1px solid; + background: #4bc567; +} +.c4 { + border: #609cec 1px solid; + background: #26aff0; +} +.skill-home .icons { + padding: 33px 0 0 0; + width: 100%; + height: 178px; + color: rgb(255, 255, 255); + font-size: 42px; + font-size: 76px; + text-align: center; + -ms-border-radius: 50%; + -moz-border-radius: 50%; + -webkit-border-radius: 50%; + border-radius: 0; + display: inline-table; +} +.skill-home h2 { + padding-top: 20px; + font-size: 36px; + font-weight: 700; +} +.testimonial-solid { + padding: 50px 0 60px 0; + margin: 0 0 0 0; + background: #efefef; + text-align: center; +} +.testi-icon-area { + text-align: center; + position: absolute; + top: -84px; + margin: 0 auto; + width: 100%; + color: #000; +} +.testi-icon-area .quote { + padding: 15px 0 0 0; + margin: 0 0 0 0; + background: #ffffff; + text-align: center; + color: #26aff0; + display: inline-table; + width: 70px; + height: 70px; + -ms-border-radius: 50%; + -moz-border-radius: 50%; + -webkit-border-radius: 50%; + border-radius: 0; + font-size: 42px; + border: 1px solid #26aff0; + display: none; +} + +.testi-icon-area .carousel-inner { + margin: 20px 0; +} +.carousel-indicators { + bottom: -30px; +} +.team-member { + text-align: center; + background-color: #f9f9f9; + padding-bottom: 15px; +} +.fancybox-title-inside-wrap { + padding: 3px 30px 6px; + background: #292929; +} + +.item_introtext { + background-color: rgba(254, 254, 255, 0.66); + margin: 0 auto; + display: inline-block; + padding: 25px; +} +.item_introtext span { + font-size: 20px; + display: block; + font-weight: bold; +} +.item_introtext strong { + font-size: 50px; + display: block; + padding: 14px 0 30px; +} +.item_introtext p { + font-size: 20px !important; + color: #1c1d21; + font-weight: bold; +} + +.form-control { + border-radius: 0; +} + +/* Testimonial +----------------------------------*/ +.testimonial-area { + padding: 0 0 0 0; + margin: 0; + background: url(../img/low-poly01.jpg) fixed center center; + background-size: cover; + -webkit-background-size: cover; + -moz-background-size: cover; + -ms-background-size: cover; + color: red; +} +.testimonial-solid p { + color: #1f1f1f; + font-size: 16px; + line-height: 30px; + 
font-style: italic; +} +section.callaction { + background: #fff; + padding: 50px 0 0 0; +} + +/* Content +==================================== */ + +#content { + position: relative; + background: #fff; + padding: 50px 0 0px 0; +} + +#content img { + max-width: 100%; + height: auto; +} + +.cta-text { + text-align: center; + margin-top: 10px; +} + +.big-cta .cta { + margin-top: 10px; +} + +.box { + width: 100%; +} +.box-gray { + background: #f8f8f8; + padding: 20px 20px 30px; +} +.box-gray h4, +.box-gray i { + margin-bottom: 20px; +} +.box-bottom { + padding: 20px 0; + text-align: center; +} +.box-bottom a { + color: #fff; + font-weight: 700; +} +.box-bottom a:hover { + color: #eee; + text-decoration: none; +} + +/* Bottom +==================================== */ + +#bottom { + background: #fcfcfc; + padding: 50px 0 0; +} +/* twitter */ +#twitter-wrapper { + text-align: center; + width: 70%; + margin: 0 auto; +} +#twitter em { + font-style: normal; + font-size: 13px; +} + +#twitter em.twitterTime a { + font-weight: 600; +} + +#twitter ul { + padding: 0; + list-style: none; +} +#twitter ul li { + font-size: 20px; + line-height: 1.6em; + font-weight: 300; + margin-bottom: 20px; + position: relative; + word-break: break-word; +} + +/* page headline +==================================== */ + +#inner-headline { + background: #14a085; + position: relative; + margin: 0; + padding: 0; + color: #fefefe; + /* margin: 15px; */ + border-top: 10px solid #11967c; +} + +#inner-headline .inner-heading h2 { + color: #fff; + margin: 20px 0 0 0; +} + +/* --- breadcrumbs --- */ +#inner-headline ul.breadcrumb { + margin: 30px 0 0; + float: left; +} + +#inner-headline ul.breadcrumb li { + margin-bottom: 0; + padding-bottom: 0; +} +#inner-headline ul.breadcrumb li { + font-size: 13px; + color: #fff; +} + +#inner-headline ul.breadcrumb li i { + color: #dedede; +} + +#inner-headline ul.breadcrumb li a { + color: #fff; +} + +ul.breadcrumb li a:hover { + text-decoration: none; +} + +/* Forms +============================= */ + +/* --- contact form ---- */ +form#contactform input[type="text"] { + width: 100%; + border: 1px solid #f5f5f5; + min-height: 40px; + padding-left: 20px; + font-size: 13px; + padding-right: 20px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +form#contactform textarea { + border: 1px solid #f5f5f5; + width: 100%; + padding-left: 20px; + padding-top: 10px; + font-size: 13px; + padding-right: 20px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +form#contactform .validation { + font-size: 11px; +} + +#sendmessage { + border: 1px solid #e6e6e6; + background: #f6f6f6; + display: none; + text-align: center; + padding: 15px 12px 15px 65px; + margin: 10px 0; + font-weight: 600; + margin-bottom: 30px; +} + +#sendmessage.show, +.show { + display: block; +} + +form#commentform input[type="text"] { + width: 100%; + min-height: 40px; + padding-left: 20px; + font-size: 13px; + padding-right: 20px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + -webkit-border-radius: 2px 2px 2px 2px; + -moz-border-radius: 2px 2px 2px 2px; + border-radius: 2px 2px 2px 2px; +} + +form#commentform textarea { + width: 100%; + padding-left: 20px; + padding-top: 10px; + font-size: 13px; + padding-right: 20px; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + -webkit-border-radius: 2px 2px 2px 2px; + -moz-border-radius: 2px 2px 2px 2px; + 
border-radius: 2px 2px 2px 2px; +} + +/* --- search form --- */ +.search { + float: right; + margin: 35px 0 0; + padding-bottom: 0; +} + +#inner-headline form.input-append { + margin: 0; + padding: 0; +} + +/* Portfolio +================================ */ + +.work-nav #filters { + margin: 0; + padding: 0; + list-style: none; +} + +.work-nav #filters li { + margin: 0 10px 30px 0; + padding: 0; + float: left; +} + +.work-nav #filters li a { + color: #7f8289; + font-size: 16px; + display: block; +} + +.work-nav #filters li a:hover { +} + +.work-nav #filters li a.selected { + color: #de5e60; +} + +#thumbs { + margin: 0; + padding: 0; +} + +#thumbs li { + list-style-type: none; +} + +.item-thumbs { + position: relative; + overflow: hidden; + margin-bottom: 30px; + cursor: pointer; +} + +.item-thumbs a + img { + width: 100%; +} + +.item-thumbs .hover-wrap { + position: absolute; + display: block; + width: 100%; + height: 100%; + + opacity: 0; + filter: alpha(opacity=0); + + -webkit-transition: all 450ms ease-out 0s; + -moz-transition: all 450ms ease-out 0s; + -o-transition: all 450ms ease-out 0s; + transition: all 450ms ease-out 0s; + + -webkit-transform: rotateY(180deg) scale(0.5, 0.5); + -moz-transform: rotateY(180deg) scale(0.5, 0.5); + -ms-transform: rotateY(180deg) scale(0.5, 0.5); + -o-transform: rotateY(180deg) scale(0.5, 0.5); + transform: rotateY(180deg) scale(0.5, 0.5); +} + +.item-thumbs:hover .hover-wrap, +.item-thumbs.active .hover-wrap { + opacity: 1; + filter: alpha(opacity=100); + + -webkit-transform: rotateY(0deg) scale(1, 1); + -moz-transform: rotateY(0deg) scale(1, 1); + -ms-transform: rotateY(0deg) scale(1, 1); + -o-transform: rotateY(0deg) scale(1, 1); + transform: rotateY(0deg) scale(1, 1); +} + +.item-thumbs .hover-wrap .overlay-img { + position: absolute; + width: 90%; + height: 91%; + opacity: 0.5; + filter: alpha(opacity=80); + background: #14a085; +} + +.item-thumbs .hover-wrap .overlay-img-thumb { + position: absolute; + border-radius: 60px; + top: 50%; + left: 45%; + margin: -16px 0 0 -16px; + color: #fff; + font-size: 32px; + line-height: 1em; + opacity: 1; + filter: alpha(opacity=100); +} + +ul.portfolio-categ { + margin: 10px 0 30px 0; + padding: 0; + float: left; + list-style: none; +} + +ul.portfolio-categ li { + margin: 0; + float: left; + list-style: none; + font-size: 13px; + font-weight: 600; + border: 1px solid #d5d5d5; + margin-right: 15px; +} + +ul.portfolio-categ li a { + display: block; + padding: 8px 20px; + color: #14a085; +} +ul.portfolio-categ li.active { + border: 1px solid #d7d8d6; + + background-color: #eaeaea; +} +ul.portfolio-categ li.active a:hover, +ul.portfolio-categ li a:hover, +ul.portfolio-categ li a:focus, +ul.portfolio-categ li a:active { + text-decoration: none; + outline: 0; +} +#accordion-alt3 .panel-heading h4 { + font-size: 13px; + line-height: 28px; + color: #6b6b6b; +} +.panel .panel-heading h4 { + font-weight: 400; +} +.panel-title { + margin-top: 0; + margin-bottom: 0; + font-size: 15px; + color: inherit; +} +.panel-group .panel { + margin-bottom: 0; + border-radius: 2px; +} +.panel { + margin-bottom: 18px; + background-color: #b9b9b9; + border: 1px solid transparent; + border-radius: 2px; + -webkit-box-shadow: 0 1px 1px rgba(0, 0, 0, 0.05); + box-shadow: 0 1px 1px rgba(0, 0, 0, 0.05); +} +#accordion-alt3 .panel-heading h4 a i { + font-size: 13px; + line-height: 18px; + width: 18px; + height: 18px; + margin-right: 5px; + color: #fff; + text-align: center; + border-radius: 50%; + margin-left: 6px; +} +.progress.pb-sm { + 
height: 6px !important; +} +.progress { + box-shadow: inset 0 0 2px rgba(0, 0, 0, 0.1); +} +.progress { + overflow: hidden; + height: 18px; + margin-bottom: 18px; + background-color: #f5f5f5; + border-radius: 2px; + -webkit-box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); + box-shadow: inset 0 1px 2px rgba(0, 0, 0, 0.1); +} +.progress .progress-bar.progress-bar-red { + background: #ed5441; +} +.progress .progress-bar.progress-bar-green { + background: #51d466; +} +.progress .progress-bar.progress-bar-lblue { + background: #32c8de; +} +/* --- portfolio detail --- */ +.top-wrapper { + margin-bottom: 20px; +} +.info-blocks { + margin-bottom: 15px; +} +.info-blocks i.icon-info-blocks { + float: left; + color: #318fcf; + font-size: 30px; + min-width: 50px; + margin-top: 6px; + text-align: center; + background-color: #efefef; + padding: 15px; +} +.info-blocks .info-blocks-in { + padding: 0 10px; + overflow: hidden; +} +.info-blocks .info-blocks-in h3 { + color: #555; + font-size: 20px; + line-height: 28px; + margin: 0px; +} +.info-blocks .info-blocks-in p { + font-size: 12px; +} + +blockquote { + font-size: 16px; + font-weight: 400; + font-family: "Noto Serif", serif; + font-style: italic; + padding-left: 0; + color: #a2a2a2; + line-height: 1.6em; + border: none; +} + +blockquote cite { + display: block; + font-size: 12px; + color: #666; + margin-top: 10px; +} +blockquote cite:before { + content: "\2014 \0020"; +} +blockquote cite a, +blockquote cite a:visited, +blockquote cite a:visited { + color: #555; +} + +/* --- pullquotes --- */ + +.pullquote-left { + display: block; + color: #a2a2a2; + font-family: "Noto Serif", serif; + font-size: 14px; + line-height: 1.6em; + padding-left: 20px; +} + +.pullquote-right { + display: block; + color: #a2a2a2; + font-family: "Noto Serif", serif; + font-size: 14px; + line-height: 1.6em; + padding-right: 20px; +} + +/* --- button --- */ +.btn { + text-align: center; + background: #318cca; + color: #fff; + border-radius: 0; + padding: 10px 30px; +} +.btn-theme { + color: #fff; +} +.btn-theme:hover { + color: #eee; +} + +/* --- list style --- */ + +ul.general { + list-style: none; + margin-left: 0; +} + +ul.link-list { + margin: 0; + padding: 0; + list-style: none; +} + +ul.link-list li { + margin: 0; + padding: 2px 0 2px 0; + list-style: none; +} +footer { + background: #14a085; +} +footer ul.link-list li a { + color: #ffffff; +} +footer ul.link-list li a:hover { + color: #e2e2e2; +} +/* --- Heading style --- */ + +h4.heading { + font-weight: 700; +} + +.heading { + margin-bottom: 30px; +} + +.heading { + position: relative; +} + +.widgetheading { + width: 100%; + + padding: 0; +} + +#bottom .widgetheading { + position: relative; + border-bottom: #e6e6e6 1px solid; + padding-bottom: 9px; +} + +aside .widgetheading { + position: relative; + border-bottom: #e9e9e9 1px solid; + padding-bottom: 9px; +} + +footer .widgetheading { + position: relative; +} + +footer .widget .social-network { + position: relative; +} + +#bottom .widget .widgetheading span, +aside .widget .widgetheading span, +footer .widget .widgetheading span { + position: absolute; + width: 60px; + height: 1px; + bottom: -1px; + right: 0; +} +.box-area { + border: 1px solid #f3f3f3; + padding: 0 15px 12px; + padding-top: 41px; + margin-top: -42px; + text-align: left; + background-color: #f9f9f9; + position: relative; +} +/* --- Map --- */ +.map { + position: relative; + margin-top: -50px; + margin-bottom: 40px; +} + +.map iframe { + width: 100%; + height: 450px; + border: none; +} + +.map-grid iframe 
{ + width: 100%; + height: 350px; + border: none; + margin: 0 0 -5px 0; + padding: 0; +} + +ul.team-detail { + margin: -10px 0 0 0; + padding: 0; + list-style: none; +} + +ul.team-detail li { + border-bottom: 1px dotted #e9e9e9; + margin: 0 0 15px 0; + padding: 0 0 15px 0; + list-style: none; +} + +ul.team-detail li label { + font-size: 13px; +} + +ul.team-detail li h4, +ul.team-detail li label { + margin-bottom: 0; +} + +ul.team-detail li ul.social-network { + border: none; + margin: 0; + padding: 0; +} + +ul.team-detail li ul.social-network li { + border: none; + margin: 0; +} +ul.team-detail li ul.social-network li i { + margin: 0; +} + +.pricing-title { + background: #fff; + text-align: center; + padding: 10px 0 10px 0; +} + +.pricing-title h3 { + font-weight: 600; + margin-bottom: 0; +} + +.pricing-offer { + background: #fcfcfc; + text-align: center; + padding: 40px 0 40px 0; + font-size: 18px; + border-top: 1px solid #e6e6e6; + border-bottom: 1px solid #e6e6e6; +} + +.pricing-box.activeItem .pricing-offer { + color: #fff; +} + +.pricing-offer strong { + font-size: 78px; + line-height: 89px; +} + +.pricing-offer sup { + font-size: 28px; +} + +.pricing-container { + background: #fff; + text-align: center; + font-size: 14px; +} + +.pricing-container strong { + color: #353535; +} + +.pricing-container ul { + list-style: none; + padding: 0; + margin: 0; +} + +.pricing-container ul li { + border-bottom: 1px solid #f5f5f5; + list-style: none; + padding: 15px 0 15px 0; + margin: 0 0 0 0; + color: #222; +} + +.pricing-action { + margin: 0; + background: #fcfcfc; + text-align: center; + padding: 20px 0 30px 0; +} + +.pricing-wrapp { + margin: 0 auto; + width: 100%; + background: #fd0000; +} +.pricing-box-item { + border: 1px solid #f5f5f5; + + background: #f9f9f9; + position: relative; + margin: 0 0 20px 0; + padding: 0; + -webkit-box-shadow: 0 2px 0 rgba(0, 0, 0, 0.03); + -moz-box-shadow: 0 2px 0 rgba(0, 0, 0, 0.03); + box-shadow: 0 2px 0 rgba(0, 0, 0, 0.03); + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; +} + +.pricing-box-item .pricing-heading { + text-align: center; + padding: 0px 0 0px 0; + display: block; +} +.pricing-box-item.activeItem .pricing-heading { + text-align: center; + padding: 0px 0 1px 0; + border-bottom: none; + display: block; + color: #fff; +} +.pricing-box-item.activeItem .pricing-heading h3 { +} + +.pricing-box-item .pricing-heading h3 strong { + font-size: 20px; + font-weight: 700; + letter-spacing: -1px; +} +.pricing-box-item .pricing-heading h3 { + font-size: 35px; + font-weight: 300; + letter-spacing: -1px; +} + +.pricing-box-item .pricing-terms { + text-align: center; + display: block; + overflow: hidden; + padding: 11px 0 5px; +} + +.pricing-box-item .pricing-terms h6 { + font-style: italic; + margin-top: 10px; + color: #14a085; + font-size: 22px; + font-family: "Noto Serif", serif; +} + +.pricing-box-item .icon .price-circled { + margin: 10px 10px 10px 0; + display: inline-block !important; + text-align: center !important; + color: #fff; + width: 68px; + height: 68px; + padding: 12px; + font-size: 16px; + font-weight: 700; + line-height: 68px; + text-shadow: none; + cursor: pointer; + background-color: #888; + border-radius: 64px; + -moz-border-radius: 64px; + -webkit-border-radius: 64px; +} + +.pricing-box-item .pricing-action { + margin: 0; + text-align: center; + padding: 30px 0 30px 0; +} + +/* ===== Widgets ===== */ + +/* --- flickr --- */ +.widget .flickr_badge { + width: 100%; +} +.widget .flickr_badge img { + 
margin: 0 9px 20px 0; +} + +footer .widget .flickr_badge { + width: 100%; +} +footer .widget .flickr_badge img { + margin: 0 9px 20px 0; +} + +.flickr_badge img { + width: 50px; + height: 50px; + float: left; + margin: 0 9px 20px 0; +} + +/* --- Recent post widget --- */ + +.recent-post { + margin: 20px 0 0 0; + padding: 0; + line-height: 18px; +} + +.recent-post h5 a:hover { + text-decoration: none; +} + +.recent-post .text h5 a { + color: #353535; +} + +footer { + padding: 50px 0 0 0; + color: #f8f8f8; +} + +footer a { + color: #fff; +} + +footer a:hover { + color: #eee; +} + +footer h1, +footer h2, +footer h3, +footer h4, +footer h5, +footer h6 { + color: #fff; +} + +footer address { + line-height: 1.6em; + color: #ffffff; +} + +footer h5 a:hover, +footer a:hover { + text-decoration: none; +} + +ul.social-network { + list-style: none; + margin: 0; +} + +ul.social-network li { + display: inline; + margin: 0 5px; +} + +#sub-footer { + text-shadow: none; + color: #f5f5f5; + padding: 0; + padding-top: 30px; + margin: 20px 0 0 0; + background: #14a085; +} + +#sub-footer p { + margin: 0; + padding: 0; +} + +#sub-footer span { + color: #f5f5f5; +} + +.copyright { + text-align: left; + font-size: 12px; +} + +#sub-footer ul.social-network { + float: right; +} + +/* scroll to top */ +.scrollup { + position: fixed; + width: 32px; + height: 32px; + bottom: 0px; + right: 20px; + background: #222; +} + +a.scrollup { + outline: 0; + text-align: center; +} + +a.scrollup:hover, +a.scrollup:active, +a.scrollup:focus { + opacity: 1; + text-decoration: none; +} +a.scrollup i { + margin-top: 10px; + color: #fff; +} +a.scrollup i:hover { + text-decoration: none; +} + +.absolute { + position: absolute; +} + +.relative { + position: relative; +} + +.aligncenter { + text-align: center; +} + +.aligncenter span { + margin-left: 0; +} + +.floatright { + float: right; +} + +.floatleft { + float: left; +} + +.floatnone { + float: none; +} + +.aligncenter { + text-align: center; +} + +img.pull-left, +.align-left { + float: left; + margin: 0 15px 15px 0; +} + +.widget img.pull-left { + float: left; + margin: 0 15px 15px 0; +} + +img.pull-right, +.align-right { + float: right; + margin: 0 0 15px 15px; +} + +article img.pull-left, +article .align-left { + float: left; + margin: 5px 15px 15px 0; +} + +article img.pull-right, +article .align-right { + float: right; + margin: 5px 0 15px 15px; +} +============================= */ .clear-marginbot { + margin-bottom: 0; +} + +.marginbot10 { + margin-bottom: 10px; +} +.marginbot20 { + margin-bottom: 20px; +} +.marginbot30 { + margin-bottom: 30px; +} +.marginbot40 { + margin-bottom: 40px; +} + +.clear-margintop { + margin-top: 0; +} + +.margintop10 { + margin-top: 10px; +} + +.margintop20 { + margin-top: 20px; +} + +.margintop30 { + margin-top: 30px; +} + +.margintop40 { + margin-top: 40px; +} + +/* Media queries +============================= */ + +@media (min-width: 768px) and (max-width: 979px) { + a.detail { + background: none; + width: 100%; + } + + footer .widget form input#appendedInputButton { + display: block; + width: 91%; + -webkit-border-radius: 4px 4px 4px 4px; + -moz-border-radius: 4px 4px 4px 4px; + border-radius: 4px 4px 4px 4px; + } + + footer .widget form .input-append .btn { + display: block; + width: 100%; + padding-right: 0; + padding-left: 0; + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box; + margin-top: 10px; + } + + ul.related-folio li { + width: 156px; + margin: 0 20px 0 0; + } +} + +@media (max-width: 767px) { 
+ body { + padding-right: 0; + padding-left: 0; + } + .navbar-brand { + margin-top: 10px; + border-bottom: none; + } + .navbar-header { + margin-top: 20px; + border-bottom: none; + } + + .navbar-nav { + border-top: none; + float: none; + width: 100%; + } + .navbar .nav > .active > a, + .navbar .nav > .active > a:hover { + background: none; + font-weight: 700; + color: #26aff0; + } + header .navbar-nav > li { + padding-bottom: 0px; + padding-top: 2px; + } + header .nav li .dropdown-menu { + margin-top: 0; + } + + .dropdown-menu { + position: absolute; + top: 0; + left: 40px; + z-index: 1000; + display: none; + float: left; + min-width: 160px; + padding: 5px 0; + margin: 2px 0 0; + font-size: 13px; + list-style: none; + background-color: #fff; + background-clip: padding-box; + border: 1px solid #f5f5f5; + border: 1px solid rgba(0, 0, 0, 0.15); + border-radius: 0; + -webkit-box-shadow: 0 6px 12px rgba(0, 0, 0, 0.175); + box-shadow: 0 6px 12px rgba(0, 0, 0, 0.175); + } + + .navbar-collapse.collapse { + border: none; + overflow: hidden; + } + + .box { + border-bottom: 1px solid #e9e9e9; + padding-bottom: 20px; + } + + #featured .flexslider .slide-caption { + width: 90%; + padding: 2%; + position: absolute; + left: 0; + bottom: -40px; + } + + #inner-headline .breadcrumb { + float: left; + clear: both; + width: 100%; + } + + .breadcrumb > li { + font-size: 13px; + } + + ul.portfolio li article a i.icon-48 { + width: 20px; + height: 20px; + font-size: 16px; + line-height: 20px; + } + + .left-sidebar { + border-right: none; + padding: 0 0 0 0; + border-bottom: 1px dotted #e6e6e6; + padding-bottom: 10px; + margin-bottom: 40px; + } + + .right-sidebar { + margin-top: 30px; + border-left: none; + padding: 0 0 0 0; + } + + footer .col-lg-1, + footer .col-lg-2, + footer .col-lg-3, + footer .col-lg-4, + footer .col-lg-5, + footer .col-lg-6, + footer .col-lg-7, + footer .col-lg-8, + footer .col-lg-9, + footer .col-lg-10, + footer .col-lg-11, + footer .col-lg-12 { + margin-bottom: 20px; + } + + #sub-footer ul.social-network { + float: left; + } + + [class*="span"] { + margin-bottom: 20px; + } +} + +@media (max-width: 480px) { + .bottom-article a.pull-right { + float: left; + margin-top: 20px; + } + + .search { + float: left; + } + + .flexslider .flex-caption { + display: none; + } + + .cta-text { + margin: 0 auto; + text-align: center; + } + + ul.portfolio li article a i { + width: 20px; + height: 20px; + font-size: 14px; + } +} + +.box-area:before { + position: absolute; + width: 100%; + height: 100%; + z-index: 0; + background-color: red; + content: ""; + position: absolute; + top: 7px; + left: -1px; + width: 100%; + height: 23px; + background: #f9f9f9; + -moz-transform: skewY(-3deg); + -o-transform: skewY(-3deg); + -ms-transform: skewY(-3deg); + -webkit-transform: skewY(-3deg); + transform: skewY(11deg); + background-size: cover; +} +.box-area:after { + position: absolute; + width: 100%; + height: 100%; + z-index: 0; + background-color: red; + content: ""; + position: absolute; + top: 7px; + left: 1px; + width: 100%; + height: 22px; + background: #f9f9f9; + -moz-transform: skewY(-3deg); + -o-transform: skewY(-3deg); + -ms-transform: skewY(-3deg); + -webkit-transform: skewY(-3deg); + transform: skewY(-11deg); + background-size: cover; +} +.box-area h3 { + margin-top: -16px; + z-index: 12; + position: relative; +} +.courses { + padding: 50px 0; +} +.carousel-indicators li { + display: inline-block; + border: 1px solid #929292; +} +.textbox { + background-color: #efefef; + padding: 4px 25px; +} +.textbox h3 
{ + margin: 0; + padding: 22px 0 14px; + font-size: 18px; +} diff --git a/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.eot b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.eot new file mode 100644 index 00000000000..7ac16dbc9ea Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.eot differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.svg b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.svg new file mode 100644 index 00000000000..5c2071ee504 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.svg @@ -0,0 +1,1186 @@ [1,186 added lines of IcoMoon-generated SVG glyph markup; the XML was stripped during extraction and only the banner text "This is a custom SVG font generated by IcoMoon." survives] \ No newline at end of file diff --git a/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.ttf b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.ttf new file mode 100644 index 00000000000..0a2ac6fa70b Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.ttf differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.woff b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.woff new file mode 100644 index 00000000000..f9391cb4fa6 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/customicon/icons.woff differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.eot b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.eot new file mode 100644 index 00000000000..7c79c6a6bc9 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.eot differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.svg b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.svg new file mode 100644 index 00000000000..45fdf338301 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.svg @@ -0,0 +1,414 @@ [414 added lines of Font Awesome SVG glyph markup; the XML was stripped during extraction and nothing recoverable remains] \ No newline at end of file diff --git
a/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.ttf b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.ttf new file mode 100644 index 00000000000..e89738de5ea Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.ttf differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.woff b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.woff new file mode 100644 index 00000000000..8c1748aab7a Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/fontawesome-webfont.woff differ diff --git a/backend/tests/integration/tests/pruning/website/fonts/fontawesome.otf b/backend/tests/integration/tests/pruning/website/fonts/fontawesome.otf new file mode 100644 index 00000000000..8b0f54e47e1 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/fonts/fontawesome.otf differ diff --git a/backend/tests/integration/tests/pruning/website/img/avatar.png b/backend/tests/integration/tests/pruning/website/img/avatar.png new file mode 100644 index 00000000000..f11955333e0 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/avatar.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/bg_direction_nav.png b/backend/tests/integration/tests/pruning/website/img/bg_direction_nav.png new file mode 100644 index 00000000000..59b2e718c83 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/bg_direction_nav.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings-white.png b/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings-white.png new file mode 100644 index 00000000000..3bf6484a29d Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings-white.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings.png b/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings.png new file mode 100644 index 00000000000..a9969993201 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/glyphicons-halflings.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/logo.png b/backend/tests/integration/tests/pruning/website/img/logo.png new file mode 100644 index 00000000000..04fb2a41478 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/logo.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/nivo-bullets.png b/backend/tests/integration/tests/pruning/website/img/nivo-bullets.png new file mode 100644 index 00000000000..a84c9c0bdcc Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/nivo-bullets.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/section-image-1.png b/backend/tests/integration/tests/pruning/website/img/section-image-1.png new file mode 100644 index 00000000000..9c0fab01c00 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/section-image-1.png differ diff --git a/backend/tests/integration/tests/pruning/website/img/service1.jpg b/backend/tests/integration/tests/pruning/website/img/service1.jpg new file mode 100644 index 00000000000..ed8c9c35579 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/service1.jpg differ diff --git 
a/backend/tests/integration/tests/pruning/website/img/service2.jpg b/backend/tests/integration/tests/pruning/website/img/service2.jpg new file mode 100644 index 00000000000..1e42801fab2 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/service2.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/service3.jpg b/backend/tests/integration/tests/pruning/website/img/service3.jpg new file mode 100644 index 00000000000..0332b3e3dcd Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/service3.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/slides/1.jpg b/backend/tests/integration/tests/pruning/website/img/slides/1.jpg new file mode 100644 index 00000000000..872131c2dc6 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/slides/1.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/slides/2.jpg b/backend/tests/integration/tests/pruning/website/img/slides/2.jpg new file mode 100644 index 00000000000..0e7fc381d43 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/slides/2.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/slides/3.jpg b/backend/tests/integration/tests/pruning/website/img/slides/3.jpg new file mode 100644 index 00000000000..67eb62b93fe Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/slides/3.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/team1.jpg b/backend/tests/integration/tests/pruning/website/img/team1.jpg new file mode 100644 index 00000000000..0e0c282cad0 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/team1.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/team2.jpg b/backend/tests/integration/tests/pruning/website/img/team2.jpg new file mode 100644 index 00000000000..242d6c79d94 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/team2.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/team3.jpg b/backend/tests/integration/tests/pruning/website/img/team3.jpg new file mode 100644 index 00000000000..fcbb2908d4b Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/team3.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/team4.jpg b/backend/tests/integration/tests/pruning/website/img/team4.jpg new file mode 100644 index 00000000000..88039d54e8c Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/team4.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/1.jpg b/backend/tests/integration/tests/pruning/website/img/works/1.jpg new file mode 100644 index 00000000000..c6fce1776df Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/1.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/2.jpg b/backend/tests/integration/tests/pruning/website/img/works/2.jpg new file mode 100644 index 00000000000..4b6e0d1a713 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/2.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/3.jpg b/backend/tests/integration/tests/pruning/website/img/works/3.jpg new file mode 100644 index 00000000000..fd8b3b6729e Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/3.jpg differ diff --git 
a/backend/tests/integration/tests/pruning/website/img/works/4.jpg b/backend/tests/integration/tests/pruning/website/img/works/4.jpg new file mode 100644 index 00000000000..a55d6eafbeb Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/4.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/5.jpg b/backend/tests/integration/tests/pruning/website/img/works/5.jpg new file mode 100644 index 00000000000..e5907a77938 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/5.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/6.jpg b/backend/tests/integration/tests/pruning/website/img/works/6.jpg new file mode 100644 index 00000000000..9758bd59378 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/6.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/7.jpg b/backend/tests/integration/tests/pruning/website/img/works/7.jpg new file mode 100644 index 00000000000..78c73c643c2 Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/7.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/img/works/8.jpg b/backend/tests/integration/tests/pruning/website/img/works/8.jpg new file mode 100644 index 00000000000..4570ff38ebb Binary files /dev/null and b/backend/tests/integration/tests/pruning/website/img/works/8.jpg differ diff --git a/backend/tests/integration/tests/pruning/website/index.html b/backend/tests/integration/tests/pruning/website/index.html new file mode 100644 index 00000000000..39e5fa6ff02 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/index.html @@ -0,0 +1,309 @@ + + + + +Above Multi-purpose Free Bootstrap Responsive Template + + + + + + + + + + + + + + + + +
[309 added lines of index.html whose HTML markup was stripped during extraction and cannot be reconstructed here. The recoverable text content is: an "Our Featured Courses" heading with a lorem-ipsum intro paragraph; four service boxes titled "Web Development", "UI Design", "Interaction", and "User Experiance", each with a one-line lorem-ipsum blurb; a "Courses We Offer" heading with another lorem-ipsum intro; and six course cards, each titled "Heading Course" with a short lorem-ipsum description. The remaining trailing markup of the file was stripped as well.]
+ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/backend/tests/integration/tests/pruning/website/js/animate.js b/backend/tests/integration/tests/pruning/website/js/animate.js new file mode 100644 index 00000000000..98875e1b657 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/js/animate.js @@ -0,0 +1,477 @@ +jQuery(document).ready(function ($) { + //animate effect + $(".e_flash").hover( + function () { + $(this).addClass("animated flash"); + }, + function () { + $(this).removeClass("animated flash"); + }, + ); + $(".e_bounce").hover( + function () { + $(this).addClass("animated bounce"); + }, + function () { + $(this).removeClass("animated bounce"); + }, + ); + + $(".e_shake").hover( + function () { + $(this).addClass("animated shake"); + }, + function () { + $(this).removeClass("animated shake"); + }, + ); + $(".e_tada").hover( + function () { + $(this).addClass("animated tada"); + }, + function () { + $(this).removeClass("animated tada"); + }, + ); + $(".e_swing").hover( + function () { + $(this).addClass("animated swing"); + }, + function () { + $(this).removeClass("animated swing"); + }, + ); + $(".e_wobble").hover( + function () { + $(this).addClass("animated wobble"); + }, + function () { + $(this).removeClass("animated wobble"); + }, + ); + $(".e_wiggle").hover( + function () { + $(this).addClass("animated wiggle"); + }, + function () { + $(this).removeClass("animated wiggle"); + }, + ); + $(".e_pulse").hover( + function () { + $(this).addClass("animated pulse"); + }, + function () { + $(this).removeClass("animated pulse"); + }, + ); + + $(".e_flip").hover( + function () { + $(this).addClass("animated flip"); + }, + function () { + $(this).removeClass("animated flip"); + }, + ); + $(".e_flipInX").hover( + function () { + $(this).addClass("animated flipInX"); + }, + function () { + $(this).removeClass("animated flipInX"); + }, + ); + $(".e_flipOutX").hover( + function () { + $(this).addClass("animated flipOutX"); + }, + function () { + $(this).removeClass("animated flipOutX"); + }, + ); + $(".e_flipInY").hover( + function () { + $(this).addClass("animated flipInY"); + }, + function () { + $(this).removeClass("animated flipInY"); + }, + ); + $(".e_flipOutY").hover( + function () { + $(this).addClass("animated flipOutY"); + }, + function () { + $(this).removeClass("animated flipOutY"); + }, + ); + + //Fading entrances + $(".e_fadeIn").hover( + function () { + $(this).addClass("animated fadeIn"); + }, + function () { + $(this).removeClass("animated fadeIn"); + }, + ); + $(".e_fadeInUp").hover( + function () { + $(this).addClass("animated fadeInUp"); + }, + function () { + $(this).removeClass("animated fadeInUp"); + }, + ); + $(".e_fadeInDown").hover( + function () { + $(this).addClass("animated fadeInDown"); + }, + function () { + $(this).removeClass("animated fadeInDown"); + }, + ); + $(".e_fadeInLeft").hover( + function () { + $(this).addClass("animated fadeInLeft"); + }, + function () { + $(this).removeClass("animated fadeInLeft"); + }, + ); + $(".e_fadeInRight").hover( + function () { + $(this).addClass("animated fadeInRight"); + }, + function () { + $(this).removeClass("animated fadeInRight"); + }, + ); + $(".e_fadeInUpBig").hover( + function () { + $(this).addClass("animated fadeInUpBig"); + }, + function () { + $(this).removeClass("animated fadeInUpBig"); + }, + ); + $(".e_fadeInUpBig").hover( + function () { + $(this).addClass("animated fadeInUpBig"); + }, + function () { + $(this).removeClass("animated fadeInUpBig"); + }, + ); + 
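+  // Note: every binding in this file repeats the same hover-toggle pattern around an
+  // animate.css class. A data-driven equivalent (illustrative sketch only, not part of
+  // the original template) would be:
+  //   var effects = ["flash", "bounce", "fadeInUpBig", "fadeInDownBig" /* ... */];
+  //   $.each(effects, function (_, name) {
+  //     $(".e_" + name).hover(
+  //       function () { $(this).addClass("animated " + name); },
+  //       function () { $(this).removeClass("animated " + name); }
+  //     );
+  //   });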
$(".e_fadeInDownBig").hover( + function () { + $(this).addClass("animated fadeInDownBig"); + }, + function () { + $(this).removeClass("animated fadeInDownBig"); + }, + ); + $(".e_fadeInLeftBig").hover( + function () { + $(this).addClass("animated fadeInLeftBig"); + }, + function () { + $(this).removeClass("animated fadeInLeftBig"); + }, + ); + $(".e_fadeInRightBig").hover( + function () { + $(this).addClass("animated fadeInRightBig"); + }, + function () { + $(this).removeClass("animated fadeInRightBig"); + }, + ); + + //Fading exits + $(".e_fadeOut").hover( + function () { + $(this).addClass("animated fadeOut"); + }, + function () { + $(this).removeClass("animated fadeOut"); + }, + ); + $(".e_fadeOutUp").hover( + function () { + $(this).addClass("animated fadeOutUp"); + }, + function () { + $(this).removeClass("animated fadeOutUp"); + }, + ); + $(".e_fadeOutDown").hover( + function () { + $(this).addClass("animated fadeOutDown"); + }, + function () { + $(this).removeClass("animated fadeOutDown"); + }, + ); + $(".e_fadeOutLeft").hover( + function () { + $(this).addClass("animated fadeOutLeft"); + }, + function () { + $(this).removeClass("animated fadeOutLeft"); + }, + ); + $(".e_fadeOutRight").hover( + function () { + $(this).addClass("animated fadeOutRight"); + }, + function () { + $(this).removeClass("animated fadeOutRight"); + }, + ); + $(".e_fadeOutUpBig").hover( + function () { + $(this).addClass("animated fadeOutUpBig"); + }, + function () { + $(this).removeClass("animated fadeOutUpBig"); + }, + ); + $(".e_fadeOutDownBig").hover( + function () { + $(this).addClass("animated fadeOutDownBig"); + }, + function () { + $(this).removeClass("animated fadeOutDownBig"); + }, + ); + $(".e_fadeOutLeftBig").hover( + function () { + $(this).addClass("animated fadeOutLeftBig"); + }, + function () { + $(this).removeClass("animated fadeOutLeftBig"); + }, + ); + $(".e_fadeOutRightBig").hover( + function () { + $(this).addClass("animated fadeOutRightBig"); + }, + function () { + $(this).removeClass("animated fadeOutRightBig"); + }, + ); + + //Bouncing entrances + $(".e_bounceIn").hover( + function () { + $(this).addClass("animated bounceIn"); + }, + function () { + $(this).removeClass("animated bounceIn"); + }, + ); + $(".e_bounceInDown").hover( + function () { + $(this).addClass("animated bounceInDown"); + }, + function () { + $(this).removeClass("animated bounceInDown"); + }, + ); + $(".e_bounceInUp").hover( + function () { + $(this).addClass("animated bounceInUp"); + }, + function () { + $(this).removeClass("animated bounceInUp"); + }, + ); + $(".e_bounceInLeft").hover( + function () { + $(this).addClass("animated bounceInLeft"); + }, + function () { + $(this).removeClass("animated bounceInLeft"); + }, + ); + $(".e_bounceInRight").hover( + function () { + $(this).addClass("animated bounceInRight"); + }, + function () { + $(this).removeClass("animated bounceInRight"); + }, + ); + + //Bouncing exits + $(".e_bounceOut").hover( + function () { + $(this).addClass("animated bounceOut"); + }, + function () { + $(this).removeClass("animated bounceOut"); + }, + ); + $(".e_bounceOutDown").hover( + function () { + $(this).addClass("animated bounceOutDown"); + }, + function () { + $(this).removeClass("animated bounceOutDown"); + }, + ); + $(".e_bounceOutUp").hover( + function () { + $(this).addClass("animated bounceOutUp"); + }, + function () { + $(this).removeClass("animated bounceOutUp"); + }, + ); + $(".e_bounceOutLeft").hover( + function () { + $(this).addClass("animated bounceOutLeft"); + }, + function () 
{ + $(this).removeClass("animated bounceOutLeft"); + }, + ); + $(".e_bounceOutRight").hover( + function () { + $(this).addClass("animated bounceOutRight"); + }, + function () { + $(this).removeClass("animated bounceOutRight"); + }, + ); + + //Rotating entrances + $(".e_rotateIn").hover( + function () { + $(this).addClass("animated rotateIn"); + }, + function () { + $(this).removeClass("animated rotateIn"); + }, + ); + $(".e_rotateInDownLeft").hover( + function () { + $(this).addClass("animated rotateInDownLeft"); + }, + function () { + $(this).removeClass("animated rotateInDownLeft"); + }, + ); + $(".e_rotateInDownRight").hover( + function () { + $(this).addClass("animated rotateInDownRight"); + }, + function () { + $(this).removeClass("animated rotateInDownRight"); + }, + ); + $(".e_rotateInUpRight").hover( + function () { + $(this).addClass("animated rotateInUpRight"); + }, + function () { + $(this).removeClass("animated rotateInUpRight"); + }, + ); + $(".e_rotateInUpLeft").hover( + function () { + $(this).addClass("animated rotateInUpLeft"); + }, + function () { + $(this).removeClass("animated rotateInUpLeft"); + }, + ); + + //Rotating exits + $(".e_rotateOut").hover( + function () { + $(this).addClass("animated rotateOut"); + }, + function () { + $(this).removeClass("animated rotateOut"); + }, + ); + $(".e_rotateOutDownLeft").hover( + function () { + $(this).addClass("animated rotateOutDownLeft"); + }, + function () { + $(this).removeClass("animated rotateOutDownLeft"); + }, + ); + $(".e_rotateOutDownRight").hover( + function () { + $(this).addClass("animated rotateOutDownRight"); + }, + function () { + $(this).removeClass("animated rotateOutDownRight"); + }, + ); + $(".e_rotateOutUpLeft").hover( + function () { + $(this).addClass("animated rotateOutUpLeft"); + }, + function () { + $(this).removeClass("animated rotateOutUpLeft"); + }, + ); + $(".e_rotateOutUpRight").hover( + function () { + $(this).addClass("animated rotateOutUpRight"); + }, + function () { + $(this).removeClass("animated rotateOutUpRight"); + }, + ); + + //Lightspeed + $(".e_lightSpeedIn").hover( + function () { + $(this).addClass("animated lightSpeedIn"); + }, + function () { + $(this).removeClass("animated lightSpeedIn"); + }, + ); + $(".e_lightSpeedOut").hover( + function () { + $(this).addClass("animated lightSpeedOut"); + }, + function () { + $(this).removeClass("animated lightSpeedOut"); + }, + ); + + //specials + $(".e_hinge").hover( + function () { + $(this).addClass("animated hinge"); + }, + function () { + $(this).removeClass("animated hinge"); + }, + ); + $(".e_rollIn").hover( + function () { + $(this).addClass("animated rollIn"); + }, + function () { + $(this).removeClass("animated rollIn"); + }, + ); + $(".e_rollOut").hover( + function () { + $(this).addClass("animated rollOut"); + }, + function () { + $(this).removeClass("animated rollOut"); + }, + ); +}); diff --git a/backend/tests/integration/tests/pruning/website/js/bootstrap.min.js b/backend/tests/integration/tests/pruning/website/js/bootstrap.min.js new file mode 100644 index 00000000000..d6c0c9a8f99 --- /dev/null +++ b/backend/tests/integration/tests/pruning/website/js/bootstrap.min.js @@ -0,0 +1,1352 @@ +/*! + * Bootstrap v3.1.0 (http://getbootstrap.com) + * Copyright 2011-2014 Twitter, Inc. 
+ * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) + */ +if ("undefined" == typeof jQuery) throw new Error("Bootstrap requires jQuery"); ++(function (a) { + "use strict"; + function b() { + var a = document.createElement("bootstrap"), + b = { + WebkitTransition: "webkitTransitionEnd", + MozTransition: "transitionend", + OTransition: "oTransitionEnd otransitionend", + transition: "transitionend", + }; + for (var c in b) if (void 0 !== a.style[c]) return { end: b[c] }; + return !1; + } + (a.fn.emulateTransitionEnd = function (b) { + var c = !1, + d = this; + a(this).one(a.support.transition.end, function () { + c = !0; + }); + var e = function () { + c || a(d).trigger(a.support.transition.end); + }; + return setTimeout(e, b), this; + }), + a(function () { + a.support.transition = b(); + }); +})(jQuery), + +(function (a) { + "use strict"; + var b = '[data-dismiss="alert"]', + c = function (c) { + a(c).on("click", b, this.close); + }; + c.prototype.close = function (b) { + function c() { + f.trigger("closed.bs.alert").remove(); + } + var d = a(this), + e = d.attr("data-target"); + e || ((e = d.attr("href")), (e = e && e.replace(/.*(?=#[^\s]*$)/, ""))); + var f = a(e); + b && b.preventDefault(), + f.length || (f = d.hasClass("alert") ? d : d.parent()), + f.trigger((b = a.Event("close.bs.alert"))), + b.isDefaultPrevented() || + (f.removeClass("in"), + a.support.transition && f.hasClass("fade") + ? f.one(a.support.transition.end, c).emulateTransitionEnd(150) + : c()); + }; + var d = a.fn.alert; + (a.fn.alert = function (b) { + return this.each(function () { + var d = a(this), + e = d.data("bs.alert"); + e || d.data("bs.alert", (e = new c(this))), + "string" == typeof b && e[b].call(d); + }); + }), + (a.fn.alert.Constructor = c), + (a.fn.alert.noConflict = function () { + return (a.fn.alert = d), this; + }), + a(document).on("click.bs.alert.data-api", b, c.prototype.close); + })(jQuery), + +(function (a) { + "use strict"; + var b = function (c, d) { + (this.$element = a(c)), + (this.options = a.extend({}, b.DEFAULTS, d)), + (this.isLoading = !1); + }; + (b.DEFAULTS = { loadingText: "loading..." }), + (b.prototype.setState = function (b) { + var c = "disabled", + d = this.$element, + e = d.is("input") ? "val" : "html", + f = d.data(); + (b += "Text"), + f.resetText || d.data("resetText", d[e]()), + d[e](f[b] || this.options[b]), + setTimeout( + a.proxy(function () { + "loadingText" == b + ? ((this.isLoading = !0), d.addClass(c).attr(c, c)) + : this.isLoading && + ((this.isLoading = !1), d.removeClass(c).removeAttr(c)); + }, this), + 0, + ); + }), + (b.prototype.toggle = function () { + var a = !0, + b = this.$element.closest('[data-toggle="buttons"]'); + if (b.length) { + var c = this.$element.find("input"); + "radio" == c.prop("type") && + (c.prop("checked") && this.$element.hasClass("active") + ? (a = !1) + : b.find(".active").removeClass("active")), + a && + c + .prop("checked", !this.$element.hasClass("active")) + .trigger("change"); + } + a && this.$element.toggleClass("active"); + }); + var c = a.fn.button; + (a.fn.button = function (c) { + return this.each(function () { + var d = a(this), + e = d.data("bs.button"), + f = "object" == typeof c && c; + e || d.data("bs.button", (e = new b(this, f))), + "toggle" == c ? 
e.toggle() : c && e.setState(c); + }); + }), + (a.fn.button.Constructor = b), + (a.fn.button.noConflict = function () { + return (a.fn.button = c), this; + }), + a(document).on( + "click.bs.button.data-api", + "[data-toggle^=button]", + function (b) { + var c = a(b.target); + c.hasClass("btn") || (c = c.closest(".btn")), + c.button("toggle"), + b.preventDefault(); + }, + ); + })(jQuery), + +(function (a) { + "use strict"; + var b = function (b, c) { + (this.$element = a(b)), + (this.$indicators = this.$element.find(".carousel-indicators")), + (this.options = c), + (this.paused = + this.sliding = + this.interval = + this.$active = + this.$items = + null), + "hover" == this.options.pause && + this.$element + .on("mouseenter", a.proxy(this.pause, this)) + .on("mouseleave", a.proxy(this.cycle, this)); + }; + (b.DEFAULTS = { interval: 5e3, pause: "hover", wrap: !0 }), + (b.prototype.cycle = function (b) { + return ( + b || (this.paused = !1), + this.interval && clearInterval(this.interval), + this.options.interval && + !this.paused && + (this.interval = setInterval( + a.proxy(this.next, this), + this.options.interval, + )), + this + ); + }), + (b.prototype.getActiveIndex = function () { + return ( + (this.$active = this.$element.find(".item.active")), + (this.$items = this.$active.parent().children()), + this.$items.index(this.$active) + ); + }), + (b.prototype.to = function (b) { + var c = this, + d = this.getActiveIndex(); + return b > this.$items.length - 1 || 0 > b + ? void 0 + : this.sliding + ? this.$element.one("slid.bs.carousel", function () { + c.to(b); + }) + : d == b + ? this.pause().cycle() + : this.slide(b > d ? "next" : "prev", a(this.$items[b])); + }), + (b.prototype.pause = function (b) { + return ( + b || (this.paused = !0), + this.$element.find(".next, .prev").length && + a.support.transition && + (this.$element.trigger(a.support.transition.end), this.cycle(!0)), + (this.interval = clearInterval(this.interval)), + this + ); + }), + (b.prototype.next = function () { + return this.sliding ? void 0 : this.slide("next"); + }), + (b.prototype.prev = function () { + return this.sliding ? void 0 : this.slide("prev"); + }), + (b.prototype.slide = function (b, c) { + var d = this.$element.find(".item.active"), + e = c || d[b](), + f = this.interval, + g = "next" == b ? "left" : "right", + h = "next" == b ? "first" : "last", + i = this; + if (!e.length) { + if (!this.options.wrap) return; + e = this.$element.find(".item")[h](); + } + if (e.hasClass("active")) return (this.sliding = !1); + var j = a.Event("slide.bs.carousel", { + relatedTarget: e[0], + direction: g, + }); + return ( + this.$element.trigger(j), + j.isDefaultPrevented() + ? void 0 + : ((this.sliding = !0), + f && this.pause(), + this.$indicators.length && + (this.$indicators.find(".active").removeClass("active"), + this.$element.one("slid.bs.carousel", function () { + var b = a(i.$indicators.children()[i.getActiveIndex()]); + b && b.addClass("active"); + })), + a.support.transition && this.$element.hasClass("slide") + ? 
(e.addClass(b), + e[0].offsetWidth, + d.addClass(g), + e.addClass(g), + d + .one(a.support.transition.end, function () { + e.removeClass([b, g].join(" ")).addClass("active"), + d.removeClass(["active", g].join(" ")), + (i.sliding = !1), + setTimeout(function () { + i.$element.trigger("slid.bs.carousel"); + }, 0); + }) + .emulateTransitionEnd( + 1e3 * d.css("transition-duration").slice(0, -1), + )) + : (d.removeClass("active"), + e.addClass("active"), + (this.sliding = !1), + this.$element.trigger("slid.bs.carousel")), + f && this.cycle(), + this) + ); + }); + var c = a.fn.carousel; + (a.fn.carousel = function (c) { + return this.each(function () { + var d = a(this), + e = d.data("bs.carousel"), + f = a.extend({}, b.DEFAULTS, d.data(), "object" == typeof c && c), + g = "string" == typeof c ? c : f.slide; + e || d.data("bs.carousel", (e = new b(this, f))), + "number" == typeof c + ? e.to(c) + : g + ? e[g]() + : f.interval && e.pause().cycle(); + }); + }), + (a.fn.carousel.Constructor = b), + (a.fn.carousel.noConflict = function () { + return (a.fn.carousel = c), this; + }), + a(document).on( + "click.bs.carousel.data-api", + "[data-slide], [data-slide-to]", + function (b) { + var c, + d = a(this), + e = a( + d.attr("data-target") || + ((c = d.attr("href")) && c.replace(/.*(?=#[^\s]+$)/, "")), + ), + f = a.extend({}, e.data(), d.data()), + g = d.attr("data-slide-to"); + g && (f.interval = !1), + e.carousel(f), + (g = d.attr("data-slide-to")) && e.data("bs.carousel").to(g), + b.preventDefault(); + }, + ), + a(window).on("load", function () { + a('[data-ride="carousel"]').each(function () { + var b = a(this); + b.carousel(b.data()); + }); + }); + })(jQuery), + +(function (a) { + "use strict"; + var b = function (c, d) { + (this.$element = a(c)), + (this.options = a.extend({}, b.DEFAULTS, d)), + (this.transitioning = null), + this.options.parent && (this.$parent = a(this.options.parent)), + this.options.toggle && this.toggle(); + }; + (b.DEFAULTS = { toggle: !0 }), + (b.prototype.dimension = function () { + var a = this.$element.hasClass("width"); + return a ? 
"width" : "height"; + }), + (b.prototype.show = function () { + if (!this.transitioning && !this.$element.hasClass("in")) { + var b = a.Event("show.bs.collapse"); + if ((this.$element.trigger(b), !b.isDefaultPrevented())) { + var c = this.$parent && this.$parent.find("> .panel > .in"); + if (c && c.length) { + var d = c.data("bs.collapse"); + if (d && d.transitioning) return; + c.collapse("hide"), d || c.data("bs.collapse", null); + } + var e = this.dimension(); + this.$element.removeClass("collapse").addClass("collapsing")[e](0), + (this.transitioning = 1); + var f = function () { + this.$element + .removeClass("collapsing") + .addClass("collapse in") + [e]("auto"), + (this.transitioning = 0), + this.$element.trigger("shown.bs.collapse"); + }; + if (!a.support.transition) return f.call(this); + var g = a.camelCase(["scroll", e].join("-")); + this.$element + .one(a.support.transition.end, a.proxy(f, this)) + .emulateTransitionEnd(350) + [e](this.$element[0][g]); + } + } + }), + (b.prototype.hide = function () { + if (!this.transitioning && this.$element.hasClass("in")) { + var b = a.Event("hide.bs.collapse"); + if ((this.$element.trigger(b), !b.isDefaultPrevented())) { + var c = this.dimension(); + this.$element[c](this.$element[c]())[0].offsetHeight, + this.$element + .addClass("collapsing") + .removeClass("collapse") + .removeClass("in"), + (this.transitioning = 1); + var d = function () { + (this.transitioning = 0), + this.$element + .trigger("hidden.bs.collapse") + .removeClass("collapsing") + .addClass("collapse"); + }; + return a.support.transition + ? void this.$element[c](0) + .one(a.support.transition.end, a.proxy(d, this)) + .emulateTransitionEnd(350) + : d.call(this); + } + } + }), + (b.prototype.toggle = function () { + this[this.$element.hasClass("in") ? "hide" : "show"](); + }); + var c = a.fn.collapse; + (a.fn.collapse = function (c) { + return this.each(function () { + var d = a(this), + e = d.data("bs.collapse"), + f = a.extend({}, b.DEFAULTS, d.data(), "object" == typeof c && c); + !e && f.toggle && "show" == c && (c = !c), + e || d.data("bs.collapse", (e = new b(this, f))), + "string" == typeof c && e[c](); + }); + }), + (a.fn.collapse.Constructor = b), + (a.fn.collapse.noConflict = function () { + return (a.fn.collapse = c), this; + }), + a(document).on( + "click.bs.collapse.data-api", + "[data-toggle=collapse]", + function (b) { + var c, + d = a(this), + e = + d.attr("data-target") || + b.preventDefault() || + ((c = d.attr("href")) && c.replace(/.*(?=#[^\s]+$)/, "")), + f = a(e), + g = f.data("bs.collapse"), + h = g ? "toggle" : d.data(), + i = d.attr("data-parent"), + j = i && a(i); + (g && g.transitioning) || + (j && + j + .find('[data-toggle=collapse][data-parent="' + i + '"]') + .not(d) + .addClass("collapsed"), + d[f.hasClass("in") ? "addClass" : "removeClass"]("collapsed")), + f.collapse(h); + }, + ); + })(jQuery), + +(function (a) { + "use strict"; + function b(b) { + a(d).remove(), + a(e).each(function () { + var d = c(a(this)), + e = { relatedTarget: this }; + d.hasClass("open") && + (d.trigger((b = a.Event("hide.bs.dropdown", e))), + b.isDefaultPrevented() || + d.removeClass("open").trigger("hidden.bs.dropdown", e)); + }); + } + function c(b) { + var c = b.attr("data-target"); + c || + ((c = b.attr("href")), + (c = c && /#[A-Za-z]/.test(c) && c.replace(/.*(?=#[^\s]*$)/, ""))); + var d = c && a(c); + return d && d.length ? 
d : b.parent(); + } + var d = ".dropdown-backdrop", + e = "[data-toggle=dropdown]", + f = function (b) { + a(b).on("click.bs.dropdown", this.toggle); + }; + (f.prototype.toggle = function (d) { + var e = a(this); + if (!e.is(".disabled, :disabled")) { + var f = c(e), + g = f.hasClass("open"); + if ((b(), !g)) { + "ontouchstart" in document.documentElement && + !f.closest(".navbar-nav").length && + a('