Remove vision #1215
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evals workflow: triggers, shared environment and concurrency policy.
name: Evals

# Runs for pull requests when they are opened, receive new commits
# (synchronize), or have a label added or removed.
on:
  pull_request:
    types: [opened, synchronize, labeled, unlabeled]

# Workflow-level environment, visible to every job below.
env:
  EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
  EVAL_CATEGORIES: "observe,act,combination,extract,text_extract"

# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.ref }}
  cancel-in-progress: true

jobs:
# --- jobs.determine-evals ---
  # Publishes one boolean output per eval category (extract, act, observe,
  # text-extract) so that later jobs can decide whether to run.
  determine-evals:
    runs-on: ubuntu-latest
    outputs:
      run-extract: ${{ steps.check-labels.outputs.run-extract }}
      run-act: ${{ steps.check-labels.outputs.run-act }}
      run-observe: ${{ steps.check-labels.outputs.run-observe }}
      run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }}
    steps:
      - id: check-labels
        run: |
          # On the main branch every category is enabled, labels are ignored.
          if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
            echo "Running all tests for main branch"
            {
              echo "run-extract=true"
              echo "run-act=true"
              echo "run-observe=true"
              echo "run-text-extract=true"
            } >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Otherwise a category is enabled only when the pull request
          # carries the label of the same name.
          {
            echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}"
            echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}"
            echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}"
            echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}"
          } >> "$GITHUB_OUTPUT"
# --- jobs.run-lint ---
  # Installs dependencies from scratch on Node.js 20 and runs `npm run lint`.
  run-lint:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # node_modules and the lockfile are removed before installing, so npm
      # resolves the dependency tree without them.
      - name: Install dependencies
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Run Lint
        run: npm run lint
# --- jobs.run-build ---
  # Installs dependencies from scratch on Node.js 20 and runs `npm run build`.
  run-build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # node_modules and the lockfile are removed before installing, so npm
      # resolves the dependency tree without them.
      - name: Install dependencies
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Run Build
        run: npm run build
# --- jobs.run-e2e-tests ---
  # Runs the `npm run e2e` suite after lint and build have both succeeded.
  # No API secrets are passed to this job; only HEADLESS is set.
  run-e2e-tests:
    needs:
      - run-lint
      - run-build
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps

      - name: Build Stagehand
        run: npm run build

      - name: Run E2E Tests (Deterministic Playwright)
        run: npm run e2e
# --- jobs.run-e2e-bb-tests ---
  # Runs the `npm run e2e:bb` suite after the plain e2e job, with the
  # OpenAI, Anthropic and Browserbase credentials in its environment.
  run-e2e-bb-tests:
    needs:
      - run-e2e-tests
    runs-on: ubuntu-latest
    timeout-minutes: 50
    # Only for push events, or for pull requests whose head branch lives in
    # this repository. NOTE(review): presumably this keeps fork PRs, which
    # do not receive the secrets below, from running the job - confirm.
    if: >
      github.event_name == 'push' ||
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps

      - name: Build Stagehand
        run: npm run build

      - name: Run E2E Tests (browserbase)
        run: npm run e2e:bb
# --- jobs.run-combination-evals ---
  # Runs the "combination" eval category with EVAL_ENV=browserbase after both
  # e2e jobs and determine-evals have completed. Unlike the other eval jobs
  # it is not gated on a label, and it only reports the score (no threshold).
  run-combination-evals:
    needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 40
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Build Stagehand
        run: npm run build

      - name: Install Playwright browsers
        run: npm exec playwright install --with-deps

      - name: Run Combination Evals
        run: npm run evals category combination

      - name: Log Combination Evals Performance
        run: |
          # Verify the summary exists BEFORE reading it with jq. The runner's
          # default shell is `bash -e`, so a jq failure on a missing file
          # would abort the step before this message could be printed.
          if [ ! -f eval-summary.json ]; then
            echo "Eval summary not found for combination category. Failing CI."
            exit 1
          fi

          experimentName=$(jq -r '.experimentName' eval-summary.json)
          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

          # Informational only: the combination score never fails the job.
          combination_score=$(jq '.categories.combination' eval-summary.json)
          echo "Combination category score: $combination_score%"
# --- jobs.run-act-evals ---
  # Runs the "act" eval category with EVAL_ENV=browserbase after the
  # combination evals. Every step past the label check is gated on
  # determine-evals reporting run-act == 'true'; otherwise those steps are
  # skipped. The job fails when the act score is below 80.
  run-act-evals:
    needs: [run-combination-evals, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      # NOTE(review): has_label is written here but no later step in this
      # job reads it; the steps below test the determine-evals output
      # directly. Kept so anything consuming the step output keeps working.
      - name: Check for 'act' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-act }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for ACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi

      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-act == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-act == 'true'
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Build Stagehand
        if: needs.determine-evals.outputs.run-act == 'true'
        run: npm run build

      - name: Install Playwright browsers
        if: needs.determine-evals.outputs.run-act == 'true'
        run: npm exec playwright install --with-deps

      - name: Run Act Evals
        if: needs.determine-evals.outputs.run-act == 'true'
        run: npm run evals category act

      - name: Log Act Evals Performance
        if: needs.determine-evals.outputs.run-act == 'true'
        run: |
          # Verify the summary exists BEFORE reading it with jq. The runner's
          # default shell is `bash -e`, so a jq failure on a missing file
          # would abort the step before this message could be printed.
          if [ ! -f eval-summary.json ]; then
            echo "Eval summary not found for act category. Failing CI."
            exit 1
          fi

          experimentName=$(jq -r '.experimentName' eval-summary.json)
          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

          act_score=$(jq '.categories.act' eval-summary.json)
          echo "Act category score: $act_score%"

          # bc prints 1 when the comparison holds, which (( )) treats as true.
          if (( $(echo "$act_score < 80" | bc -l) )); then
            echo "Act category score is below 80%. Failing CI."
            exit 1
          fi
# --- jobs.run-extract-evals ---
  # Runs the "extract" eval category twice (domExtract, then textExtract)
  # with EVAL_ENV=browserbase, after the act evals. Every step past the label
  # check is gated on determine-evals reporting run-extract == 'true'. Only
  # the domExtract score is held to the 80 threshold.
  run-extract-evals:
    needs:
      - run-act-evals
      - determine-evals
    runs-on: ubuntu-latest
    timeout-minutes: 50
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      # Records whether the category was requested and logs a note if not.
      - name: Check for 'extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi

      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Build Stagehand
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: npm run build

      - name: Install Playwright browsers
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: npm exec playwright install --with-deps

      # Pass 1: extract category using domExtract. The summary is renamed so
      # that the second pass does not overwrite it.
      - name: Run Extract Evals (domExtract)
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: npm run evals category extract -- --extract-method=domExtract

      - name: Save Extract Dom Results
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: mv eval-summary.json eval-summary-extract-dom.json

      # Pass 2: extract category using textExtract.
      - name: Run Extract Evals (textExtract)
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: npm run evals category extract -- --extract-method=textExtract

      - name: Save Extract Text Results
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: mv eval-summary.json eval-summary-extract-text.json

      # Report both passes; only the domExtract score can fail the job.
      - name: Log and Compare Extract Evals Performance
        if: needs.determine-evals.outputs.run-extract == 'true'
        run: |
          dom_summary=eval-summary-extract-dom.json
          text_summary=eval-summary-extract-text.json
          experiments_url="https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments"

          experimentNameDom=$(jq -r '.experimentName' "$dom_summary")
          dom_score=$(jq '.categories.extract' "$dom_summary")
          echo "DomExtract Extract category score: $dom_score%"
          echo "View domExtract results: ${experiments_url}/${experimentNameDom}"

          experimentNameText=$(jq -r '.experimentName' "$text_summary")
          text_score=$(jq '.categories.extract' "$text_summary")
          echo "TextExtract Extract category score: $text_score%"
          echo "View textExtract results: ${experiments_url}/${experimentNameText}"

          # bc prints 1 when the comparison holds, which (( )) treats as true.
          if (( $(echo "$dom_score < 80" | bc -l) )); then
            echo "DomExtract extract category score is below 80%. Failing CI."
            exit 1
          fi
# --- jobs.run-text-extract-evals ---
  # Runs the "text_extract" eval category twice (textExtract, then
  # domExtract) with EVAL_ENV=browserbase, after the extract evals. Every
  # step past the label check is gated on determine-evals reporting
  # run-text-extract == 'true'. Only the textExtract score is held to the
  # 80 threshold.
  run-text-extract-evals:
    needs:
      - run-extract-evals
      - determine-evals
    runs-on: ubuntu-latest
    timeout-minutes: 120
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      # Records whether the category was requested and logs a note if not.
      - name: Check for 'text-extract' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-text-extract }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for TEXT-EXTRACT. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi

      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Install Playwright browsers
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: npm exec playwright install --with-deps

      - name: Build Stagehand
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: npm run build

      # Pass 1: text_extract category using textExtract. The summary is
      # renamed so that the second pass does not overwrite it.
      - name: Run text_extract Evals (textExtract)
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: npm run evals category text_extract -- --extract-method=textExtract

      - name: Save text_extract Text Results
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: mv eval-summary.json eval-summary-text_extract-text.json

      # Pass 2: text_extract category using domExtract.
      - name: Run text_extract Evals (domExtract)
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: npm run evals category text_extract -- --extract-method=domExtract

      - name: Save text_extract Dom Results
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: mv eval-summary.json eval-summary-text_extract-dom.json

      # Report both passes; only the textExtract score can fail the job.
      - name: Log and Compare text_extract Evals Performance
        if: needs.determine-evals.outputs.run-text-extract == 'true'
        run: |
          text_summary=eval-summary-text_extract-text.json
          dom_summary=eval-summary-text_extract-dom.json
          experiments_url="https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments"

          experimentNameText=$(jq -r '.experimentName' "$text_summary")
          text_score=$(jq '.categories.text_extract' "$text_summary")
          echo "TextExtract text_extract category score: $text_score%"
          echo "View textExtract results: ${experiments_url}/${experimentNameText}"

          experimentNameDom=$(jq -r '.experimentName' "$dom_summary")
          dom_score=$(jq '.categories.text_extract' "$dom_summary")
          echo "DomExtract text_extract category score: $dom_score%"
          echo "View domExtract results: ${experiments_url}/${experimentNameDom}"

          # bc prints 1 when the comparison holds, which (( )) treats as true.
          if (( $(echo "$text_score < 80" | bc -l) )); then
            echo "textExtract text_extract category score is below 80%. Failing CI."
            exit 1
          fi
# --- jobs.run-observe-evals ---
  # Runs the "observe" eval category with EVAL_ENV=browserbase after the
  # text-extract evals. Every step past the label check is gated on
  # determine-evals reporting run-observe == 'true'; otherwise those steps
  # are skipped. The job fails when the observe score is below 80.
  run-observe-evals:
    needs: [run-text-extract-evals, determine-evals]
    runs-on: ubuntu-latest
    timeout-minutes: 25
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
      HEADLESS: true
      EVAL_ENV: browserbase
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      # NOTE(review): has_label is written here but no later step in this
      # job reads it; the steps below test the determine-evals output
      # directly. Kept so anything consuming the step output keeps working.
      - name: Check for 'observe' label
        id: label-check
        run: |
          if [ "${{ needs.determine-evals.outputs.run-observe }}" != "true" ]; then
            echo "has_label=false" >> $GITHUB_OUTPUT
            echo "No label for OBSERVE. Exiting with success."
          else
            echo "has_label=true" >> $GITHUB_OUTPUT
          fi

      - name: Set up Node.js
        if: needs.determine-evals.outputs.run-observe == 'true'
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Start from a clean tree: no installed modules, no lockfile.
      - name: Install dependencies
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: |
          rm -rf node_modules
          rm -f package-lock.json
          npm install

      - name: Install Playwright browsers
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: npm exec playwright install --with-deps

      - name: Build Stagehand
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: npm run build

      - name: Run Observe Evals
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: npm run evals category observe

      - name: Log Observe Evals Performance
        if: needs.determine-evals.outputs.run-observe == 'true'
        run: |
          # Verify the summary exists BEFORE reading it with jq. The runner's
          # default shell is `bash -e`, so a jq failure on a missing file
          # would abort the step before this message could be printed.
          if [ ! -f eval-summary.json ]; then
            echo "Eval summary not found for observe category. Failing CI."
            exit 1
          fi

          experimentName=$(jq -r '.experimentName' eval-summary.json)
          echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

          observe_score=$(jq '.categories.observe' eval-summary.json)
          echo "Observe category score: $observe_score%"

          # bc prints 1 when the comparison holds, which (( )) treats as true.
          if (( $(echo "$observe_score < 80" | bc -l) )); then
            echo "Observe category score is below 80%. Failing CI."
            exit 1
          fi