browserbase · kamath · Mar 10, 2025 · Mar 14, 2025 · Mar 14, 2025 · Mar 14, 2025
diff --git a/.changeset/cool-lemons-report.md b/.changeset/cool-lemons-report.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+pass observeHandler into actHandler
diff --git a/.changeset/curly-rules-build.md b/.changeset/curly-rules-build.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Added support for offloading agent tasks to the API.
diff --git a/.changeset/empty-spoons-float.md b/.changeset/empty-spoons-float.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Added a `stagehand.history` array which stores the `act`, `extract`, `observe`, and `goto` calls made. Since this history is stored at the `StagehandPage` level, it captures these calls even when they are made indirectly by an agent.
diff --git a/.changeset/fifty-crabs-arrive.md b/.changeset/fifty-crabs-arrive.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+You can now call `stagehand.metrics` to get token usage metrics. You can also set `logInferenceToFile` in the Stagehand config to log the entire call/response history between Stagehand and the LLM.
diff --git a/.changeset/four-hoops-mix.md b/.changeset/four-hoops-mix.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+add custom error classes
diff --git a/.changeset/free-pots-move.md b/.changeset/free-pots-move.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Added CDP support for screenshots. Find out more about the benefits here: https://docs.browserbase.com/features/screenshots#why-use-cdp-for-screenshots%3F
diff --git a/.changeset/full-trams-learn.md b/.changeset/full-trams-learn.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Fix to remove unnecessary healthcheck ping on sdk
diff --git a/.changeset/gold-hounds-stand.md b/.changeset/gold-hounds-stand.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Add BB SDK 2.4.0 to get connectUrl from an existing session
diff --git a/.changeset/petite-donuts-lead.md b/.changeset/petite-donuts-lead.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+support api usage for extract with no args
diff --git a/.changeset/puny-garlics-join.md b/.changeset/puny-garlics-join.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Fix the open operator handler to work with anthropic
diff --git a/.changeset/rare-tires-turn.md b/.changeset/rare-tires-turn.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+Added support for resuming a Stagehand session created on the API.
diff --git a/.changeset/shiny-windows-attack.md b/.changeset/shiny-windows-attack.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+remove debugDom
diff --git a/.changeset/six-lies-lie.md b/.changeset/six-lies-lie.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": patch
+---
+
+rm unused handlePossiblePageNavigation
diff --git a/.changeset/wise-worlds-pull.md b/.changeset/wise-worlds-pull.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": major
+---
+
+temporary placeholder
diff --git a/.changeset/young-dots-fry.md b/.changeset/young-dots-fry.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -20,6 +20,7 @@ jobs:
   determine-evals:
     runs-on: ubuntu-latest
     outputs:
+      run-combination: ${{ steps.check-labels.outputs.run-combination }}
       run-extract: ${{ steps.check-labels.outputs.run-extract }}
       run-act: ${{ steps.check-labels.outputs.run-act }}
       run-observe: ${{ steps.check-labels.outputs.run-observe }}
@@ -31,6 +32,7 @@ jobs:
           # Default to running all tests on main branch
           if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
             echo "Running all tests for main branch"
+            echo "run-combination=true" >> $GITHUB_OUTPUT
             echo "run-extract=true" >> $GITHUB_OUTPUT
             echo "run-act=true" >> $GITHUB_OUTPUT
             echo "run-observe=true" >> $GITHUB_OUTPUT
@@ -40,6 +42,7 @@ jobs:
           fi
 
           # Check for specific labels
+          echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT
           echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT
           echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT
           echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT
@@ -147,7 +150,7 @@ jobs:
         run: npm run e2e:local
 
   run-e2e-bb-tests:
-    needs: [run-e2e-tests]
+    needs: [run-lint, run-build]
     runs-on: ubuntu-latest
     timeout-minutes: 50
     if: >
@@ -183,8 +186,129 @@ jobs:
       - name: Run E2E Tests (browserbase)
         run: npm run e2e:bb
 
+  run-regression-evals-dom-extract:
+    needs:
+      [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 7
+    outputs:
+      regression_dom_score: ${{ steps.set-dom-score.outputs.regression_dom_score }}
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        run: |
+          rm -rf node_modules
+          rm -f package-lock.json
+          npm install
+
+      - name: Build Stagehand
+        run: npm run build
+
+      - name: Install Playwright browsers
+        run: npm exec playwright install --with-deps
+
+      - name: Run Regression Evals (domExtract)
+        run: npm run evals category regression_dom_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=domExtract
+
+      - name: Save Regression domExtract Results
+        run: mv eval-summary.json eval-summary-regression-dom.json
+
+      - name: Log Regression (domExtract) Evals Performance
+        id: set-dom-score
+        run: |
+          experimentNameRegressionDom=$(jq -r '.experimentName' eval-summary-regression-dom.json)
+          regression_dom_score=$(jq '.categories.regression_dom_extract' eval-summary-regression-dom.json)
+          echo "regression_dom_extract category score: ${regression_dom_score}%"
+          echo "View regression_dom_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionDom}"
+          echo "regression_dom_score=$regression_dom_score" >> "$GITHUB_OUTPUT"
+
+  run-regression-evals-text-extract:
+    needs:
+      [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals]
+    runs-on: ubuntu-latest
+    timeout-minutes: 7
+    outputs:
+      regression_text_score: ${{ steps.set-text-score.outputs.regression_text_score }}
+    env:
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+      BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+      BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+      BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+      HEADLESS: true
+      EVAL_ENV: browserbase
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - name: Install dependencies
+        run: |
+          rm -rf node_modules
+          rm -f package-lock.json
+          npm install
+
+      - name: Build Stagehand
+        run: npm run build
+
+      - name: Install Playwright browsers
+        run: npm exec playwright install --with-deps
+
+      - name: Run Regression Evals (textExtract)
+        run: npm run evals category regression_text_extract trials=2 concurrency=12 env=BROWSERBASE -- --extract-method=textExtract
+
+      - name: Save Regression textExtract Results
+        run: mv eval-summary.json eval-summary-regression-text.json
+
+      - name: Log Regression (textExtract) Evals Performance
+        id: set-text-score
+        run: |
+          experimentNameRegressionText=$(jq -r '.experimentName' eval-summary-regression-text.json)
+          regression_text_score=$(jq '.categories.regression_text_extract' eval-summary-regression-text.json)
+          echo "regression_text_extract category score: ${regression_text_score}%"
+          echo "View regression_text_extract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameRegressionText}"
+          echo "regression_text_score=$regression_text_score" >> "$GITHUB_OUTPUT"
+
+  check-regression-evals-score:
+    needs: [run-regression-evals-text-extract, run-regression-evals-dom-extract]
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - name: Compare Overall Regression Evals Score
+        run: |
+          regression_dom_score="${{ needs.run-regression-evals-dom-extract.outputs.regression_dom_score }}"
+          regression_text_score="${{ needs.run-regression-evals-text-extract.outputs.regression_text_score }}"
+
+          overall_score=$(echo "(${regression_dom_score} + ${regression_text_score}) / 2" | bc -l)
+          echo "Overall regression score: ${overall_score}%"
+
+          # Fail if overall score is below 90%
+          if (( $(echo "${overall_score} < 90" | bc -l) )); then
+            echo "Overall regression score is below 90%. Failing CI."
+            exit 1
+          fi
+
   run-combination-evals:
-    needs: [run-e2e-bb-tests, run-e2e-tests, determine-evals]
+    needs: [check-regression-evals-score, determine-evals]
     runs-on: ubuntu-latest
     timeout-minutes: 40
     env:
@@ -199,27 +323,43 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
+      - name: Check for 'combination' label
+        id: label-check
+        run: |
+          if [ "${{ needs.determine-evals.outputs.run-combination }}" != "true" ]; then
+            echo "has_label=false" >> $GITHUB_OUTPUT
+            echo "No label for COMBINATION. Exiting with success."
+          else
+            echo "has_label=true" >> $GITHUB_OUTPUT
+          fi
+
       - name: Set up Node.js
+        if: needs.determine-evals.outputs.run-combination == 'true'
         uses: actions/setup-node@v4
         with:
           node-version: "20"
 
       - name: Install dependencies
+        if: needs.determine-evals.outputs.run-combination == 'true'
         run: |
           rm -rf node_modules
           rm -f package-lock.json
           npm install
 
       - name: Build Stagehand
+        if: needs.determine-evals.outputs.run-combination == 'true'
         run: npm run build
 
       - name: Install Playwright browsers
+        if: needs.determine-evals.outputs.run-combination == 'true'
         run: npm exec playwright install --with-deps
 
       - name: Run Combination Evals
+        if: needs.determine-evals.outputs.run-combination == 'true'
         run: npm run evals category combination
 
       - name: Log Combination Evals Performance
+        if: needs.determine-evals.outputs.run-combination == 'true'
         run: |
           experimentName=$(jq -r '.experimentName' eval-summary.json)
           echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}"

diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,4 @@ tmp/
 eval-summary.json
 pnpm-lock.yaml
 evals/deterministic/tests/BrowserContext/tmp-test.har
+examples/example.ts
diff --git a/evals/deterministic/bb.stagehand.config.ts b/evals/deterministic/bb.stagehand.config.ts
@@ -0,0 +1,13 @@
+import { default as DefaultStagehandConfig } from "@/stagehand.config";
+import type { ConstructorParams } from "@/dist";
+import dotenv from "dotenv";
+dotenv.config({ path: "../../.env" });
+
+const StagehandConfig: ConstructorParams = {
+  ...DefaultStagehandConfig,
+  env: "BROWSERBASE" /* Environment to run Stagehand in */,
+  browserbaseSessionCreateParams: {
+    projectId: process.env.BROWSERBASE_PROJECT_ID,
+  },
+};
+export default StagehandConfig;
diff --git a/evals/deterministic/stagehand.config.ts → evals/deterministic/e2e.stagehand.config.ts b/evals/deterministic/stagehand.config.ts → evals/deterministic/e2e.stagehand.config.ts
@@ -6,11 +6,6 @@ dotenv.config({ path: "../../.env" });
 const StagehandConfig: ConstructorParams = {
   ...DefaultStagehandConfig,
   env: "LOCAL" /* Environment to run Stagehand in */,
-  verbose: 1 /* Logging verbosity level (0=quiet, 1=normal, 2=verbose) */,
   headless: true /* Run browser in headless mode */,
-  browserbaseSessionCreateParams: {
-    projectId: process.env.BROWSERBASE_PROJECT_ID,
-  },
-  enableCaching: false /* Enable caching functionality */,
 };
 export default StagehandConfig;
diff --git a/evals/deterministic/tests/BrowserContext/addInitScript.test.ts b/evals/deterministic/tests/BrowserContext/addInitScript.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 
 test.describe("StagehandContext - addInitScript", () => {
   test("should inject a script on the context before pages load", async () => {

diff --git a/evals/deterministic/tests/BrowserContext/cookies.test.ts b/evals/deterministic/tests/BrowserContext/cookies.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 
 test.describe("StagehandContext - Cookies", () => {
   let stagehand: Stagehand;

diff --git a/evals/deterministic/tests/BrowserContext/multiPage.test.ts b/evals/deterministic/tests/BrowserContext/multiPage.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 import { Page } from "@/dist";
 
 import http from "http";

diff --git a/evals/deterministic/tests/BrowserContext/page.test.ts b/evals/deterministic/tests/BrowserContext/page.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 
 import http from "http";
 import express from "express";

diff --git a/evals/deterministic/tests/BrowserContext/routing.test.ts b/evals/deterministic/tests/BrowserContext/routing.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 
 import http from "http";
 import express from "express";

diff --git a/evals/deterministic/tests/Errors/apiKeyError.test.ts b/evals/deterministic/tests/Errors/apiKeyError.test.ts
@@ -1,6 +1,6 @@
 import { test, expect } from "@playwright/test";
 import { Stagehand } from "@/dist";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/e2e.stagehand.config";
 import { z } from "zod";
 
 test.describe("API key/LLMClient error", () => {

diff --git a/evals/deterministic/tests/browserbase/contexts.test.ts b/evals/deterministic/tests/browserbase/contexts.test.ts
@@ -1,6 +1,6 @@
 import Browserbase from "@browserbasehq/sdk";
 import { expect, test } from "@playwright/test";
-import StagehandConfig from "@/evals/deterministic/stagehand.config";
+import StagehandConfig from "@/evals/deterministic/bb.stagehand.config";
 import { Stagehand } from "@/dist";
 
 // Configuration
@@ -76,6 +76,7 @@ test.describe("Contexts", () => {
       // We will be adding cookies to the context in this session, so we need mark persist=true
       stagehand = new Stagehand({
         ...StagehandConfig,
+        env: "BROWSERBASE",
         browserbaseSessionCreateParams: {
           projectId: BROWSERBASE_PROJECT_ID,
           browserSettings: {