Skip to content

Commit 73a3e7a

Browse files
revert back to original default eval models (#697)
* revert back to original default eval models
* add gemini & 4.1
* update CI
* trigger CI
* set google api key
1 parent 7d81b3c commit 73a3e7a

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

.github/workflows/ci.yml

+9-1
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ on:
99
- unlabeled
1010

1111
env:
12-
EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest"
12+
EVAL_MODELS: "gpt-4.1,gemini-2.0-flash,claude-3-5-sonnet-latest"
1313
EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract"
1414

1515
concurrency:
@@ -159,6 +159,7 @@ jobs:
159159
env:
160160
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
161161
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
162+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
162163
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
163164
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
164165
HEADLESS: true
@@ -196,6 +197,7 @@ jobs:
196197
env:
197198
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
198199
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
200+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
199201
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
200202
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
201203
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -248,6 +250,7 @@ jobs:
248250
env:
249251
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
250252
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
253+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
251254
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
252255
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
253256
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -313,6 +316,7 @@ jobs:
313316
env:
314317
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
315318
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
319+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
316320
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
317321
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
318322
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -381,6 +385,7 @@ jobs:
381385
env:
382386
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
383387
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
388+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
384389
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
385390
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
386391
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -466,6 +471,7 @@ jobs:
466471
env:
467472
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
468473
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
474+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
469475
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
470476
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
471477
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -535,6 +541,7 @@ jobs:
535541
env:
536542
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
537543
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
544+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
538545
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
539546
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
540547
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
@@ -603,6 +610,7 @@ jobs:
603610
env:
604611
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
605612
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
613+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
606614
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
607615
BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
608616
BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}

evals/taskConfig.ts

+1-1
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ if (filterByEvalName && !tasksByName[filterByEvalName]) {
100100
*/
101101
const DEFAULT_EVAL_MODELS = process.env.EVAL_MODELS
102102
? process.env.EVAL_MODELS.split(",")
103-
: ["gemini-2.5-pro-preview-03-25", "o3"];
103+
: ["gemini-2.0-flash", "gpt-4.1-mini", "claude-3-5-sonnet-latest"];
104104

105105
const DEFAULT_AGENT_MODELS = process.env.EVAL_AGENT_MODELS
106106
? process.env.EVAL_AGENT_MODELS.split(",")

0 commit comments

Comments
 (0)