From 442b2990421af6c7bdb5065223bd691f3b3bbe01 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 2 Mar 2026 22:27:26 -0800 Subject: [PATCH 001/712] Count tokens by open ai endpoint --- .../token-count/__tests__/token-count.test.ts | 479 ++++++++++++++++++ web/src/app/api/v1/token-count/_post.ts | 186 ++++++- 2 files changed, 657 insertions(+), 8 deletions(-) diff --git a/web/src/app/api/v1/token-count/__tests__/token-count.test.ts b/web/src/app/api/v1/token-count/__tests__/token-count.test.ts index 903521b91f..22c89bf640 100644 --- a/web/src/app/api/v1/token-count/__tests__/token-count.test.ts +++ b/web/src/app/api/v1/token-count/__tests__/token-count.test.ts @@ -3,6 +3,8 @@ import { describe, expect, it } from 'bun:test' import { convertContentToAnthropic, convertToAnthropicMessages, + convertToResponsesApiInput, + countTokensViaOpenAI, formatToolContent, } from '../_post' @@ -433,6 +435,483 @@ describe('convertToAnthropicMessages', () => { }) }) +describe('convertToResponsesApiInput', () => { + it('converts a simple user message', () => { + const result = convertToResponsesApiInput([ + { role: 'user', content: 'Hello world' }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Hello world' }, + ]) + }) + + it('maps system messages to developer role', () => { + const result = convertToResponsesApiInput([ + { role: 'system', content: 'You are helpful' }, + { role: 'user', content: 'Hi' }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'developer', content: 'You are helpful' }, + { type: 'message', role: 'user', content: 'Hi' }, + ]) + }) + + it('converts tool messages to function_call_output', () => { + const result = convertToResponsesApiInput([ + { role: 'tool', toolCallId: 'call-1', content: 'File contents here' }, + ]) + expect(result).toEqual([ + { type: 'function_call_output', call_id: 'call-1', output: 'File contents here' }, + ]) + }) + + it('uses unknown call_id when toolCallId is missing', () => { + const result = convertToResponsesApiInput([ + { role: 'tool', content: 'Some output' }, + ]) + expect(result).toEqual([ + { type: 'function_call_output', call_id: 'unknown', output: 'Some output' }, + ]) + }) + + it('converts assistant messages', () => { + const result = convertToResponsesApiInput([ + { role: 'assistant', content: 'I can help with that.' }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'assistant', content: 'I can help with that.' }, + ]) + }) + + it('handles array content with text parts', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [{ type: 'text', text: 'What is TypeScript?' }], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'What is TypeScript?' }, + ]) + }) + + it('converts tool-call content to function_call items', () => { + const result = convertToResponsesApiInput([ + { + role: 'assistant', + content: [ + { + type: 'tool-call', + toolCallId: 'call-1', + toolName: 'read_file', + input: { path: 'src/index.ts' }, + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'function_call', + id: 'call-1', + name: 'read_file', + arguments: '{"path":"src/index.ts"}', + }, + ]) + }) + + it('splits assistant messages with text and tool-calls', () => { + const result = convertToResponsesApiInput([ + { + role: 'assistant', + content: [ + { type: 'text', text: 'Let me read that file.' }, + { + type: 'tool-call', + toolCallId: 'call-2', + toolName: 'read_file', + input: { path: 'test.ts' }, + }, + ], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'assistant', content: 'Let me read that file.' }, + { + type: 'function_call', + id: 'call-2', + name: 'read_file', + arguments: '{"path":"test.ts"}', + }, + ]) + }) + + it('handles json content parts', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [{ type: 'json', value: { key: 'value' } }], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: '{"key":"value"}' }, + ]) + }) + + it('converts a multi-turn conversation', () => { + const result = convertToResponsesApiInput([ + { role: 'user', content: 'Hello' }, + { role: 'assistant', content: 'Hi there!' }, + { role: 'user', content: 'How are you?' }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Hello' }, + { type: 'message', role: 'assistant', content: 'Hi there!' }, + { type: 'message', role: 'user', content: 'How are you?' }, + ]) + }) + + describe('image handling', () => { + it('converts user message with URL image to content array', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'What is in this image?' }, + { + type: 'image', + image: 'https://example.com/photo.png', + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_text', text: 'What is in this image?' }, + { type: 'input_image', image_url: 'https://example.com/photo.png' }, + ], + }, + ]) + }) + + it('converts base64 image to data: URI', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'Describe this' }, + { + type: 'image', + image: 'iVBORw0KGgoAAAANSUhEUg', + mediaType: 'image/png', + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_text', text: 'Describe this' }, + { type: 'input_image', image_url: 'data:image/png;base64,iVBORw0KGgoAAAANSUhEUg' }, + ], + }, + ]) + }) + + it('uses default media type for base64 when not specified', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { + type: 'image', + image: 'base64data', + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_image', image_url: 'data:image/png;base64,base64data' }, + ], + }, + ]) + }) + + it('passes through data: URIs as-is', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { + type: 'image', + image: 'data:image/jpeg;base64,/9j/4AAQ', + mediaType: 'image/jpeg', + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_image', image_url: 'data:image/jpeg;base64,/9j/4AAQ' }, + ], + }, + ]) + }) + + it('handles http:// image URLs', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { + type: 'image', + image: 'http://example.com/image.jpg', + }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_image', image_url: 'http://example.com/image.jpg' }, + ], + }, + ]) + }) + + it('handles multiple images with text', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'Compare these images' }, + { type: 'image', image: 'https://example.com/a.png' }, + { type: 'image', image: 'https://example.com/b.png' }, + ], + }, + ]) + expect(result).toEqual([ + { + type: 'message', + role: 'user', + content: [ + { type: 'input_text', text: 'Compare these images' }, + { type: 'input_image', image_url: 'https://example.com/a.png' }, + { type: 'input_image', image_url: 'https://example.com/b.png' }, + ], + }, + ]) + }) + + it('skips images with missing image field', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'Hello' }, + { type: 'image' }, + ], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Hello' }, + ]) + }) + + it('skips images with empty string image field', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'Hello' }, + { type: 'image', image: '' }, + ], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Hello' }, + ]) + }) + + it('uses plain string content when no valid images are present', () => { + const result = convertToResponsesApiInput([ + { + role: 'user', + content: [ + { type: 'text', text: 'Just text' }, + { type: 'image' }, + ], + }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Just text' }, + ]) + }) + }) + + it('handles a full tool-use round trip', () => { + const result = convertToResponsesApiInput([ + { role: 'user', content: 'Read the file' }, + { + role: 'assistant', + content: [ + { + type: 'tool-call', + toolCallId: 'call-abc', + toolName: 'read_file', + input: { path: 'index.ts' }, + }, + ], + }, + { + role: 'tool', + toolCallId: 'call-abc', + content: 'console.log("hello")', + }, + { role: 'assistant', content: 'The file contains a log statement.' }, + ]) + expect(result).toEqual([ + { type: 'message', role: 'user', content: 'Read the file' }, + { + type: 'function_call', + id: 'call-abc', + name: 'read_file', + arguments: '{"path":"index.ts"}', + }, + { + type: 'function_call_output', + call_id: 'call-abc', + output: 'console.log("hello")', + }, + { + type: 'message', + role: 'assistant', + content: 'The file contains a log statement.', + }, + ]) + }) +}) + +describe('countTokensViaOpenAI', () => { + const mockLogger = { + info: () => {}, + error: () => {}, + warn: () => {}, + debug: () => {}, + } as any + + function createMockFetch(inputTokens: number) { + return (async () => + new Response(JSON.stringify({ object: 'response.input_tokens', input_tokens: inputTokens }), { + status: 200, + headers: { 'Content-Type': 'application/json' }, + })) as unknown as typeof globalThis.fetch + } + + it('returns token count from OpenAI API', async () => { + const result = await countTokensViaOpenAI({ + messages: [{ role: 'user', content: 'Hello world' }], + system: undefined, + model: 'openai/gpt-5.3-codex', + fetch: createMockFetch(42), + logger: mockLogger, + }) + expect(result).toBe(42) + }) + + it('passes system prompt as instructions', async () => { + let capturedBody: any + const mockFetch = async (_url: string, init: RequestInit) => { + capturedBody = JSON.parse(init.body as string) + return new Response( + JSON.stringify({ object: 'response.input_tokens', input_tokens: 10 }), + { status: 200, headers: { 'Content-Type': 'application/json' } }, + ) + } + + await countTokensViaOpenAI({ + messages: [{ role: 'user', content: 'Hi' }], + system: 'You are a helpful assistant.', + model: 'openai/gpt-5.3', + fetch: mockFetch as any, + logger: mockLogger, + }) + + expect(capturedBody.instructions).toBe('You are a helpful assistant.') + expect(capturedBody.model).toBe('gpt-5.3') + }) + + it('strips openai/ prefix from model', async () => { + let capturedBody: any + const mockFetch = async (_url: string, init: RequestInit) => { + capturedBody = JSON.parse(init.body as string) + return new Response( + JSON.stringify({ object: 'response.input_tokens', input_tokens: 5 }), + { status: 200, headers: { 'Content-Type': 'application/json' } }, + ) + } + + await countTokensViaOpenAI({ + messages: [{ role: 'user', content: 'Test' }], + system: undefined, + model: 'openai/gpt-5.3-codex', + fetch: mockFetch as any, + logger: mockLogger, + }) + + expect(capturedBody.model).toBe('gpt-5.3-codex') + }) + + it('omits instructions when system is undefined', async () => { + let capturedBody: any + const mockFetch = async (_url: string, init: RequestInit) => { + capturedBody = JSON.parse(init.body as string) + return new Response( + JSON.stringify({ object: 'response.input_tokens', input_tokens: 5 }), + { status: 200, headers: { 'Content-Type': 'application/json' } }, + ) + } + + await countTokensViaOpenAI({ + messages: [{ role: 'user', content: 'Test' }], + system: undefined, + model: 'openai/gpt-5.3', + fetch: mockFetch as any, + logger: mockLogger, + }) + + expect(capturedBody.instructions).toBeUndefined() + }) + + it('throws on API error', async () => { + const mockFetch = async () => + new Response('Internal Server Error', { status: 500 }) + + await expect( + countTokensViaOpenAI({ + messages: [{ role: 'user', content: 'Test' }], + system: undefined, + model: 'openai/gpt-5.3-codex', + fetch: mockFetch as any, + logger: mockLogger, + }), + ).rejects.toThrow('OpenAI API error: 500') + }) +}) + describe('formatToolContent', () => { it('returns string content as-is', () => { expect(formatToolContent('simple string')).toBe('simple string') diff --git a/web/src/app/api/v1/token-count/_post.ts b/web/src/app/api/v1/token-count/_post.ts index 9e2ce09cb1..616164ee39 100644 --- a/web/src/app/api/v1/token-count/_post.ts +++ b/web/src/app/api/v1/token-count/_post.ts @@ -1,4 +1,5 @@ import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events' +import { isOpenAIProviderModel } from '@codebuff/common/constants/chatgpt-oauth' import { isClaudeModel, toAnthropicModelId, @@ -77,13 +78,16 @@ export async function postTokenCount(params: { const { messages, system, model } = bodyResult.data try { - const inputTokens = await countTokensViaAnthropic({ - messages, - system, - model, - fetch, - logger, - }) + const useOpenAI = model != null && isOpenAIProviderModel(model) + const inputTokens = useOpenAI + ? await countTokensViaOpenAI({ messages, system, model, fetch, logger }) + : await countTokensViaAnthropic({ + messages, + system, + model, + fetch, + logger, + }) logger.info({ userId, @@ -91,6 +95,7 @@ export async function postTokenCount(params: { hasSystem: !!system, model: model ?? DEFAULT_ANTHROPIC_MODEL, tokenCount: inputTokens, + provider: useOpenAI ? 'openai' : 'anthropic', }, `Token count: ${inputTokens}` ) @@ -99,7 +104,7 @@ export async function postTokenCount(params: { } catch (error) { logger.error( { error: getErrorObject(error), userId }, - 'Failed to count tokens via Anthropic API', + 'Failed to count tokens', ) return NextResponse.json( @@ -112,6 +117,171 @@ export async function postTokenCount(params: { // Buffer to add to token count for non-Anthropic models since tokenizers differ const NON_ANTHROPIC_TOKEN_BUFFER = 0.3 +export async function countTokensViaOpenAI(params: { + messages: TokenCountRequest['messages'] + system: string | undefined + model: string + fetch: typeof globalThis.fetch + logger: Logger +}): Promise { + const { messages, system, model, fetch, logger } = params + + const openaiModelId = model.startsWith('openai/') + ? model.slice('openai/'.length) + : model + + const input = convertToResponsesApiInput(messages) + + const response = await fetch( + 'https://api.openai.com/v1/responses/input_tokens', + { + method: 'POST', + headers: { + Authorization: `Bearer ${env.OPENAI_API_KEY}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: openaiModelId, + input, + ...(system && { instructions: system }), + }), + }, + ) + + if (!response.ok) { + const errorText = await response.text() + logger.error( + { status: response.status, errorText, model }, + 'OpenAI token count API error', + ) + throw new Error(`OpenAI API error: ${response.status} - ${errorText}`) + } + + const data = await response.json() + return data.input_tokens +} + +export type ResponsesApiContentPart = + | { type: 'input_text'; text: string } + | { type: 'input_image'; image_url: string } + +export type ResponsesApiInputItem = + | { type: 'message'; role: 'user' | 'assistant' | 'developer'; content: string | ResponsesApiContentPart[] } + | { type: 'function_call'; id: string; name: string; arguments: string } + | { type: 'function_call_output'; call_id: string; output: string } + +export function convertToResponsesApiInput( + messages: TokenCountRequest['messages'], +): ResponsesApiInputItem[] { + const input: ResponsesApiInputItem[] = [] + + for (const message of messages) { + if (message.role === 'system') { + const content = buildMessageContent(message.content) + if (content) { + input.push({ type: 'message', role: 'developer', content }) + } + continue + } + + if (message.role === 'tool') { + input.push({ + type: 'function_call_output', + call_id: message.toolCallId ?? 'unknown', + output: formatToolContent(message.content), + }) + continue + } + + if (message.role === 'user') { + const content = buildMessageContent(message.content) + if (content) { + input.push({ type: 'message', role: 'user', content }) + } + continue + } + + if (message.role === 'assistant') { + const content = buildMessageContent(message.content) + if (content) { + input.push({ type: 'message', role: 'assistant', content }) + } + if (Array.isArray(message.content)) { + for (const part of message.content) { + if (part.type === 'tool-call') { + input.push({ + type: 'function_call', + id: part.toolCallId ?? 'unknown', + name: part.toolName, + arguments: JSON.stringify(part.input ?? {}), + }) + } + } + } + } + } + + return input +} + +function buildMessageContent( + content: unknown, +): string | ResponsesApiContentPart[] | null { + if (typeof content === 'string') return content || null + if (!Array.isArray(content)) { + const text = JSON.stringify(content) + return text || null + } + + const hasImages = content.some( + (part) => part.type === 'image' && typeof part.image === 'string' && part.image, + ) + + if (!hasImages) { + const text = extractTextParts(content) + return text || null + } + + const parts: ResponsesApiContentPart[] = [] + for (const part of content) { + if (part.type === 'text' && typeof part.text === 'string' && part.text) { + parts.push({ type: 'input_text', text: part.text }) + } else if (part.type === 'json') { + const text = typeof part.value === 'string' ? part.value : JSON.stringify(part.value) + if (text) { + parts.push({ type: 'input_text', text }) + } + } else if (part.type === 'image') { + const imageUrl = toImageUrl(part.image, part.mediaType) + if (imageUrl) { + parts.push({ type: 'input_image', image_url: imageUrl }) + } + } + } + + return parts.length > 0 ? parts : null +} + +function toImageUrl(image: unknown, mediaType?: string): string | null { + if (typeof image !== 'string' || !image) return null + if (image.startsWith('http://') || image.startsWith('https://') || image.startsWith('data:')) { + return image + } + return `data:${mediaType ?? 'image/png'};base64,${image}` +} + +function extractTextParts(content: Array>): string { + const parts: string[] = [] + for (const part of content) { + if (part.type === 'text' && typeof part.text === 'string') { + parts.push(part.text) + } else if (part.type === 'json') { + parts.push(typeof part.value === 'string' ? part.value : JSON.stringify(part.value)) + } + } + return parts.join('\n') +} + async function countTokensViaAnthropic(params: { messages: TokenCountRequest['messages'] system: string | undefined From 84166f379d08e874be742523fa1f1448623e1048 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 3 Mar 2026 09:24:26 -0800 Subject: [PATCH 002/712] base-deep-evals --- agents/base2/base-deep-evals.ts | 8 ++++++ agents/base2/base-deep.ts | 48 +++++++++++++++++++-------------- evals/buffbench/main.ts | 2 +- 3 files changed, 37 insertions(+), 21 deletions(-) create mode 100644 agents/base2/base-deep-evals.ts diff --git a/agents/base2/base-deep-evals.ts b/agents/base2/base-deep-evals.ts new file mode 100644 index 0000000000..d51c4ed38e --- /dev/null +++ b/agents/base2/base-deep-evals.ts @@ -0,0 +1,8 @@ +import { createBaseDeep } from './base-deep' + +const definition = { + ...createBaseDeep({ noAskUser: true }), + id: 'base-deep-evals', + displayName: 'Buffy the Codex Evals Orchestrator', +} +export default definition diff --git a/agents/base2/base-deep.ts b/agents/base2/base-deep.ts index cb125813e6..ad9d1f4705 100644 --- a/agents/base2/base-deep.ts +++ b/agents/base2/base-deep.ts @@ -1,10 +1,13 @@ +import { buildArray } from '@codebuff/common/util/array' + import { publisher } from '../constants' import { PLACEHOLDER, type SecretAgentDefinition, } from '../types/secret-agent-definition' -const SYSTEM_PROMPT = `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI. +function buildDeepSystemPrompt(noAskUser: boolean): string { + return `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI. # Core Mandates @@ -14,8 +17,8 @@ const SYSTEM_PROMPT = `You are Buffy, a strategic assistant that orchestrates co - **Spawn mentioned agents:** If the user uses "@AgentName" in their message, you must spawn that agent. - **Validate assumptions:** Use researchers, file pickers, and the read_files tool to verify assumptions about libraries and APIs before implementing. - **Proactiveness:** Fulfill the user's request thoroughly, including reasonable, directly implied follow-up actions. -- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it. -- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's a an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to acheive the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question. +- **Confirm Ambiguity/Expansion:** Do not take significant actions beyond the clear scope of the request without confirming with the user. If asked *how* to do something, explain first, don't just do it.${noAskUser ? '' : ` +- **Ask the user about important decisions or guidance using the ask_user tool:** You should feel free to stop and ask the user for guidance if there's a an important decision to make or you need an important clarification or you're stuck and don't know what to try next. Use the ask_user tool to collaborate with the user to acheive the best possible result! Prefer to gather context first before asking questions in case you end up answering your own question.`} - **Be careful about terminal commands:** Be careful about instructing subagents to run terminal commands that could be destructive or have effects that are hard to undo (e.g. git push, git commit, running any scripts -- especially ones that could alter production environments (!), installing packages globally, etc). Don't run any of these effectful commands unless the user explicitly asks you to. - **Do what the user asks:** If the user asks you to do something, even running a risky terminal command, do it. @@ -96,8 +99,10 @@ The following is the state of the git repository at the start of the conversatio ${PLACEHOLDER.GIT_CHANGES_PROMPT} ` +} -const INSTRUCTIONS_PROMPT = `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly. +function buildDeepInstructionsPrompt(noAskUser: boolean): string { + return `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly. Follow this 7-phase workflow for implementation tasks. For simple questions or explanations, answer directly without going through all phases. @@ -138,7 +143,7 @@ Draft a spec first, then refine it with the user: - **Technical Approach**: How the implementation will work at a high level - **Files to Create/Modify**: List of files that will be touched - **Out of Scope**: Anything explicitly excluded - - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3. + - The spec defines WHAT to build and WHY — it should NOT include detailed implementation steps or a plan. That belongs in Phase 3.${noAskUser ? '' : ` 3. Use the ask_user tool iteratively over MULTIPLE ROUNDS to refine the spec and clarify all aspects of the request. Ask ~2-5 focused questions per round. Continue until you have clarity on: - The exact scope and boundaries of the task - Key requirements and acceptance criteria @@ -148,13 +153,13 @@ Draft a spec first, then refine it with the user: - Any constraints or preferences on implementation approach 4. Between rounds, update SPEC.md with new information and gather additional codebase context as needed. 5. **Do NOT ask obvious questions.** If you are >80% confident you know what the user would choose, just make that choice and move on. Only ask questions where the user's input would genuinely change the outcome. -6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?" -7. Iteratively critique the spec: +6. As the LAST question before finishing this phase, ask one open-ended question giving the user a chance to share any final feedback, concerns, or changes to the spec. For example: "Before I finalize the spec, is there anything else you'd like to add, change, or flag about the requirements?"`} +${noAskUser ? '3' : '7'}. Iteratively critique the spec: a. Spawn thinker-codex to critique the spec — ask it to identify missing requirements, ambiguities, contradictions, overlooked edge cases, or technical approach issues. b. If the thinker raises valid critiques, update SPEC.md to address them. c. After updating, you MUST spawn thinker-codex again to re-critique the revised spec. d. Repeat until the thinker finds no new substantive critiques. Do NOT skip the re-critique — every revision must be verified. -8. Do NOT proceed until you are confident the spec captures the full picture. +${noAskUser ? '4' : '8'}. Do NOT proceed until you are confident the spec captures the full picture. ## Phase 3 — Plan @@ -231,8 +236,8 @@ Capture learnings for future sessions: a. Spawn thinker-codex to critique your LESSONS.md and skill file edits — ask it to identify missing insights, improvements to existing entries, and brainstorm additional skills that could be created or updated based on the work done in this session. b. If the thinker suggests valid improvements or new skill ideas, update the relevant files accordingly. c. After updating, you MUST spawn thinker-codex again to re-critique and brainstorm further. - d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified. -4. Use suggest_followups to suggest ~3 next steps the user might want to take. + d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.${noAskUser ? '' : ` +4. Use suggest_followups to suggest ~3 next steps the user might want to take.`} Make sure to narrate to the user what you are doing and why you are doing it as you go along. Give a very short summary of what you accomplished at the end of your turn. @@ -240,10 +245,13 @@ Make sure to narrate to the user what you are doing and why you are doing it as If the full 7-phase workflow has already been completed in this conversation and the user is asking for a followup change (e.g. "also add X" or "tweak Y"), you do NOT need to repeat the entire workflow. Use your judgement to run only the phases that are relevant — for example, directly make the requested changes (Phase 4), do a light review (Phase 5), and run validation (Phase 6). Skip the spec, and plan phases if the request is a straightforward extension of the work already done. Still update LESSONS.md and skills if you learn anything new. ` +} -export function createBaseDeep(): SecretAgentDefinition { +export function createBaseDeep(options?: { + noAskUser?: boolean +}): Omit { + const { noAskUser = false } = options ?? {} return { - id: 'base-deep', publisher, model: 'openai/gpt-5.3-codex', displayName: 'Buffy the Codex Orchestrator', @@ -266,18 +274,18 @@ export function createBaseDeep(): SecretAgentDefinition { }, outputMode: 'last_message', includeMessageHistory: true, - toolNames: [ + toolNames: buildArray( 'spawn_agents', 'read_files', 'read_subtree', - 'suggest_followups', + !noAskUser && 'suggest_followups', 'apply_patch', 'write_file', 'write_todos', - 'ask_user', + !noAskUser && 'ask_user', 'skill', 'set_output', - ], + ), spawnableAgents: [ 'file-picker', 'code-searcher', @@ -291,13 +299,13 @@ export function createBaseDeep(): SecretAgentDefinition { 'gpt-5-agent', 'context-pruner', ], - systemPrompt: SYSTEM_PROMPT, - instructionsPrompt: INSTRUCTIONS_PROMPT, + systemPrompt: buildDeepSystemPrompt(noAskUser), + instructionsPrompt: buildDeepInstructionsPrompt(noAskUser), stepPrompt: `Workflow phases reminder (7 phases): **Planning todos** (write at start): Phase 1 → Phase 2 → Phase 3 1. Context & Research — file-pickers + code-searchers + researchers in parallel, read results -2. Spec — draft SPEC.md, iterative ask_user to refine (skip obvious Qs), open-ended final Q, thinker-codex critique loop +2. Spec — draft SPEC.md, ${noAskUser ? '' : 'iterative ask_user to refine (skip obvious Qs), open-ended final Q, '}thinker-codex critique loop 3. Plan — write PLAN.md, thinker-codex critique loop **Implementation todos** (write after Plan): one todo per plan step + phases 5-7 @@ -326,5 +334,5 @@ export function createBaseDeep(): SecretAgentDefinition { } } -const definition = createBaseDeep() +const definition = { ...createBaseDeep(), id: 'base-deep' } export default definition diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index 7f22cd2c10..5c23fb980b 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -8,7 +8,7 @@ async function main() { // Use 'external:codex' for OpenAI Codex CLI await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], - agents: ['base-deep'], + agents: ['base-deep-evals'], taskConcurrency: 5, }) From 82ab4ea718d623309cc57c6146014678111766de Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 3 Mar 2026 09:28:10 -0800 Subject: [PATCH 003/712] Add no learning param --- agents/base2/base-deep-evals.ts | 2 +- agents/base2/base-deep.ts | 40 +++++++++++++++++---------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/agents/base2/base-deep-evals.ts b/agents/base2/base-deep-evals.ts index d51c4ed38e..ce458d71ec 100644 --- a/agents/base2/base-deep-evals.ts +++ b/agents/base2/base-deep-evals.ts @@ -1,7 +1,7 @@ import { createBaseDeep } from './base-deep' const definition = { - ...createBaseDeep({ noAskUser: true }), + ...createBaseDeep({ noAskUser: true, noLearning: true }), id: 'base-deep-evals', displayName: 'Buffy the Codex Evals Orchestrator', } diff --git a/agents/base2/base-deep.ts b/agents/base2/base-deep.ts index ad9d1f4705..9b3d7e1484 100644 --- a/agents/base2/base-deep.ts +++ b/agents/base2/base-deep.ts @@ -6,7 +6,7 @@ import { type SecretAgentDefinition, } from '../types/secret-agent-definition' -function buildDeepSystemPrompt(noAskUser: boolean): string { +function buildDeepSystemPrompt(noAskUser: boolean, noLearning: boolean): string { return `You are Buffy, a strategic assistant that orchestrates complex coding tasks through specialized sub-agents. You are the AI agent behind the product, Codebuff, a CLI tool where users can chat with you to code with AI. # Core Mandates @@ -70,9 +70,9 @@ For other questions, you can direct them to codebuff.com, or especially codebuff [ Phase 5 — Review Loop: You spawn code-reviewer-codex, fix any issues found, and re-run the reviewer until no new issues are found ] -[ Phase 6 — Validate: You run unit tests, add new tests, fix failures, and attempt E2E verification by running the application ] +[ Phase 6 — Validate: You run unit tests, add new tests, fix failures, and attempt E2E verification by running the application ]${noLearning ? '' : ` -[ Phase 7 — Lessons: You write LESSONS.md in the session directory and update/create skill files with key learnings ] +[ Phase 7 — Lessons: You write LESSONS.md in the session directory and update/create skill files with key learnings ]`} @@ -101,10 +101,11 @@ ${PLACEHOLDER.GIT_CHANGES_PROMPT} ` } -function buildDeepInstructionsPrompt(noAskUser: boolean): string { +function buildDeepInstructionsPrompt(noAskUser: boolean, noLearning: boolean): string { + const totalPhases = noLearning ? 6 : 7 return `Act as a helpful assistant and freely respond to the user's request however would be most helpful to the user. Use your judgement to orchestrate the completion of the user's request using your specialized sub-agents and tools as needed. Take your time and be comprehensive. Don't surprise the user. For example, don't modify files if the user has not asked you to do so at least implicitly. -Follow this 7-phase workflow for implementation tasks. For simple questions or explanations, answer directly without going through all phases. +Follow this ${totalPhases}-phase workflow for implementation tasks. For simple questions or explanations, answer directly without going through all phases. ## Two-Phase Todo Tracking @@ -119,8 +120,8 @@ These help the user understand what's about to happen before any code is written **Implementation todos** — Write these AFTER Phase 3 (Plan) is complete, replacing the planning todos: - One todo per implementation step from the finalized PLAN.md - Phase 5: Review loop -- Phase 6: Validate changes -- Phase 7: Capture lessons & update skills +- Phase 6: Validate changes${noLearning ? '' : ` +- Phase 7: Capture lessons & update skills`} Update these as you complete each step during implementation. ## Phase 1 — Codebase Context & Research @@ -174,7 +175,7 @@ Create a detailed implementation plan, iteratively critique it, and save it alon b. If the thinker raises valid critiques, update PLAN.md to address them. c. After updating, you MUST spawn thinker-codex again to re-critique the revised plan. d. Repeat until the thinker finds no new substantive critiques. Do NOT skip the re-critique — every revision must be verified. -3. Write implementation todos (the second phase of todos) — one todo per plan step, plus todos for phases 5-7. +3. Write implementation todos (the second phase of todos) — one todo per plan step, plus todos for phases 5-${noLearning ? '6' : '7'}. ## Phase 4 — Implement @@ -205,7 +206,7 @@ Thoroughly validate the changes: - For a CLI tool: run it with relevant arguments - For a library: write and run a small integration script - For config/infra changes: validate the configuration is correct -4. If E2E verification reveals issues, fix them and re-validate. +4. If E2E verification reveals issues, fix them and re-validate.${noLearning ? '' : ` ## Phase 7 — Lessons @@ -236,21 +237,22 @@ Capture learnings for future sessions: a. Spawn thinker-codex to critique your LESSONS.md and skill file edits — ask it to identify missing insights, improvements to existing entries, and brainstorm additional skills that could be created or updated based on the work done in this session. b. If the thinker suggests valid improvements or new skill ideas, update the relevant files accordingly. c. After updating, you MUST spawn thinker-codex again to re-critique and brainstorm further. - d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.${noAskUser ? '' : ` -4. Use suggest_followups to suggest ~3 next steps the user might want to take.`} + d. Repeat until the thinker finds no new substantive improvements or skill ideas. Do NOT skip the re-critique — every revision must be verified.`}${noAskUser ? '' : ` +${noLearning ? '1' : '4'}. Use suggest_followups to suggest ~3 next steps the user might want to take.`} Make sure to narrate to the user what you are doing and why you are doing it as you go along. Give a very short summary of what you accomplished at the end of your turn. ## Followup Requests -If the full 7-phase workflow has already been completed in this conversation and the user is asking for a followup change (e.g. "also add X" or "tweak Y"), you do NOT need to repeat the entire workflow. Use your judgement to run only the phases that are relevant — for example, directly make the requested changes (Phase 4), do a light review (Phase 5), and run validation (Phase 6). Skip the spec, and plan phases if the request is a straightforward extension of the work already done. Still update LESSONS.md and skills if you learn anything new. +If the full ${totalPhases}-phase workflow has already been completed in this conversation and the user is asking for a followup change (e.g. "also add X" or "tweak Y"), you do NOT need to repeat the entire workflow. Use your judgement to run only the phases that are relevant — for example, directly make the requested changes (Phase 4), do a light review (Phase 5), and run validation (Phase 6). Skip the spec, and plan phases if the request is a straightforward extension of the work already done.${noLearning ? '' : ' Still update LESSONS.md and skills if you learn anything new.'} ` } export function createBaseDeep(options?: { noAskUser?: boolean + noLearning?: boolean }): Omit { - const { noAskUser = false } = options ?? {} + const { noAskUser = false, noLearning = false } = options ?? {} return { publisher, model: 'openai/gpt-5.3-codex', @@ -299,20 +301,20 @@ export function createBaseDeep(options?: { 'gpt-5-agent', 'context-pruner', ], - systemPrompt: buildDeepSystemPrompt(noAskUser), - instructionsPrompt: buildDeepInstructionsPrompt(noAskUser), - stepPrompt: `Workflow phases reminder (7 phases): + systemPrompt: buildDeepSystemPrompt(noAskUser, noLearning), + instructionsPrompt: buildDeepInstructionsPrompt(noAskUser, noLearning), + stepPrompt: `Workflow phases reminder (${noLearning ? 6 : 7} phases): **Planning todos** (write at start): Phase 1 → Phase 2 → Phase 3 1. Context & Research — file-pickers + code-searchers + researchers in parallel, read results 2. Spec — draft SPEC.md, ${noAskUser ? '' : 'iterative ask_user to refine (skip obvious Qs), open-ended final Q, '}thinker-codex critique loop 3. Plan — write PLAN.md, thinker-codex critique loop -**Implementation todos** (write after Plan): one todo per plan step + phases 5-7 +**Implementation todos** (write after Plan): one todo per plan step + phases 5-${noLearning ? '6' : '7'} 4. Implement — fully build the spec using file editing tools 5. Review Loop — code-reviewer-codex → fix → re-review until clean -6. Validate — run tests + typechecks, add new tests, do E2E verification -7. Lessons — write LESSONS.md, update/create skills, iterative thinker-codex brainstorm loop`, +6. Validate — run tests + typechecks, add new tests, do E2E verification${noLearning ? '' : ` +7. Lessons — write LESSONS.md, update/create skills, iterative thinker-codex brainstorm loop`}`, handleSteps: function* ({ params }) { while (true) { // Run context-pruner before each step. From ef06634a03df209fb9fe570e4e28001354ce8b12 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 3 Mar 2026 09:29:10 -0800 Subject: [PATCH 004/712] turn off openai token count for now --- web/src/app/api/v1/token-count/_post.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/web/src/app/api/v1/token-count/_post.ts b/web/src/app/api/v1/token-count/_post.ts index 616164ee39..ceb3d71e4a 100644 --- a/web/src/app/api/v1/token-count/_post.ts +++ b/web/src/app/api/v1/token-count/_post.ts @@ -1,5 +1,4 @@ import { AnalyticsEvent } from '@codebuff/common/constants/analytics-events' -import { isOpenAIProviderModel } from '@codebuff/common/constants/chatgpt-oauth' import { isClaudeModel, toAnthropicModelId, @@ -78,7 +77,7 @@ export async function postTokenCount(params: { const { messages, system, model } = bodyResult.data try { - const useOpenAI = model != null && isOpenAIProviderModel(model) + const useOpenAI = model != null && false // isOpenAIProviderModel(model) const inputTokens = useOpenAI ? await countTokensViaOpenAI({ messages, system, model, fetch, logger }) : await countTokensViaAnthropic({ From 9e9f788948b65c562c0ec76a12a1167c40145dcb Mon Sep 17 00:00:00 2001 From: layla <111667698+04cb@users.noreply.github.com> Date: Wed, 4 Mar 2026 02:51:41 +0800 Subject: [PATCH 005/712] Fix docs: align markdown table in knowledge-files.mdx (#449) --- web/src/content/tips/knowledge-files.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/web/src/content/tips/knowledge-files.mdx b/web/src/content/tips/knowledge-files.mdx index 5d20178d26..64df4be714 100644 --- a/web/src/content/tips/knowledge-files.mdx +++ b/web/src/content/tips/knowledge-files.mdx @@ -107,12 +107,12 @@ Then add your global preferences: ### When to Use Home Directory vs Project Knowledge Files -| Home Directory (`~/.knowledge.md`) | Project (`knowledge.md`) | -|-----------------------------------|------------------------------------| -| Personal coding preferences | Project-specific conventions | -| Preferred frameworks/tools | Architecture decisions | -| Communication style | Build and deploy commands | -| Global defaults | Team coding standards | +| Home Directory (`~/.knowledge.md`) | Project (`knowledge.md`) | +|-----------------------------------|-----------------------------| +| Personal coding preferences | Project-specific conventions | +| Preferred frameworks/tools | Architecture decisions | +| Communication style | Build and deploy commands | +| Global defaults | Team coding standards | Both files are loaded—project knowledge files add to (and can override) your home directory preferences. From 5d8d3cd8a4b236e67d9f861f0dea9200987538f8 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 3 Mar 2026 11:04:37 -0800 Subject: [PATCH 006/712] .md files from run --- .../03-03-09:09-add-console-log/LESSONS.md | 15 +++++++++++ .../03-03-09:09-add-console-log/PLAN.md | 16 ++++++++++++ .../03-03-09:09-add-console-log/SPEC.md | 25 +++++++++++++++++++ .agents/skills/meta/SKILL.md | 10 ++++++++ 4 files changed, 66 insertions(+) create mode 100644 .agents/sessions/03-03-09:09-add-console-log/LESSONS.md create mode 100644 .agents/sessions/03-03-09:09-add-console-log/PLAN.md create mode 100644 .agents/sessions/03-03-09:09-add-console-log/SPEC.md create mode 100644 .agents/skills/meta/SKILL.md diff --git a/.agents/sessions/03-03-09:09-add-console-log/LESSONS.md b/.agents/sessions/03-03-09:09-add-console-log/LESSONS.md new file mode 100644 index 0000000000..271cfead5b --- /dev/null +++ b/.agents/sessions/03-03-09:09-add-console-log/LESSONS.md @@ -0,0 +1,15 @@ +# LESSONS + +## What went well +- `git diff -- cli/src/index.tsx` immediately after editing made it easy to enforce exact scope for a one-line change. +- Validating with `bun run cli/src/index.tsx --help` gave a quick, non-effectful end-to-end check that startup output works. + +## What was tricky +- Bun script invocation shape from repo root was easy to misremember: `bun --cwd cli run typecheck` failed, while `bun run --cwd cli typecheck` succeeded. + +## Useful patterns +- Entrypoint logs placed at the top of `main()` apply to all command paths that enter `main()`; verify with a non-interactive path first. +- For tiny requests, combine: (1) minimal code edit, (2) scoped diff check, (3) one runtime smoke check, (4) one typecheck. + +## Future efficiency notes +- Put exact validation commands directly in `PLAN.md` to avoid command-syntax backtracking during validation. diff --git a/.agents/sessions/03-03-09:09-add-console-log/PLAN.md b/.agents/sessions/03-03-09:09-add-console-log/PLAN.md new file mode 100644 index 0000000000..5b27b95678 --- /dev/null +++ b/.agents/sessions/03-03-09:09-add-console-log/PLAN.md @@ -0,0 +1,16 @@ +# PLAN + +## Implementation Steps +1. Update `cli/src/index.tsx` by adding `console.log('Codebuff CLI starting')` as the first statement in `main()`. +2. Inspect the diff to confirm scope: exactly one new `console.log` line in `cli/src/index.tsx` and no unintended edits. +3. Run lightweight validation for CLI startup behavior: + - Run a non-interactive path (`--help`) and confirm the line appears once. + - Confirm the log sits before command branching in `main()` so it applies to all `main()` paths. + +## Dependencies / Ordering +- Step 1 must happen before Step 2 and Step 3. +- Step 2 should complete before Step 3 to ensure we validate the intended change only. + +## Risk Areas +- Low risk overall. +- Minor UX risk: the new stdout line appears for all command paths entering `main()` (including `--help`, `login`, and `publish`). This is intentional per spec. diff --git a/.agents/sessions/03-03-09:09-add-console-log/SPEC.md b/.agents/sessions/03-03-09:09-add-console-log/SPEC.md new file mode 100644 index 0000000000..69d397f76c --- /dev/null +++ b/.agents/sessions/03-03-09:09-add-console-log/SPEC.md @@ -0,0 +1,25 @@ +# SPEC + +## Overview +Add a single startup `console.log` to the CLI entrypoint so there is explicit stdout output when the CLI boots. + +## Requirements +1. Modify `cli/src/index.tsx` only for functional code changes. +2. Add exactly one `console.log(...)` statement. +3. Place the log at the start of `main()`. +4. Use a static message string (no timestamp or dynamic args). Chosen message: `Codebuff CLI starting`. +5. The log should print for any execution path that enters `main()` (including normal startup and command modes like `login`/`publish`). +6. Keep all existing behavior unchanged aside from the added stdout line. + +## Technical Approach +Insert one `console.log('Codebuff CLI starting')` call as the first statement inside `main()` so it prints once per process run before the rest of startup flow proceeds. + +## Files to Create/Modify +- `cli/src/index.tsx` (modify) +- `.agents/sessions/03-03-09:09-add-console-log/SPEC.md` (this spec) + +## Out of Scope +- Replacing existing logger usage with `console.log` +- Adding additional logs +- Refactoring startup flow or command handling +- Any server/web/API changes diff --git a/.agents/skills/meta/SKILL.md b/.agents/skills/meta/SKILL.md new file mode 100644 index 0000000000..7dd06229d2 --- /dev/null +++ b/.agents/skills/meta/SKILL.md @@ -0,0 +1,10 @@ +--- +name: meta +description: Broad project-level implementation and validation heuristics +--- + +# Meta + +- When validating CLI changes, run a non-effectful command path first (for example `--help`) before any command that could trigger external side effects. (from .agents/sessions/03-03-09:09-add-console-log) +- For tightly scoped edits, pair runtime smoke-checks with `git diff -- ` to verify no unintended spillover. (from .agents/sessions/03-03-09:09-add-console-log) +- From monorepo root, run workspace scripts as `bun run --cwd