diff --git a/.agents/claude-code-cli.ts b/.agents/claude-code-cli.ts index 2de48ff5c5..075d9f23e4 100644 --- a/.agents/claude-code-cli.ts +++ b/.agents/claude-code-cli.ts @@ -10,7 +10,7 @@ const baseDefinition = createCliAgent({ startCommand: 'claude --dangerously-skip-permissions', permissionNote: 'Always use `--dangerously-skip-permissions` when testing to avoid permission prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.6', + model: 'anthropic/claude-opus-4.7', }) // Constants must be inside handleSteps since it gets serialized via .toString() diff --git a/.agents/codebuff-local-cli.ts b/.agents/codebuff-local-cli.ts index 978a2236a5..8cb367a08a 100644 --- a/.agents/codebuff-local-cli.ts +++ b/.agents/codebuff-local-cli.ts @@ -10,8 +10,18 @@ const baseDefinition = createCliAgent({ startCommand: 'bun --cwd=cli run dev', permissionNote: 'No permission flags needed for Codebuff local dev server.', - model: 'anthropic/claude-opus-4.6', + model: 'anthropic/claude-opus-4.7', skipPrepPhase: true, + cliSpecificDocs: `## Codebuff CLI Specific Guidance + +- The ready state is the Codebuff banner, working directory, and bordered input box with the agent selector. +- For smoke tests, \`/help\` is useful because it validates the overlay, shortcuts, features, and credits copy in one step. +- For implementation-oriented tests, prefer asking the CLI to inspect or reason about a specific file rather than making edits unless the parent prompt explicitly asks for edits. +- Long Codebuff responses live in a scrollable viewport. If the bottom of the answer already shows the core recommendation, do not spend many extra steps trying to reconstruct every hidden line. +- Avoid key combinations like Shift+Arrow or repeated history/navigation probing unless you have a clear reason; they can open overlays or mutate the input state unexpectedly. 
+- A good implementation-test flow is usually: initial ready capture → task sent/in-progress capture → response-complete capture → optional follow-up-ready or follow-up-complete capture. +- If you need a follow-up, keep it narrow and specific rather than re-asking the whole task. +- If the current session becomes clearly unusable, report that failure; do not silently start a replacement session and continue as though nothing happened.`, spawnerPromptExtras: `**Purpose:** E2E visual testing of the Codebuff CLI itself. This agent starts a local dev Codebuff CLI instance and interacts with it to verify UI behavior. **When to use:** @@ -97,7 +107,7 @@ const definition: AgentDefinition = { input: { role: 'user', content: 'A ' + CLI_NAME + ' tmux session has been started: `' + sessionName + '`\n\n' + - 'Use this session for all CLI interactions. The session name must be included in your final output.\n\n' + + 'Use this session for all CLI interactions. Treat it as the canonical session for this run. If it fails, report that explicitly instead of silently starting another session. 
The session name must be included in your final output.\n\n' + 'Proceed with the task using the helper scripts:\n' + '- Send commands: `./scripts/tmux/tmux-cli.sh send "' + sessionName + '" "..."`\n' + '- Capture output: `./scripts/tmux/tmux-cli.sh capture "' + sessionName + '" --label "..."`\n' + diff --git a/.agents/codex-cli.ts b/.agents/codex-cli.ts index 9914e3d7c7..e7b18473a8 100644 --- a/.agents/codex-cli.ts +++ b/.agents/codex-cli.ts @@ -81,7 +81,7 @@ const baseDefinition = createCliAgent({ startCommand: 'codex -a never -s danger-full-access', permissionNote: 'Always use `-a never -s danger-full-access` when testing to avoid approval prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.6', + model: 'anthropic/claude-opus-4.7', extraInputParams: { reviewType: { type: 'string', diff --git a/.agents/gemini-cli.ts b/.agents/gemini-cli.ts index 38186add48..d5eb7f45e2 100644 --- a/.agents/gemini-cli.ts +++ b/.agents/gemini-cli.ts @@ -10,7 +10,7 @@ const baseDefinition = createCliAgent({ startCommand: 'gemini --yolo', permissionNote: 'Always use `--yolo` (or `--approval-mode yolo`) when testing to auto-approve all tool actions and avoid prompts that would block automated tests.', - model: 'anthropic/claude-opus-4.6', + model: 'anthropic/claude-opus-4.7', cliSpecificDocs: `## Gemini CLI Commands Gemini CLI uses slash commands for navigation: diff --git a/.agents/lib/cli-agent-prompts.ts b/.agents/lib/cli-agent-prompts.ts index 3bccb168dc..ff206345dc 100644 --- a/.agents/lib/cli-agent-prompts.ts +++ b/.agents/lib/cli-agent-prompts.ts @@ -111,6 +111,16 @@ export function getSystemPrompt(config: CliAgentConfig): string { **Important:** ${config.permissionNote} ${cliSpecificSection} +## Operating Heuristics + +- Treat the provided tmux session as the single source of truth. Do not start a second session unless the current one has clearly failed and you are explicitly recovering from that failure. 
+- Prefer fewer, higher-value captures over many overlapping captures. +- A capture is worth taking when the UI meaningfully changes: startup ready state, help overlay open, task in progress, task complete, clean follow-up-ready state, or an error state. +- Avoid exploratory key presses that can mutate the UI state unless they are necessary for the task. +- If the CLI already shows enough evidence in the current viewport, do not keep scrolling or recapturing just to get a more perfect screenshot. +- If a long response is partially off-screen, prefer summarizing from the visible evidence instead of repeatedly trying viewport-recovery tricks unless the missing content is essential. +- Do not use \`read_files\` on tmux capture artifacts from inside the CLI tester run; rely on the terminal capture output you already obtained and let the parent agent inspect saved capture files later if needed. + ## Helper Scripts Use these scripts in \`scripts/tmux/\` to interact with the CLI session: @@ -238,6 +248,8 @@ Use ${config.cliName} to complete implementation tasks like building features, f ./scripts/tmux/tmux-cli.sh capture "$SESSION" --label "work-continued" --wait 30 \`\`\` + Prefer at most 1-2 progress captures before deciding whether you already have enough evidence. + 4. 
**Send follow-up prompts** if needed to refine or continue the work: \`\`\`bash ./scripts/tmux/tmux-cli.sh send "$SESSION" "" @@ -258,7 +270,7 @@ Use ${config.cliName} to complete implementation tasks like building features, f ### Tips - Break complex tasks into smaller prompts -- Capture frequently to track progress +- Prefer high-value captures tied to meaningful UI changes rather than frequent overlapping captures - Use descriptive labels for captures - Check intermediate results before moving on` } diff --git a/.agents/notion-agent.ts b/.agents/notion-agent.ts index 8bdfefc56c..37bfb88e9f 100644 --- a/.agents/notion-agent.ts +++ b/.agents/notion-agent.ts @@ -3,7 +3,7 @@ import type { AgentDefinition } from './types/agent-definition' const definition: AgentDefinition = { id: 'notion-query-agent', displayName: 'Notion Query Agent', - model: 'x-ai/grok-4-fast', + model: 'google/gemini-3.1-flash-lite-preview', spawnerPrompt: 'Expert at querying Notion databases and pages to find information and answer questions about content stored in Notion workspaces.', diff --git a/.agents/notion-researcher.ts b/.agents/notion-researcher.ts index 590c87a6c4..341e7d30b3 100644 --- a/.agents/notion-researcher.ts +++ b/.agents/notion-researcher.ts @@ -6,7 +6,7 @@ const definition: AgentDefinition = { id: 'notion-researcher', publisher, displayName: 'Notion Researcher', - model: 'x-ai/grok-4-fast', + model: 'google/gemini-3.1-flash-lite-preview', spawnerPrompt: 'Expert at conducting comprehensive research across Notion workspaces by spawning multiple notion agents in parallel waves to gather information from different angles and sources.', diff --git a/.agents/package.json b/.agents/package.json index e6dd6fc4e7..053d1e6c66 100644 --- a/.agents/package.json +++ b/.agents/package.json @@ -5,7 +5,6 @@ "type": "module", "scripts": { "typecheck": "bun x tsc --noEmit -p tsconfig.json", - "test": "bun test __tests__", - "test:e2e": "bun test e2e" + "test": "bun test __tests__" } } diff 
--git a/.agents/sessions/03-02-1407-chatgpt-oauth-direct/LESSONS.md b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/LESSONS.md new file mode 100644 index 0000000000..0dbb6fd5b9 --- /dev/null +++ b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/LESSONS.md @@ -0,0 +1,42 @@ +# LESSONS — ChatGPT OAuth Direct Routing + +Session: `.agents/sessions/03-02-1407-chatgpt-oauth-direct/` + +## What went well +- Building this feature behind a strict feature flag (`CHATGPT_OAUTH_ENABLED=false`) reduced rollout risk while allowing full end-to-end wiring. +- Reusing the Claude OAuth architectural pattern (credentials helpers, refresh mutex, routing split) accelerated implementation without coupling the two providers. +- Splitting policy logic into `classifyChatGptOAuthStreamError` made fallback/auth/fail-fast behavior easier to test and reason about. +- Adding focused CLI tests for `/connect:chatgpt` gating and utility sanitization caught regression risk early. + +## Current confidence / known gaps +- Runtime ChatGPT stream policy is **partially tested**: `classifyChatGptOAuthStreamError` is covered, but we do not yet have full behavioral tests for `promptAiSdkStream` recursion branches (actual fallback recursion and post-partial-output behavior). +- CLI routing coverage is strongest for **feature-flag OFF** paths; flag-ON auth-code routing should get explicit dedicated tests in a future pass. + +## What was tricky +- The repo had unrelated local drift during implementation; explicit scope cleanup (`git checkout -- path/to/file`) was necessary to avoid accidental cross-feature commits. +- CLI module mocking is path-sensitive. Test modules under `cli/src/commands/__tests__` must mock sibling modules with correct relative paths (e.g. `../../state/chat-store`), or mocks silently fail. +- Over-mocking analytics can break transitive imports (`setAnalyticsErrorLogger` export expectations). A safe pattern is spreading real analytics exports and overriding only `trackEvent`. 
+ +## Unexpected behaviors / gotchas +- A staged unrelated file can survive despite working-tree revert; both staged and worktree states must be checked before final handoff. +- “Looks correct” tests can still miss runtime branches if they only validate helper classification, not route wiring; reviewer loops were useful to force coverage on practical paths. +- For OAuth tooling/scripts, sanitize error text aggressively. Returning status-only errors avoids accidental token payload leakage. + +## Useful patterns discovered +- Keep direct-provider routing stream-only initially; explicitly forcing non-streaming/structured calls to backend avoided broad compatibility risk. +- Use deterministic model allowlist + normalization mapping in constants to avoid relying on provider-side parsing/errors for unsupported models. +- Treat temporary protocol validation scripts as first-class validation artifacts: they are valuable for real-account smoke checks without coupling to full CLI runtime. + +## Temporary script disposition +- `scripts/chatgpt-oauth-validate.ts` is currently kept as a **dev utility** for manual protocol revalidation while the feature remains experimental/off by default. +- Removal criteria: if protocol endpoints are either officially documented or the CLI flow gets stable automated integration coverage, this script can be retired. + +## Repeatable security verification +- For redaction checks, run targeted searches against changed code/log handling paths for sensitive markers before handoff, e.g. `access_token`, `refresh_token`, and `Authorization: Bearer`. +- Keep surfaced token exchange errors status-only and avoid echoing raw provider response bodies. + +## Follow-up improvements worth considering +- Add deeper runtime-behavior tests for `promptAiSdkStream` recursive fallback branches (not just policy classifier). +- Add explicit CLI test for flag-ON connect flow path once flag toggling is test-harness friendly. 
+- If feature graduates from experimental, add richer direct-path observability while preserving strict token redaction. +- Add periodic protocol drift checks (authorize/token/callback PKCE assumptions) before enabling the feature flag in production defaults. diff --git a/.agents/sessions/03-02-1407-chatgpt-oauth-direct/PLAN.md b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/PLAN.md new file mode 100644 index 0000000000..9684c95329 --- /dev/null +++ b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/PLAN.md @@ -0,0 +1,104 @@ +# PLAN — ChatGPT Subscription OAuth Direct Routing + +## Implementation Steps +1. **Add shared ChatGPT OAuth constants** + - Create `common/src/constants/chatgpt-oauth.ts` with: + - feature flag (`CHATGPT_OAUTH_ENABLED=false`) + - endpoints/client id/redirect URI/env var + - model allowlist + normalization helpers + - Export through `common/src/constants/index.ts`. + +2. **Build core OAuth utility + temporary protocol validation script (early gate)** + - Create `cli/src/utils/chatgpt-oauth.ts` with PKCE URL generation, browser-open helper, pasted code/URL parsing, token exchange helper. + - Create `scripts/chatgpt-oauth-validate.ts` to test OAuth URL generation + paste parsing + token exchange interaction. + - **Run this script before full integration** as go/no-go checkpoint for endpoint assumptions. + +3. **Add SDK env + credential support** + - Extend `sdk/src/env.ts` with `getChatGptOAuthTokenFromEnv()`. + - Extend `sdk/src/credentials.ts` with `chatgptOAuth` schema and helpers: + - get/save/clear + - valid-check + refresh mutex + - get-valid-with-refresh + - Preserve all non-target credentials in read/write operations. + +4. **Add CLI connect flow UI and command routing** + - Create `cli/src/components/chatgpt-connect-banner.tsx` with state machine + `handleChatGptAuthCode`. + - Update input modes (`connect:chatgpt`) and banner registry. + - Add `/connect:chatgpt` command + alias handling and slash command entry (feature-gated). 
+ - Extend router to process pasted auth code in `connect:chatgpt` mode. + - Verify command visibility: hidden when flag OFF, present when flag ON. + +5. **Implement direct routing primitives in model-provider (decomposed)** + - 5.1 Add ChatGPT direct eligibility checks (feature flag + creds + model scope + skip flag + rate-limit cache state). + - 5.2 Add model normalization + prevalidation helpers (OpenRouter-style -> provider-native). + - 5.3 Add strict payload sanitization helper for direct requests. + - 5.4 Add ChatGPT OAuth direct model construction using OpenAI-compatible transport. + - 5.5 Add ChatGPT rate-limit cache helpers (parallel to Claude cache pattern). + - Keep Claude OAuth path unchanged. + +6. **Update stream execution + fallback/error policy** + - Extend `sdk/src/impl/llm.ts` to: + - recognize ChatGPT direct route usage + - emit ChatGPT OAuth analytics + - fallback only on rate-limit errors + - fail with reconnect guidance on auth errors + - fail fast for all other direct errors + - skip cost accounting for successful ChatGPT direct requests + - avoid fallback once output has already streamed + +7. **Wire startup refresh + CLI status surfacing** + - Update `cli/src/init/init-app.ts` for background ChatGPT OAuth credential refresh when enabled. + - Update `cli/src/chat.tsx`, `cli/src/components/bottom-status-line.tsx`, and `cli/src/components/usage-banner.tsx` to surface ChatGPT connection/active status. + +8. **Add analytics constants + SDK exports** + - Extend `common/src/constants/analytics-events.ts` with ChatGPT OAuth request/rate-limit/auth-error events. + - Ensure SDK exports newly needed helper(s) in `sdk/src/index.ts`. + +9. 
**Add/adjust tests (explicit matrix)** + - SDK credentials tests: + - env precedence + - persisted read/write/clear + - refresh success/failure + mutex + - Model-provider tests: + - rate-limit cache lifecycle + - allowlist prevalidation + unsupported-model error + - normalization behavior for mapped/unknown variants + - LLM routing/fallback tests (targeted): + - 429 fallback + - 401/403 no-fallback + reconnect path + - timeout/5xx fail-fast + - no fallback after content emitted + - CLI tests/wiring checks: + - command/mode visibility by feature flag + - connect mode routing and handler call. + - Non-streaming/structured guard check: + - confirm backend-only behavior unchanged. + +10. **Validation and cleanup decision for temporary script** + - Run targeted tests/typechecks for touched packages. + - Run OAuth validation script in manual mode (with your account interaction if needed). + - Decide and apply final disposition of temporary script: + - keep as dev utility, or + - remove before finalization. + +11. **Security/redaction verification** + - Validate no token values are logged in direct feature code paths. + - Grep/check for accidental logging of authorization headers, token payload fields, or raw callback query params. + +## Dependencies / Ordering +- Step 1 must be first. +- Step 2 must run before deep integration (early protocol validation gate). +- Step 3 precedes Steps 5–7. +- Step 4 can run in parallel with Step 3 after constants/util setup. +- Step 5 must precede Step 6. +- Step 8 can be implemented alongside Steps 5–6 but must complete before final validation. +- Step 9 follows core implementation completion. +- Steps 10–11 are final validation/cleanup/security passes. + +## Risk Areas +1. **Unofficial OAuth contract drift** — endpoint/field incompatibility can break token exchange. +2. **Direct payload compatibility** — strict sanitization must retain required OpenAI fields. +3. 
**Error classification correctness** — misclassification can violate requested fallback policy. +4. **Model normalization accuracy** — wrong mapping yields avoidable provider failures. +5. **Token redaction** — avoid leakage in logs, errors, or analytics payloads. +6. **Streaming boundary behavior** — fallback must not happen after partial output is emitted. diff --git a/.agents/sessions/03-02-1407-chatgpt-oauth-direct/SPEC.md b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/SPEC.md new file mode 100644 index 0000000000..d56a415caf --- /dev/null +++ b/.agents/sessions/03-02-1407-chatgpt-oauth-direct/SPEC.md @@ -0,0 +1,155 @@ +# SPEC — ChatGPT Subscription OAuth Direct Routing + +## Overview +Implement an **experimental, default-disabled** ChatGPT subscription OAuth feature that allows the local CLI to route eligible OpenAI-model **streaming** requests directly to OpenAI instead of Codebuff backend routing, mirroring the prior Claude OAuth architecture pattern. + +## Protocol Assumptions (Explicit) +Because this is unofficial/experimental, this implementation proceeds under the following explicit assumptions: + +1. OAuth authorize endpoint: `https://auth.openai.com/oauth/authorize` +2. OAuth token endpoint: `https://auth.openai.com/oauth/token` +3. Public client id is a configurable constant, defaulting to a Codex-compatible value from ecosystem references. +4. PKCE (`S256`) is required. +5. Redirect URI is pinned to: `http://localhost:1455/auth/callback` +6. User can paste either: + - raw authorization code, or + - full callback URL containing code/state query params. +7. Token response includes at least `access_token`, optional `refresh_token`, and expiry info (`expires_in` or equivalent). +8. Refresh uses standard `grant_type=refresh_token`. + +If any assumption fails at runtime, the feature fails with explicit guidance and falls back to the backend only where the fallback policy allows. + +## Requirements +1. 
Add ChatGPT OAuth feature set, default disabled behind `CHATGPT_OAUTH_ENABLED = false`. +2. Add a new CLI command and mode: `/connect:chatgpt` with dedicated banner flow. +3. Implement browser-based PKCE code-paste flow (no device-code flow in this iteration). +4. Keep user-facing warning minimal (per user preference), while leaving code comments clearly marking experimental nature. +5. Store ChatGPT OAuth credentials in local credentials JSON alongside existing credentials. +6. Support env-var token override (power-user/automation use), but env var **must not bypass feature flag**. +7. Add refresh-token support with concurrency guard (mutex) for persisted credentials. +8. Direct routing scope is **streaming only** (`promptAiSdkStream` path); non-streaming and structured stay backend-routed. +9. Add model allowlist for direct routing; include optimistic aliases: + - `openai/gpt-5.3` + - `openai/gpt-5.3-codex` + - `openai/gpt-5.2` + - `openai/gpt-5.2-codex` + - plus selected nearby GPT/Codex IDs already present in repo config. +10. Provide deterministic model normalization for direct requests (OpenRouter-style -> provider-native): + - Example: `openai/gpt-5.3-codex` -> `gpt-5.3-codex` + - Mapping table lives in constants and is used for prevalidation. +11. Unsupported model handling must be deterministic and prevalidated: + - if model is not in allowlist/mapping for direct route, fail with explicit unsupported-model error (no fallback). +12. Fallback policy: + - Rate-limit/overload classification: auto-fallback to Codebuff backend. + - Auth errors (401/403): fail explicitly with reconnect guidance (no fallback). + - All other direct errors: fail fast (no fallback), per user decision. +13. Successful direct ChatGPT OAuth requests do **not** consume Codebuff credits. +14. Add lightweight ChatGPT connection status surfacing in CLI (usage banner and/or bottom status line), without quota API dependency. +15. Preserve existing Claude OAuth behavior unchanged. +16. 
Add temporary OAuth validation script that tests auth URL generation + token exchange manually before/alongside full wiring. +17. Add/update tests for credential parsing/storage/refresh, model gating, routing/fallback classification, and CLI command/mode wiring. +18. Never log OAuth tokens in analytics or error logs. + +## Direct Request Transformation Rules +Before sending direct streaming requests to OpenAI, enforce strict sanitization: + +1. Rewrite `model` from `openai/*` format to provider-native mapped id. +2. Remove provider-specific/non-OpenAI fields (e.g., codebuff metadata/provider routing payloads). +3. Preserve fields known to be valid for OpenAI-compatible chat completions. +4. Do not inject Codex-specific required prefix by default in v1 (user preference), but structure code so optional future injection is easy. + +## Error Classification Table +| Class | Detection | Behavior | +|---|---|---| +| Rate limit | HTTP 429 or message/body contains rate-limit indicators | Fallback to backend (if no output emitted yet) | +| Auth | HTTP 401/403 or auth-token-invalid indicators | Fail with reconnect guidance; no fallback | +| Unsupported model | Local allowlist/mapping precheck failure | Fail explicit unsupported-model error; no fallback | +| Other | Network timeout, 5xx, malformed payload, unknown 4xx | Fail fast; no fallback | + +## Routing Scope +1. Direct routing applies only to `promptAiSdkStream` eligible requests. +2. `promptAiSdk` and `promptAiSdkStructured` remain backend-only for this iteration. +3. Backend routing remains unchanged for all non-eligible models and when feature disabled/disconnected. + +## Credentials & Precedence Rules +1. Credentials file schema extends with `chatgptOAuth` object. +2. Precedence: env token override > persisted OAuth credentials > none. +3. Env token produces synthetic non-refreshing credentials object. +4. Persisted credentials refresh when expired/near-expiry (5-minute buffer). +5. 
On refresh failure for persisted credentials, clear only `chatgptOAuth` entry (preserve other credentials). + +## Feature Gating Matrix +1. `CHATGPT_OAUTH_ENABLED = false` + - hide `/connect:chatgpt` command and banner UX + - disable direct routing even if env token exists +2. `CHATGPT_OAUTH_ENABLED = true` and credentials available + - enable command/UI + - enable direct routing for eligible models + +## Logging/Redaction Requirements +1. Never log raw access tokens, refresh tokens, authorization headers, or token response payloads. +2. If callback URL is logged for debugging, redact query values for `code`, `access_token`, `refresh_token`, and similar sensitive keys. +3. Analytics properties must not include token-bearing strings. + +## Technical Approach +1. Create `common/src/constants/chatgpt-oauth.ts`: + - feature flag, endpoints, client id, redirect URI, env var name, model allowlist/mapping helpers. +2. Export new constants via `common/src/constants/index.ts` so legacy `old-constants` re-export path includes them. +3. Extend `sdk/src/env.ts` with ChatGPT OAuth env-token helper. +4. Extend `sdk/src/credentials.ts` with ChatGPT OAuth schema+helpers mirroring Claude pattern. +5. Create `cli/src/utils/chatgpt-oauth.ts` for PKCE start/open/exchange/disconnect/status. +6. Create `cli/src/components/chatgpt-connect-banner.tsx` and auth-code handler. +7. Wire CLI command/input mode/slash menu/router/banner registry for `connect:chatgpt`. +8. Extend model provider (`sdk/src/impl/model-provider.ts`): + - add ChatGPT direct route decision path for `openai/*` allowlisted models + - add rate-limit cache helpers for ChatGPT path + - build direct OpenAI-compatible language model with OAuth bearer auth + - enforce strict body sanitization + model normalization in the direct path. +9. Extend stream error handling (`sdk/src/impl/llm.ts`) for ChatGPT direct path with required fallback/fail rules and analytics. +10. 
Extend app init (`cli/src/init/init-app.ts`) for background ChatGPT credential refresh when enabled. +11. Add analytics events for ChatGPT OAuth request/rate-limit/auth-error. +12. Update usage/status UI text to include ChatGPT connection state. +13. Add temporary validation script (e.g., `scripts/chatgpt-oauth-validate.ts`) to exercise OAuth setup interactively. + +## Acceptance Criteria +1. With feature disabled, `/connect:chatgpt` is unavailable and no direct routing occurs. +2. With feature enabled, user can run `/connect:chatgpt`, complete browser flow, paste code/URL, and connect. +3. Eligible streaming requests on allowlisted `openai/*` models use direct OAuth path. +4. Direct request payloads are sanitized and model ids normalized before transmission. +5. Rate-limited direct requests fallback to backend automatically. +6. Auth failures produce reconnect guidance and do not fallback. +7. Unsupported models fail immediately with explicit unsupported-model message. +8. Successful direct requests skip Codebuff credit accounting path. +9. Existing Claude OAuth flow remains behaviorally unchanged. +10. New/updated tests pass for touched behavior. +11. Temporary validation script can run and guide manual OAuth exchange checks. 
+ +## Files to Create/Modify +- Create: `common/src/constants/chatgpt-oauth.ts` +- Create: `cli/src/utils/chatgpt-oauth.ts` +- Create: `cli/src/components/chatgpt-connect-banner.tsx` +- Create: `scripts/chatgpt-oauth-validate.ts` (temporary validation utility) +- Modify: `common/src/constants/index.ts` +- Modify: `common/src/constants/analytics-events.ts` +- Modify: `sdk/src/env.ts` +- Modify: `sdk/src/credentials.ts` +- Modify: `sdk/src/impl/model-provider.ts` +- Modify: `sdk/src/impl/llm.ts` +- Modify: `sdk/src/index.ts` +- Modify: `cli/src/utils/input-modes.ts` +- Modify: `cli/src/components/input-mode-banner.tsx` +- Modify: `cli/src/data/slash-commands.ts` +- Modify: `cli/src/commands/command-registry.ts` +- Modify: `cli/src/commands/router.ts` +- Modify: `cli/src/chat.tsx` +- Modify: `cli/src/components/usage-banner.tsx` +- Modify: `cli/src/components/bottom-status-line.tsx` +- Modify: `cli/src/init/init-app.ts` +- Modify tests in SDK/CLI for new behavior. + +## Out of Scope +1. Device-code auth flow. +2. Legal/policy guarantees around undocumented endpoints. +3. Full quota/usage API integration for ChatGPT subscription plans. +4. Local callback server daemon beyond paste-based flow. +5. Enabling feature by default. diff --git a/.agents/sessions/03-03-0909-add-console-log/LESSONS.md b/.agents/sessions/03-03-0909-add-console-log/LESSONS.md new file mode 100644 index 0000000000..271cfead5b --- /dev/null +++ b/.agents/sessions/03-03-0909-add-console-log/LESSONS.md @@ -0,0 +1,15 @@ +# LESSONS + +## What went well +- `git diff -- cli/src/index.tsx` immediately after editing made it easy to enforce exact scope for a one-line change. +- Validating with `bun run cli/src/index.tsx --help` gave a quick, non-effectful end-to-end check that startup output works. + +## What was tricky +- Bun script invocation shape from repo root was easy to misremember: `bun --cwd cli run typecheck` failed, while `bun run --cwd cli typecheck` succeeded. 
+ +## Useful patterns +- Entrypoint logs placed at the top of `main()` apply to all command paths that enter `main()`; verify with a non-interactive path first. +- For tiny requests, combine: (1) minimal code edit, (2) scoped diff check, (3) one runtime smoke check, (4) one typecheck. + +## Future efficiency notes +- Put exact validation commands directly in `PLAN.md` to avoid command-syntax backtracking during validation. diff --git a/.agents/sessions/03-03-0909-add-console-log/PLAN.md b/.agents/sessions/03-03-0909-add-console-log/PLAN.md new file mode 100644 index 0000000000..5b27b95678 --- /dev/null +++ b/.agents/sessions/03-03-0909-add-console-log/PLAN.md @@ -0,0 +1,16 @@ +# PLAN + +## Implementation Steps +1. Update `cli/src/index.tsx` by adding `console.log('Codebuff CLI starting')` as the first statement in `main()`. +2. Inspect the diff to confirm scope: exactly one new `console.log` line in `cli/src/index.tsx` and no unintended edits. +3. Run lightweight validation for CLI startup behavior: + - Run a non-interactive path (`--help`) and confirm the line appears once. + - Confirm the log sits before command branching in `main()` so it applies to all `main()` paths. + +## Dependencies / Ordering +- Step 1 must happen before Step 2 and Step 3. +- Step 2 should complete before Step 3 to ensure we validate the intended change only. + +## Risk Areas +- Low risk overall. +- Minor UX risk: the new stdout line appears for all command paths entering `main()` (including `--help`, `login`, and `publish`). This is intentional per spec. diff --git a/.agents/sessions/03-03-0909-add-console-log/SPEC.md b/.agents/sessions/03-03-0909-add-console-log/SPEC.md new file mode 100644 index 0000000000..4b69f71768 --- /dev/null +++ b/.agents/sessions/03-03-0909-add-console-log/SPEC.md @@ -0,0 +1,25 @@ +# SPEC + +## Overview +Add a single startup `console.log` to the CLI entrypoint so there is explicit stdout output when the CLI boots. + +## Requirements +1. 
Modify `cli/src/index.tsx` only for functional code changes. +2. Add exactly one `console.log(...)` statement. +3. Place the log at the start of `main()`. +4. Use a static message string (no timestamp or dynamic args). Chosen message: `Codebuff CLI starting`. +5. The log should print for any execution path that enters `main()` (including normal startup and command modes like `login`/`publish`). +6. Keep all existing behavior unchanged aside from the added stdout line. + +## Technical Approach +Insert one `console.log('Codebuff CLI starting')` call as the first statement inside `main()` so it prints once per process run before the rest of startup flow proceeds. + +## Files to Create/Modify +- `cli/src/index.tsx` (modify) +- `.agents/sessions/03-03-0909-add-console-log/SPEC.md` (this spec) + +## Out of Scope +- Replacing existing logger usage with `console.log` +- Adding additional logs +- Refactoring startup flow or command handling +- Any server/web/API changes diff --git a/.agents/sessions/03-06-0850-cli-tester-efficiency/LESSONS.md b/.agents/sessions/03-06-0850-cli-tester-efficiency/LESSONS.md new file mode 100644 index 0000000000..b2eacf94dd --- /dev/null +++ b/.agents/sessions/03-06-0850-cli-tester-efficiency/LESSONS.md @@ -0,0 +1,73 @@ +# Lessons: CLI tester efficiency and CLI knowledge improvements + +## What went well + +- The SDK-driven harness made it straightforward to collect full event streams, stream chunks, structured outputs, and tmux capture paths for repeated `codebuff-local-cli` runs. +- The baseline runs clearly exposed behavior patterns instead of relying on intuition. +- The Codebuff CLI itself was capable and informative during implementation-oriented runs; most inefficiency came from the tester agent’s workflow rather than the CLI under test. 
+ +## What was tricky + +- The `codebuff-local-cli` agent uses only `run_terminal_command`, `add_message`, and `set_output`, so all tester intelligence has to come from prompt/instruction quality rather than richer tooling. +- Long Codebuff CLI responses live in a scrollable viewport. The tester spent many extra steps trying to recover hidden content even when the visible portion already contained enough evidence. +- One smoke run silently started a second tmux session mid-run, showing that the current guidance was too weak about preserving session continuity and treating failure recovery explicitly. +- Reading tmux capture artifacts from inside the tester run is ineffective because the agent does not have `read_files`; attempts to recover more evidence should therefore be avoided unless the current viewport is truly insufficient. + +## Quantified before/after findings + +### Smoke scenario + +- Baseline smoke runs: `27` and `38` total events, with one run silently starting a replacement tmux session mid-run. +- Post-change smoke run: `27` total events, `10` tool calls, `3` captures, no replacement session, and clearer capture labels (`initial-state`, `after-help`, `after-2plus2`). + +### Implementation scenario + +- Baseline implementation runs: + - tool calls: `19` and `21` + - captures: `8` and `7` + - total cost: `30` and `40` + - strong evidence of wasted viewport-recovery actions (page up/down, history keys, extra captures, direct tmux scrollback commands) +- Post-change implementation run: + - tool calls: `10` + - captures: `4` + - total cost: `14` + - no viewport-recovery thrashing; the tester captured the ready state, in-progress state, response, and follow-up response and then stopped. + +## Baseline findings + +- Smoke runs were mostly efficient, but their capture labels were generic and the agent did not explicitly reason about why each capture was worth taking. 
+- One smoke run restarted the session instead of treating the original session as canonical, inflating event/tool counts. +- Implementation runs showed the biggest inefficiency: excessive viewport recovery actions (page up/down, arrow keys, extra captures, direct tmux scrollback commands) after the key recommendation was already visible. +- The tester lacked Codebuff-specific guidance about: + - what the ready state looks like, + - when `/help` is especially valuable, + - how to structure a good implementation-oriented test, + - and when to stop chasing perfect captures of long responses. + +## What changed behavior most + +- Adding a canonical-session instruction prevented silent session replacement behavior and made failure handling expectations explicit. +- Adding the shared “high-value capture” heuristic reduced redundant captures and discouraged overlapping progress snapshots. +- Adding explicit guidance to stop chasing hidden viewport text eliminated the biggest source of waste in implementation-oriented runs. +- Adding Codebuff-specific flow guidance improved follow-up quality and reduced exploratory key usage. + +## Changes made from baseline evidence + +- Added shared operating heuristics to bias CLI testers toward fewer, higher-value captures and away from unnecessary UI mutation. +- Added explicit guidance to avoid `read_files` on tmux artifacts from inside the tester run. +- Added Codebuff-specific testing guidance covering ready state, smoke-test flow, implementation-test flow, long-response behavior, and session continuity expectations. +- Added best-effort harness cleanup when a run throws after a tmux session has already been created. + +## Cautionary note + +- Different runs may disagree about whether adjacent edge cases are worth fixing. 
For example, one post-change implementation run argued that the original-case `isEnvFile` call path was acceptable because `.env` files are conventionally lowercase, while earlier baseline runs framed nearby case handling as security-sensitive. Future work should settle those questions with source-of-truth tests or project policy, not by trusting a single run’s opinion. + +## Known limitation + +- The analysis harness now does best-effort tmux cleanup when a run throws after a session has already been created, but it still does not implement a hard per-run abort/timeout with guaranteed teardown if `client.run()` stalls indefinitely. Future iterations should add explicit run cancellation once the preferred timeout mechanism is settled. + +## What we intentionally did not change + +- We did not change the tmux helper scripts because the baseline problems were primarily agent-behavior issues, not script failures. +- We did not broaden the tester’s tool access; this pass focuses on making the current workflow smarter rather than increasing power. +- We did not change the shared output schema because the existing `set_output` contract was sufficient for analysis once the agent behavior improved. diff --git a/.agents/sessions/03-06-0850-cli-tester-efficiency/PLAN.md b/.agents/sessions/03-06-0850-cli-tester-efficiency/PLAN.md new file mode 100644 index 0000000000..13c4cb61e5 --- /dev/null +++ b/.agents/sessions/03-06-0850-cli-tester-efficiency/PLAN.md @@ -0,0 +1,57 @@ +# Plan: CLI tester efficiency and CLI knowledge improvements + +## Implementation Steps + +1. Build an SDK-driven analysis harness for the CLI tester runs. + - Add a reproducible script or test helper that runs `codebuff-local-cli` through the SDK with `handleEvent` and `handleStreamChunk` collection. + - Standardize artifact naming for comparison (for example `baseline-smoke-run1`, `baseline-implementation-run2`, `post-smoke-run1`). 
+ - Define and persist a consistent metrics schema per run, including event counts by type, tool-call counts, unique tool names, spawned-agent counts, capture counts, and notable wait/capture observations. + - Build in explicit failure-path handling for missing API key, auth failure, tmux startup failure, and hung runs, including cleanup where possible. + +2. Execute baseline mixed-scenario runs and document findings. + - Run the smoke scenario twice and the implementation scenario twice. + - Keep the comparison controlled by using the same prompts, logging granularity, and timeout policy across baseline runs. + - Inspect each run’s SDK trace and tmux session logs. + - Record concrete inefficiencies, wasted actions, and missing Codebuff-CLI knowledge to drive the prompt/template changes. + +3. Improve the shared CLI tester prompt layer. + - Update `.agents/lib/cli-agent-prompts.ts` so CLI testers have sharper workflow guidance. + - Add targeted guidance on when to gather prep context, when to capture, how to detect progress/completion, and how to avoid low-value repeated actions. + - Keep knowledge additions evidence-based and avoid prompt bloat. + +4. Improve shared CLI tester orchestration and the concrete `codebuff-local-cli` agent. + - Update `.agents/lib/create-cli-agent.ts` if shared orchestration behavior needs refinement. + - Update `.agents/codebuff-local-cli.ts` with Codebuff-CLI-specific knowledge and workflow refinements informed by baseline evidence. + - Ensure the agent remains focused on CLI UI testing and uses the tmux helper scripts efficiently. + - Keep output contract compatibility intact. + +5. Add or update validation coverage. + - Add tests for shared CLI-agent prompt/template behavior and/or the analysis harness. + - Include compatibility-oriented checks for the shared CLI-agent layer. 
+ - At minimum, verify the `.agents` layer still typechecks and that `claude-code-cli`, `codex-cli`, `gemini-cli`, and `codebuff-local-cli` still satisfy shared construction/schema expectations. + +6. Re-run post-change verification scenarios. + - Run at least one smoke and one implementation scenario after changes using the same prompts and comparison controls. + - Compare outputs/artifacts against the baseline. + - Treat the step as successful if the post-change runs show at least two improvement signals such as fewer duplicate captures, fewer redundant waits/follow-ups, clearer evidence in captures/output, or better scenario-specific verification behavior. + +7. Write session documentation and capture durable lessons. + - Record before/after findings in `LESSONS.md`. + - Document what was intentionally not changed and why. + - Update relevant skill files only with broadly reusable insights. + +## Dependencies / Ordering + +- Step 1 must happen before baseline analysis in Step 2. +- Step 2 should happen before Steps 3–4 so improvements are evidence-based. +- Step 3 should happen before or alongside Step 4 because shared prompt guidance informs the concrete agent behavior. +- Step 5 should follow implementation so tests validate the actual behavior. +- Step 6 depends on Steps 3–5 being complete. +- Step 7 should happen after validation so lessons reflect the final state. + +## Risk Areas + +- The requested `cli-ui-tester` name does not exist directly in the repo, so the harness must target the correct concrete agent (`codebuff-local-cli`) and shared template layer consistently. +- SDK-driven CLI runs may fail due to auth, tmux availability, or local CLI startup issues; the harness should make failures inspectable rather than opaque. +- Richer CLI knowledge can easily become prompt bloat, so additions must stay targeted to observed failures. +- Shared-layer changes can affect multiple CLI tester agents, so compatibility checks are important. 
diff --git a/.agents/sessions/03-06-0850-cli-tester-efficiency/SPEC.md b/.agents/sessions/03-06-0850-cli-tester-efficiency/SPEC.md new file mode 100644 index 0000000000..15c2f383c0 --- /dev/null +++ b/.agents/sessions/03-06-0850-cli-tester-efficiency/SPEC.md @@ -0,0 +1,76 @@ +# Spec: CLI tester efficiency and CLI knowledge improvements + +## Overview + +Evaluate the shared tmux-based CLI tester agent framework and the concrete `codebuff-local-cli` agent as the implementation of the requested CLI UI tester. Do this by running the tester through the Codebuff SDK multiple times with full event logging, inspecting the resulting SDK event traces and tmux session logs after each run, and then improving the agent(s) so they use fewer wasted steps, capture more useful evidence, and have stronger built-in knowledge of the Codebuff CLI under test. + +## Requirements + +1. Treat `codebuff-local-cli` plus the shared CLI-agent template/prompt layer as the concrete implementation of the requested CLI UI tester for this pass. +2. Run the relevant tester via the Codebuff SDK multiple times with per-event logging enabled. +3. Use a fixed mixed scenario set for analysis: + 1. a visual smoke-test flow for startup/help/basic prompt rendering, + 2. a realistic implementation-oriented flow. +4. Collect a minimum of: + 1. 2 baseline runs of the smoke scenario, + 2. 2 baseline runs of the implementation scenario, + 3. 1 post-change verification run for each scenario. +5. Persist analysis artifacts for each run, including: + 1. full SDK event stream, + 2. stream chunks where available, + 3. run summary metrics, + 4. tmux session capture paths / session logs. +6. Inspect logs after each run and compare baseline behavior across runs before making changes. +7. Identify inefficiencies in the current tester workflow, especially repeated or low-value captures, vague prompting, unnecessary setup, weak completion criteria, and poor completion detection. +8. 
For this task, treat the following as examples of “wasted actions” unless the logs justify them: + 1. duplicate captures with no meaningful UI state change, + 2. redundant waits that do not produce new evidence, + 3. follow-up prompts that restate the original task without adding precision, + 4. generic verification steps that are not well matched to the scenario, + 5. broad repo-reading instructions that do not improve the test outcome. +9. Identify missing Codebuff-CLI-specific knowledge that would help the tester drive the CLI more effectively, such as startup expectations, useful commands, verification behaviors, and signs that the CLI is done or needs follow-up. +10. Improve the shared CLI tester framework where doing so benefits multiple CLI testers. +11. Improve the `codebuff-local-cli` agent as the concrete primary target. +12. Preserve the tmux-session-based testing model and the existing structured `set_output` contract; any schema changes should be backward-compatible or additive only. +13. Keep changes focused on agent behavior, prompt quality, logging usefulness, and related validation/test coverage rather than unrelated CLI product changes. +14. Add richer CLI knowledge in a targeted way: new prompt or workflow guidance must be tied to observed baseline failures, confusion, or inefficiencies rather than generic prompt expansion. +15. Add or update validation coverage for the new behavior where practical. +16. Handle key failure modes cleanly in either the agent behavior or the analysis harness, including: + 1. missing API key / auth failure, + 2. tmux startup failure, + 3. CLI hang / no-progress situations, + 4. cleanup of temporary artifacts or tmux sessions where applicable. +17. Summarize findings, rationale, and before/after evidence in session documentation. + +## Acceptance Criteria + +1. There is a reproducible SDK-driven way to run and inspect the CLI tester with full event logging. +2. 
The session documentation includes concrete before/after findings from the mixed scenario runs rather than only anecdotal recommendations. +3. The shared prompt/template layer or concrete tester agent is updated to add materially better Codebuff-CLI-specific guidance. +4. The updated tester behavior reduces obvious wasted actions or improves evidence quality in a way that is visible in prompts, logs, outputs, or tests. +5. Validation demonstrates the changes did not break the CLI tester contract or nearby shared behavior, including at least one compatibility-oriented check on the shared CLI-agent layer. + +## Technical Approach + +- Use the SDK directly to run the relevant tester agent with `handleEvent` and `handleStreamChunk` collectors so every emitted event can be persisted and analyzed. +- Use the tester’s existing tmux scripts and session logs as the main source of truth for what the tested CLI actually displayed. +- Compare current shared instructions in `.agents/lib/cli-agent-prompts.ts` and agent-construction logic in `.agents/lib/create-cli-agent.ts` against the Codebuff-local tester’s concrete behavior in `.agents/codebuff-local-cli.ts` to find mismatches and missing guidance. +- Tighten prompts and workflow instructions so the tester gathers relevant repo/CLI context up front when appropriate, uses more targeted capture/verification behavior, and returns richer but backward-compatible structured output. +- Capture lightweight comparative metrics such as event counts by type, tool-call counts, spawned-agent counts, and notable capture usefulness observations. +- Add or update tests around the agent prompt/template layer and, if useful, add a reproducible SDK-driven analysis harness. 
+ +## Files to Create/Modify + +- `.agents/codebuff-local-cli.ts` +- `.agents/lib/create-cli-agent.ts` +- `.agents/lib/cli-agent-prompts.ts` +- `.agents/lib/cli-agent-schemas.ts` (only if additive schema changes are needed) +- Possible new SDK/e2e or helper script under `sdk/e2e/` or `scripts/` +- Session docs under `.agents/sessions/03-06-0850-cli-tester-efficiency/` + +## Out of Scope + +- Reworking the underlying tmux helper scripts unless logs show a concrete blocker there. +- Broad changes to the main Codebuff CLI product unrelated to tester quality. +- Replacing the tmux-based approach with a different testing framework. +- Optimizing non-CLI-testing agents unless directly affected by shared CLI tester changes. diff --git a/.agents/skills/meta/SKILL.md b/.agents/skills/meta/SKILL.md new file mode 100644 index 0000000000..8b05efdddf --- /dev/null +++ b/.agents/skills/meta/SKILL.md @@ -0,0 +1,18 @@ +--- +name: meta +description: Broad project-level implementation and validation heuristics +--- + +# Meta + +- When validating CLI changes, run a non-effectful command path first (for example `--help`) before any command that could trigger external side effects. (from .agents/sessions/03-03-0909-add-console-log) +- For tightly scoped edits, pair runtime smoke-checks with `git diff -- path/to/file` to verify no unintended spillover. (from .agents/sessions/03-03-0909-add-console-log) +- From monorepo root, run workspace scripts as `bun run --cwd