diff --git a/src/app/[locale]/tenants/[name]/page.tsx b/src/app/[locale]/tenants/[name]/page.tsx index b02692e..4c00a47 100644 --- a/src/app/[locale]/tenants/[name]/page.tsx +++ b/src/app/[locale]/tenants/[name]/page.tsx @@ -67,18 +67,12 @@ export default async function TenantDetailPage({ ); const channelUsers = tenant.spec.channelUsers || {}; - // Admins inspecting another tenant's usage: pass teamId AND keyAlias so - // the backend filters spend logs by this specific tenant's virtual key. - // Without keyAlias the response would include sibling tenants in the - // same org, since teams are now shared (Slice 2). - // Customers viewing their own: pass nothing — backend resolves both - // from the session-bound tenant. - const usageTeamId = user.isPlatform - ? tenant.status?.litellmTeamId || undefined - : undefined; - const usageKeyAlias = user.isPlatform - ? tenant.status?.litellmKeyAlias || undefined - : undefined; + // Bug 19 fix: every viewer (customer or admin) passes the tenant + // name to UsageDisplay. The /api/usage route resolves team+alias + // from the tenant CR's status and applies the visibility check, so + // no per-role branching is needed here. Previous version only + // passed identifiers for platform admins; customers got "the first + // visible tenant" by API fallback, mingling siblings. return (
@@ -150,7 +144,7 @@ export default async function TenantDetailPage({

{t("usage")}

- + {/* Packages */} diff --git a/src/app/api/usage/route.ts b/src/app/api/usage/route.ts index 40223a3..96cb65d 100644 --- a/src/app/api/usage/route.ts +++ b/src/app/api/usage/route.ts @@ -8,64 +8,109 @@ import { safeError } from "@/lib/errors"; /** * GET /api/usage * - * Customers: tenant resolved server-side from the user's orgId. The - * response is filtered by the tenant's `litellmKeyAlias` so - * sibling tenants in the same org don't bleed into the total. - * Platform admins: may pass ?teamId=... to inspect any team. They may - * also pass ?keyAlias=... to scope to a single tenant. + * Per-tenant spend/token usage for a given month. * - * Slice 2 note - * ------------ - * LiteLLM teams are now shared across all tenants of an org. The team's - * `/team/info` budget is the *company* budget; the per-tenant numbers - * come from filtering spend logs by `key_alias`. If a tenant has no - * `litellmKeyAlias` in status (transitional state right after upgrade, - * before the operator has reconciled), we fall back to team-level - * filtering — the numbers will be slightly inflated for that one - * reconcile cycle. + * Resolution rules (in priority order) + * ------------------------------------ + * 1. `?tenant=` query param — the canonical path. The route + * looks up the PiecedTenant CR by name, runs it through the + * viewer's visibility filter, and reads `status.litellmTeamId` + + * `status.litellmKeyAlias`. This is what the tenant-detail page + * calls with for both customers and admins. + * 2. `?teamId=` (+ optional `?keyAlias=`) — admin escape + * hatch for debugging across orgs (e.g. opening the platform + * panel without a specific tenant in mind). Platform-only; + * ignored for customer sessions. + * 3. No params — 400. We deliberately do NOT fall back to "the + * first visible tenant". 
Bug 19: that fallback meant siblings + * in the same org showed identical numbers because the API + * always picked the same "first" tenant regardless of which + * detail page the customer was viewing. Forcing callers to be + * explicit makes the bug structurally impossible to reintroduce. + * + * Filtering + * --------- + * LiteLLM's `/spend/logs/v2` accepts a server-side `key_alias` filter. + * We pass it through directly — no more "fetch all team pages and + * post-filter in JS" (which was O(team_total) memory per request and + * masked the routing bug above by being slow enough that nobody + * noticed which alias was actually being used). + * + * The team-level budget is still surfaced as the *org* budget, since + * teams are org-scoped post-Slice-2. That's intentional: the customer + * sees "your company has X budget remaining" alongside "this tenant + * cost Y this month". */ export async function GET(req: NextRequest) { const user = await getSessionUser(); if (!user) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + const tenantName = req.nextUrl.searchParams.get("tenant"); let teamId: string | null = null; let keyAlias: string | null = null; - if (user.isPlatform) { - teamId = req.nextUrl.searchParams.get("teamId") ?? null; - keyAlias = req.nextUrl.searchParams.get("keyAlias") ?? null; - } - - // For customers (or admins without explicit params): resolve from - // the user's *visible* tenants. With Slice 6, a `user`-role member - // can only see usage for tenants they're assigned to — a non-assigned - // user defaults to "no active tenant" (404). - // - // Owner and platform get the full org-scoped list and pick the first - // tenant, matching the dashboard's "current instance" semantics. - if (!teamId) { + if (tenantName) { + // Path 1: resolve from tenant name with visibility check. 
+ // + // listVisibleTenants enforces the same visibility rules as every + // other read endpoint: + // - platform admins see everything + // - owners see all tenants in their org + // - users see only the tenants they're assigned to (Slice 6) + // + // Filtering through that list rather than reading the CR directly + // means a malicious caller can't probe arbitrary tenant names to + // learn what exists in other orgs. const allTenants = await listTenants(); const visible = await listVisibleTenants(user, allTenants); - const orgTenant = visible.find((t) => !!t.status?.litellmTeamId); + const tenant = visible.find((t) => t.metadata.name === tenantName); - if (!orgTenant?.status?.litellmTeamId) { + if (!tenant) { return NextResponse.json( - { error: "No active tenant found for your organization" }, + { error: "Tenant not found or not accessible" }, { status: 404 } ); } - teamId = orgTenant.status.litellmTeamId; - - // If the operator has populated the per-tenant key alias, filter by it. - // Falling back to team-level (no alias) will return the org total, which - // is acceptable transitionally but means siblings' usage shows up here. - if (orgTenant.status.litellmKeyAlias) { - keyAlias = orgTenant.status.litellmKeyAlias; + if (!tenant.status?.litellmTeamId) { + // Tenant exists but the operator hasn't reconciled it yet. + // Common right after onboarding; the customer should see a + // friendly empty state, not a 500. + return NextResponse.json( + { error: "Tenant is still provisioning, no usage data yet" }, + { status: 409 } + ); } + teamId = tenant.status.litellmTeamId; + // litellmKeyAlias is set by the operator's LiteLLM reconcile step + // alongside litellmTeamId, so if teamId is present this should be + // too. Defensive fallback to team-level if missing — in that case + // the customer briefly sees company totals until the next operator + // reconcile, which is better than 500. + keyAlias = tenant.status.litellmKeyAlias ?? 
null; + } else if (user.isPlatform) { + // Path 2: admin escape hatch. + teamId = req.nextUrl.searchParams.get("teamId"); + keyAlias = req.nextUrl.searchParams.get("keyAlias"); + if (!teamId) { + return NextResponse.json( + { + error: + "Either ?tenant= or ?teamId= (admin) must be provided", + }, + { status: 400 } + ); + } + } else { + // Path 3: no resolution possible. See doc above for why we don't + // pick a default. + return NextResponse.json( + { error: "Tenant must be specified via ?tenant=" }, + { status: 400 } + ); } - // Month param: YYYY-MM, defaults to current month + // Month param: YYYY-MM, defaults to current month. const now = new Date(); const monthParam = req.nextUrl.searchParams.get("month") || @@ -81,11 +126,11 @@ export async function GET(req: NextRequest) { try { const teamInfo = await getTeamInfo(teamId); - // Fetch all pages from the team. We always query at the team level — - // LiteLLM's /spend/logs/v2 doesn't filter by key_alias reliably across - // versions, so we paginate and post-filter in code. For pilot scale - // this is cheap; if a single team ever exceeds ~10k entries/month we - // can revisit. + // Page through results — server-side filtered by key_alias when + // provided. Pagination still needed because LiteLLM caps + // page_size at 100, and a busy tenant can easily exceed that in + // a month. With server-side filtering this stays cheap regardless + // of how busy sibling tenants in the same team are. const allRequests: any[] = []; let page = 1; while (true) { @@ -94,33 +139,25 @@ export async function GET(req: NextRequest) { startStr, endStr, page, - 100 + 100, + keyAlias ); allRequests.push(...(result.data || [])); if (page >= (result.total_pages || 1)) break; page++; + // Defensive cap. A pathological response with bogus total_pages + // shouldn't be able to spin us forever. 50 pages × 100 = 5000 + // entries/month/tenant is well above any realistic usage at + // pilot scale. 
+ if (page > 50) break; } - // Apply key_alias post-filter when scoping to a single tenant. Match - // both `key_alias` (newer LiteLLM) and `metadata.user_api_key_alias` - // (older builds nest it inside metadata). - const scoped = keyAlias - ? allRequests.filter((r) => { - const alias = - r.key_alias ?? - r.metadata?.user_api_key_alias ?? - r.api_key_alias ?? - null; - return alias === keyAlias; - }) - : allRequests; - - // Aggregate by day + // Aggregate by day. const byDay: Record< string, { inputTokens: number; outputTokens: number; spend: number } > = {}; - for (const r of scoped) { + for (const r of allRequests) { const day = (r.startTime || r.endTime || "").slice(0, 10); if (!day) continue; if (!byDay[day]) @@ -134,30 +171,30 @@ export async function GET(req: NextRequest) { .sort(([a], [b]) => a.localeCompare(b)) .map(([date, d]) => ({ date, ...d })); - const totalInput = scoped.reduce( + const totalInput = allRequests.reduce( (s, r) => s + (r.prompt_tokens || 0), 0 ); - const totalOutput = scoped.reduce( + const totalOutput = allRequests.reduce( (s, r) => s + (r.completion_tokens || 0), 0 ); - const totalSpend = scoped.reduce((s, r) => s + (r.spend || 0), 0); + const totalSpend = allRequests.reduce((s, r) => s + (r.spend || 0), 0); return NextResponse.json({ teamId, - keyAlias, // null when not filtering — useful for the client to know it sees company-wide data + keyAlias, // null when not key-scoped (admin team-wide query, or tenant status lacks litellmKeyAlias) — totals are then team-wide month: monthParam, currentPeriod: { inputTokens: totalInput, outputTokens: totalOutput, totalSpend, - requestCount: scoped.length, + requestCount: allRequests.length, }, // Budget is always team-level (= company budget). Spend reported // here is the team total, not the per-key total — the customer - // wants to see "how much of our company budget is left", not just - // "how much has this one tenant cost". + // wants to see "how much of our company budget is left", not + // just "how much has this one tenant cost". 
budget: { maxBudget: teamInfo?.team_info?.max_budget ?? null, spend: teamInfo?.team_info?.spend ?? 0, diff --git a/src/components/dashboard/usage-display.tsx b/src/components/dashboard/usage-display.tsx index ae25567..02e54d1 100644 --- a/src/components/dashboard/usage-display.tsx +++ b/src/components/dashboard/usage-display.tsx @@ -94,17 +94,27 @@ function UsageChart({ data }: { data: DailyUsage[] }) { /** * Usage display widget. * - * - Customers: don't pass teamId or keyAlias — the backend resolves both - * from the session-bound tenant. - * - Admins inspecting a specific tenant: pass `teamId` (the org-level - * LiteLLM team id) AND `keyAlias` (the tenant's virtual-key alias). - * Without `keyAlias`, the response includes spend from sibling tenants - * in the same org, since teams are shared since Slice 2. + * Pass `tenant=` for the canonical path — works for both + * customers and admins, the API resolves team+alias from the tenant + * CR's status. The visibility check on the API ensures users can't + * query tenants they shouldn't see. + * + * `teamId`/`keyAlias` remain available as a platform-admin escape + * hatch for cross-org debugging, but the tenant-detail and dashboard + * paths should always use `tenant`. + * + * Bug 19 fix: previous version omitted both props for customer + * sessions, expecting the API to "figure it out". The API's fallback + * was "first visible tenant", which meant siblings in the same org + * showed identical numbers regardless of which detail page was open. + * Now the page passes the tenant name explicitly; no fallback exists. 
*/ export function UsageDisplay({ + tenant, teamId, keyAlias, }: { + tenant?: string | null; teamId?: string | null; keyAlias?: string | null; }) { @@ -121,11 +131,13 @@ export function UsageDisplay({ setError(null); const params = new URLSearchParams({ month }); - if (teamId) { + if (tenant) { + params.set("tenant", tenant); + } else if (teamId) { + // Admin escape hatch — only honoured by the API when the + // viewer is platform-role. params.set("teamId", teamId); - } - if (keyAlias) { - params.set("keyAlias", keyAlias); + if (keyAlias) params.set("keyAlias", keyAlias); } fetch(`/api/usage?${params}`) @@ -133,7 +145,7 @@ export function UsageDisplay({ .then(setData) .catch((e) => setError(e.message)) .finally(() => setLoading(false)); - }, [teamId, keyAlias, month]); + }, [tenant, teamId, keyAlias, month]); useEffect(() => { fetchUsage(); }, [fetchUsage]); diff --git a/src/lib/litellm.ts b/src/lib/litellm.ts index e4bb604..104f42d 100644 --- a/src/lib/litellm.ts +++ b/src/lib/litellm.ts @@ -32,12 +32,43 @@ export async function getTeamSpendLogs( return litellmFetch(`/global/spend/logs?${params}`); } +/** + * Fetch one page of spend logs for a team, optionally narrowed to a + * single virtual key by alias. + * + * Slice 2 / Bug 19 context + * ------------------------ + * Teams in LiteLLM are now org-scoped (one team per org), and each + * tenant in the org has its own virtual key with `key_alias = tenant + * CR name`. Without `keyAlias`, this returns the full team's spend — + * which mingles every tenant in the org. The portal's per-tenant + * usage view passes `keyAlias` to filter server-side via LiteLLM's + * native `key_alias` query param. Confirmed available on the + * `/spend/logs/v2` endpoint via OpenAPI introspection — no need to + * page-and-post-filter as the previous slice did. + * + * Why this matters + * ---------------- + * Previous implementation fetched all team pages, then post-filtered + * by alias in JS. 
Two problems: (1) at any reasonable scale this is + * O(team_total) memory per request even when only one tenant's data + * is needed; (2) more importantly, when called from the customer + * dashboard without an explicit alias, the route's "pick the first + * visible tenant" fallback meant both Acme tenants showed identical + * numbers — the alias used was always the first tenant in the + * visible list, regardless of which tenant page was being viewed. + * + * The route layer above is responsible for resolving the tenant + * identity correctly and passing the right alias here. This + * function's only job is to pass it through to LiteLLM. + */ export async function getTeamSpendLogsV2( teamId: string, startDate: string, endDate: string, page: number = 1, - pageSize: number = 100 + pageSize: number = 100, + keyAlias?: string | null ) { const params = new URLSearchParams({ team_id: teamId, @@ -46,6 +77,9 @@ export async function getTeamSpendLogsV2( page: String(page), page_size: String(pageSize), }); + if (keyAlias) { + params.set("key_alias", keyAlias); + } return litellmFetch(`/spend/logs/v2?${params}`); }