Group F - Fix spending per tenant

2026-05-01 13:34:56 +02:00
parent 2cf5b56441
commit f308c84325
4 changed files with 169 additions and 92 deletions
--- a/src/app/[locale]/tenants/[name]/page.tsx
+++ b/src/app/[locale]/tenants/[name]/page.tsx
@@ -67,18 +67,12 @@ export default async function TenantDetailPage({
  );
  const channelUsers = tenant.spec.channelUsers || {};
-  // Admins inspecting another tenant's usage: pass teamId AND keyAlias so
+  // Bug 19 fix: every viewer (customer or admin) passes the tenant
-  // the backend filters spend logs by this specific tenant's virtual key.
+  // name to UsageDisplay. The /api/usage route resolves team+alias
-  // Without keyAlias the response would include sibling tenants in the
+  // from the tenant CR's status and applies the visibility check, so
-  // same org, since teams are now shared (Slice 2).
+  // no per-role branching is needed here. Previous version only
-  // Customers viewing their own: pass nothing — backend resolves both
+  // passed identifiers for platform admins; customers got "the first
-  // from the session-bound tenant.
+  // visible tenant" by API fallback, mingling siblings.
  const usageTeamId = user.isPlatform
    ? tenant.status?.litellmTeamId || undefined
    : undefined;
  const usageKeyAlias = user.isPlatform
    ? tenant.status?.litellmKeyAlias || undefined
    : undefined;
  return (
    <div>
@@ -150,7 +144,7 @@ export default async function TenantDetailPage({
        <h2 className="text-xs font-semibold uppercase tracking-wider text-text-muted mb-3">
          {t("usage")}
        </h2>
-        <UsageDisplay teamId={usageTeamId} keyAlias={usageKeyAlias} />
+        <UsageDisplay tenant={name} />
      </section>
      {/* Packages */}
--- a/src/app/api/usage/route.ts
+++ b/src/app/api/usage/route.ts
@@ -8,64 +8,109 @@ import { safeError } from "@/lib/errors";
 /**
 * GET /api/usage
 *
- * Customers: tenant resolved server-side from the user's orgId. The
+ * Per-tenant spend/token usage for a given month.
 *            response is filtered by the tenant's `litellmKeyAlias` so
 *            sibling tenants in the same org don't bleed into the total.
 * Platform admins: may pass ?teamId=... to inspect any team. They may
 *                  also pass ?keyAlias=... to scope to a single tenant.
 *
- * Slice 2 note
+ * Resolution rules (in priority order)
- * ------------
+ * ------------------------------------
- * LiteLLM teams are now shared across all tenants of an org. The team's
+ *  1. `?tenant=<name>` query param — the canonical path. The route
- * `/team/info` budget is the *company* budget; the per-tenant numbers
+ *     looks up the PiecedTenant CR by name, runs it through the
- * come from filtering spend logs by `key_alias`. If a tenant has no
+ *     viewer's visibility filter, and reads `status.litellmTeamId` +
- * `litellmKeyAlias` in status (transitional state right after upgrade,
+ *     `status.litellmKeyAlias`. This is what the tenant-detail page
- * before the operator has reconciled), we fall back to team-level
+ *     uses for both customers and admins.
- * filtering — the numbers will be slightly inflated for that one
+ *  2. `?teamId=<id>` (+ optional `?keyAlias=<alias>`) — admin escape
- * reconcile cycle.
+ *     hatch for debugging across orgs (e.g. opening the platform
 *     panel without a specific tenant in mind). Platform-only;
 *     ignored for customer sessions.
 *  3. No params — 400. We deliberately do NOT fall back to "the
 *     first visible tenant". Bug 19: that fallback meant siblings
 *     in the same org showed identical numbers because the API
 *     always picked the same "first" tenant regardless of which
 *     detail page the customer was viewing. Forcing callers to be
 *     explicit makes the bug structurally impossible to reintroduce.
 *
 * Filtering
 * ---------
 * LiteLLM's `/spend/logs/v2` accepts a server-side `key_alias` filter.
 * We pass it through directly — no more "fetch all team pages and
 * post-filter in JS" (which was O(team_total) memory per request and
 * masked the routing bug above by being slow enough that nobody
 * noticed which alias was actually being used).
 *
 * The team-level budget is still surfaced as the *org* budget, since
 * teams are org-scoped post-Slice-2. That's intentional: the customer
 * sees "your company has X budget remaining" alongside "this tenant
 * cost Y this month".
 */
 export async function GET(req: NextRequest) {
  const user = await getSessionUser();
  if (!user)
    return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
  const tenantName = req.nextUrl.searchParams.get("tenant");
  let teamId: string | null = null;
  let keyAlias: string | null = null;
-  if (user.isPlatform) {
+  if (tenantName) {
-    teamId = req.nextUrl.searchParams.get("teamId") ?? null;
+    // Path 1: resolve from tenant name with visibility check.
-    keyAlias = req.nextUrl.searchParams.get("keyAlias") ?? null;
+    //
-  }
+    // listVisibleTenants enforces the same visibility rules as every
-
+    // other read endpoint:
-  // For customers (or admins without explicit params): resolve from
+    //   - platform admins see everything
-  // the user's *visible* tenants. With Slice 6, a `user`-role member
+    //   - owners see all tenants in their org
-  // can only see usage for tenants they're assigned to — a non-assigned
+    //   - users see only the tenants they're assigned to (Slice 6)
-  // user defaults to "no active tenant" (404).
+    //
-  //
+    // Filtering through that list rather than reading the CR directly
-  // Owner and platform get the full org-scoped list and pick the first
+    // means a malicious caller can't probe arbitrary tenant names to
-  // tenant, matching the dashboard's "current instance" semantics.
+    // learn what exists in other orgs.
  if (!teamId) {
    const allTenants = await listTenants();
    const visible = await listVisibleTenants(user, allTenants);
-    const orgTenant = visible.find((t) => !!t.status?.litellmTeamId);
+    const tenant = visible.find((t) => t.metadata.name === tenantName);
-    if (!orgTenant?.status?.litellmTeamId) {
+    if (!tenant) {
      return NextResponse.json(
-        { error: "No active tenant found for your organization" },
+        { error: "Tenant not found or not accessible" },
        { status: 404 }
      );
    }
-    teamId = orgTenant.status.litellmTeamId;
+    if (!tenant.status?.litellmTeamId) {
-
+      // Tenant exists but the operator hasn't reconciled it yet.
-    // If the operator has populated the per-tenant key alias, filter by it.
+      // Common right after onboarding; the customer should see a
-    // Falling back to team-level (no alias) will return the org total, which
+      // friendly empty state, not a 500.
-    // is acceptable transitionally but means siblings' usage shows up here.
+      return NextResponse.json(
-    if (orgTenant.status.litellmKeyAlias) {
+        { error: "Tenant is still provisioning, no usage data yet" },
-      keyAlias = orgTenant.status.litellmKeyAlias;
+        { status: 409 }
      );
    }
    teamId = tenant.status.litellmTeamId;
    // litellmKeyAlias is set by the operator's LiteLLM reconcile step
    // alongside litellmTeamId, so if teamId is present this should be
    // too. Defensive fallback to team-level if missing — in that case
    // the customer briefly sees company totals until the next operator
    // reconcile, which is better than returning a 500.
    keyAlias = tenant.status.litellmKeyAlias ?? null;
  } else if (user.isPlatform) {
    // Path 2: admin escape hatch.
    teamId = req.nextUrl.searchParams.get("teamId");
    keyAlias = req.nextUrl.searchParams.get("keyAlias");
    if (!teamId) {
      return NextResponse.json(
        {
          error:
            "Either ?tenant=<name> or ?teamId=<id> (admin) must be provided",
        },
        { status: 400 }
      );
    }
  } else {
    // Path 3: no resolution possible. See doc above for why we don't
    // pick a default.
    return NextResponse.json(
      { error: "Tenant must be specified via ?tenant=<name>" },
      { status: 400 }
    );
  }
-  // Month param: YYYY-MM, defaults to current month
+  // Month param: YYYY-MM, defaults to current month.
  const now = new Date();
  const monthParam =
    req.nextUrl.searchParams.get("month") ||
@@ -81,11 +126,11 @@ export async function GET(req: NextRequest) {
  try {
    const teamInfo = await getTeamInfo(teamId);
-    // Fetch all pages from the team. We always query at the team level —
+    // Page through results — server-side filtered by key_alias when
-    // LiteLLM's /spend/logs/v2 doesn't filter by key_alias reliably across
+    // provided. Pagination still needed because LiteLLM caps
-    // versions, so we paginate and post-filter in code. For pilot scale
+    // page_size at 100, and a busy tenant can easily exceed that in
-    // this is cheap; if a single team ever exceeds ~10k entries/month we
+    // a month. With server-side filtering this stays cheap regardless
-    // can revisit.
+    // of how busy sibling tenants in the same team are.
    const allRequests: any[] = [];
    let page = 1;
    while (true) {
@@ -94,33 +139,25 @@ export async function GET(req: NextRequest) {
        startStr,
        endStr,
        page,
-        100
+        100,
        keyAlias
      );
      allRequests.push(...(result.data || []));
      if (page >= (result.total_pages || 1)) break;
      page++;
      // Defensive cap. A pathological response with bogus total_pages
      // shouldn't be able to spin us forever. 50 pages × 100 = 5000
      // entries/month/tenant is well above any realistic usage at
      // pilot scale.
      if (page > 50) break;
    }
-    // Apply key_alias post-filter when scoping to a single tenant. Match
+    // Aggregate by day.
    // both `key_alias` (newer LiteLLM) and `metadata.user_api_key_alias`
    // (older builds nest it inside metadata).
    const scoped = keyAlias
      ? allRequests.filter((r) => {
          const alias =
            r.key_alias ??
            r.metadata?.user_api_key_alias ??
            r.api_key_alias ??
            null;
          return alias === keyAlias;
        })
      : allRequests;
    // Aggregate by day
    const byDay: Record<
      string,
      { inputTokens: number; outputTokens: number; spend: number }
    > = {};
-    for (const r of scoped) {
+    for (const r of allRequests) {
      const day = (r.startTime || r.endTime || "").slice(0, 10);
      if (!day) continue;
      if (!byDay[day])
@@ -134,30 +171,30 @@ export async function GET(req: NextRequest) {
      .sort(([a], [b]) => a.localeCompare(b))
      .map(([date, d]) => ({ date, ...d }));
-    const totalInput = scoped.reduce(
+    const totalInput = allRequests.reduce(
      (s, r) => s + (r.prompt_tokens || 0),
      0
    );
-    const totalOutput = scoped.reduce(
+    const totalOutput = allRequests.reduce(
      (s, r) => s + (r.completion_tokens || 0),
      0
    );
-    const totalSpend = scoped.reduce((s, r) => s + (r.spend || 0), 0);
+    const totalSpend = allRequests.reduce((s, r) => s + (r.spend || 0), 0);
    return NextResponse.json({
      teamId,
-      keyAlias, // null when not filtering — useful for the client to know it sees company-wide data
+      keyAlias, // null when admin queries team-wide (no specific tenant)
      month: monthParam,
      currentPeriod: {
        inputTokens: totalInput,
        outputTokens: totalOutput,
        totalSpend,
-        requestCount: scoped.length,
+        requestCount: allRequests.length,
      },
      // Budget is always team-level (= company budget). Spend reported
      // here is the team total, not the per-key total — the customer
-      // wants to see "how much of our company budget is left", not just
+      // wants to see "how much of our company budget is left", not
-      // "how much has this one tenant cost".
+      // just "how much has this one tenant cost".
      budget: {
        maxBudget: teamInfo?.team_info?.max_budget ?? null,
        spend: teamInfo?.team_info?.spend ?? 0,
--- a/src/components/dashboard/usage-display.tsx
+++ b/src/components/dashboard/usage-display.tsx
@@ -94,17 +94,27 @@ function UsageChart({ data }: { data: DailyUsage[] }) {
 /**
 * Usage display widget.
 *
- * - Customers: don't pass teamId or keyAlias — the backend resolves both
+ * Pass `tenant=<name>` for the canonical path — works for both
- *   from the session-bound tenant.
+ * customers and admins, the API resolves team+alias from the tenant
- * - Admins inspecting a specific tenant: pass `teamId` (the org-level
+ * CR's status. The visibility check on the API ensures users can't
- *   LiteLLM team id) AND `keyAlias` (the tenant's virtual-key alias).
+ * query tenants they shouldn't see.
- *   Without `keyAlias`, the response includes spend from sibling tenants
+ *
- *   in the same org, since teams are shared since Slice 2.
+ * `teamId`/`keyAlias` remain available as a platform-admin escape
 * hatch for cross-org debugging, but the tenant-detail and dashboard
 * paths should always use `tenant`.
 *
 * Bug 19 fix: previous version omitted both props for customer
 * sessions, expecting the API to "figure it out". The API's fallback
 * was "first visible tenant", which meant siblings in the same org
 * showed identical numbers regardless of which detail page was open.
 * Now the page passes the tenant name explicitly; no fallback exists.
 */
 export function UsageDisplay({
  tenant,
  teamId,
  keyAlias,
 }: {
  tenant?: string | null;
  teamId?: string | null;
  keyAlias?: string | null;
 }) {
@@ -121,11 +131,13 @@ export function UsageDisplay({
    setError(null);
    const params = new URLSearchParams({ month });
-    if (teamId) {
+    if (tenant) {
      params.set("tenant", tenant);
    } else if (teamId) {
      // Admin escape hatch — only honoured by the API when the
      // viewer is platform-role.
      params.set("teamId", teamId);
-    }
+      if (keyAlias) params.set("keyAlias", keyAlias);
    if (keyAlias) {
      params.set("keyAlias", keyAlias);
    }
    fetch(`/api/usage?${params}`)
@@ -133,7 +145,7 @@ export function UsageDisplay({
      .then(setData)
      .catch((e) => setError(e.message))
      .finally(() => setLoading(false));
-  }, [teamId, keyAlias, month]);
+  }, [tenant, teamId, keyAlias, month]);
  useEffect(() => { fetchUsage(); }, [fetchUsage]);
--- a/src/lib/litellm.ts
+++ b/src/lib/litellm.ts
@@ -32,12 +32,43 @@ export async function getTeamSpendLogs(
  return litellmFetch(`/global/spend/logs?${params}`);
 }
 /**
 * Fetch one page of spend logs for a team, optionally narrowed to a
 * single virtual key by alias.
 *
 * Slice 2 / Bug 19 context
 * ------------------------
 * Teams in LiteLLM are now org-scoped (one team per org), and each
 * tenant in the org has its own virtual key with `key_alias = tenant
 * CR name`. Without `keyAlias`, this returns the full team's spend —
 * which mingles every tenant in the org. The portal's per-tenant
 * usage view passes `keyAlias` to filter server-side via LiteLLM's
 * native `key_alias` query param. Confirmed available on the
 * `/spend/logs/v2` endpoint via OpenAPI introspection — no need to
 * page-and-post-filter as the previous slice did.
 *
 * Why this matters
 * ----------------
 * Previous implementation fetched all team pages, then post-filtered
 * by alias in JS. Two problems: (1) at any reasonable scale this is
 * O(team_total) memory per request even when only one tenant's data
 * is needed; (2) more importantly, when called from the customer
 * dashboard without an explicit alias, the route's "pick the first
 * visible tenant" fallback meant both Acme tenants showed identical
 * numbers — the alias used was always the first tenant in the
 * visible list, regardless of which tenant page was being viewed.
 *
 * The route layer above is responsible for resolving the tenant
 * identity correctly and passing the right alias here. This
 * function's only job is to pass it through to LiteLLM.
 */
 export async function getTeamSpendLogsV2(
  teamId: string,
  startDate: string,
  endDate: string,
  page: number = 1,
-  pageSize: number = 100
+  pageSize: number = 100,
  keyAlias?: string | null
 ) {
  const params = new URLSearchParams({
    team_id: teamId,
@@ -46,6 +77,9 @@ export async function getTeamSpendLogsV2(
    page: String(page),
    page_size: String(pageSize),
  });
  if (keyAlias) {
    params.set("key_alias", keyAlias);
  }
  return litellmFetch(`/spend/logs/v2?${params}`);
 }