Suspendedremoval
Some checks failed
Build and Push / build (push) Failing after 48s

This commit is contained in:
2026-05-01 18:07:00 +02:00
parent 7d58c78cb9
commit a5812dca9a
16 changed files with 880 additions and 90 deletions

View File

@@ -0,0 +1,154 @@
import { NextRequest, NextResponse } from "next/server";
import { getSessionUser, canMutate } from "@/lib/session";
import { getTenant, setTenantAnnotation } from "@/lib/k8s";
import { canUserSeeTenant } from "@/lib/visibility";
import {
createResumeRequest,
getPendingResumeRequestForTenant,
getTenantRequestByTenantName,
} from "@/lib/db";
import { safeError } from "@/lib/errors";
/**
* POST /api/tenants/[name]/resume-request
*
* Owner-initiated request to reactivate a suspended tenant (Bug 37a).
* Creates a pending tenant_request of type 'resume' for admin review,
* and stamps the PiecedTenant CR with an annotation that pauses the
* operator's 60-day deletion timer.
*
* Why a request flow at all
* -------------------------
* Customers can self-serve cancel; resume requires admin oversight.
* Reactivation may involve re-validating billing, confirming the
* customer still wants to be active, or other manual steps. The
* request flow gives admins a queue to review, with the same approve/
* reject UX as initial provision requests.
*
* Authorization
* -------------
* Owners and platform admins. Platform admins shouldn't normally use
* this endpoint — they have direct PATCH suspend access — but it's
* permissive in case admin tooling pivots.
*
* Validation
* ----------
* - Tenant must exist and be visible to the caller.
* - Tenant must be currently suspended. Resuming an active tenant
* is meaningless.
* - At most one pending resume request per tenant. Enforced by the
* DB's partial unique index, but we also check explicitly here to
* return a friendly 409 instead of a 500.
*
* Side effects on success
* -----------------------
* - INSERT into tenant_requests (request_type='resume', status='pending')
* - PATCH annotation `pieced.ch/resume-request-pending=<request-id>` on
* the CR. This is the operator's signal to pause its 60-day deletion
* timer until the request transitions to terminal.
*
* The annotation set is best-effort: if the K8s PATCH fails after the
* DB insert, the row exists without the annotation. The customer
* sees the request as pending; admin can still approve. The only
* functional consequence is the 60-day timer doesn't pause until the
* next request transition, which is fine in practice (admin response
* times are dramatically shorter than 60 days).
*/
export async function POST(
req: NextRequest,
{ params }: { params: Promise<{ name: string }> }
) {
const user = await getSessionUser();
if (!user) {
return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
}
if (!canMutate(user)) {
return NextResponse.json({ error: "Forbidden" }, { status: 403 });
}
const { name } = await params;
const tenant = await getTenant(name);
if (!tenant) {
return NextResponse.json({ error: "Not found" }, { status: 404 });
}
if (!(await canUserSeeTenant(user, tenant))) {
return NextResponse.json({ error: "Not found" }, { status: 404 });
}
if (!tenant.spec.suspend) {
return NextResponse.json(
{ error: "Tenant is not suspended; nothing to resume." },
{ status: 409 }
);
}
// Already a pending request? Don't duplicate.
const existing = await getPendingResumeRequestForTenant(name);
if (existing) {
return NextResponse.json(
{
error: "A resume request for this tenant is already pending.",
request: { id: existing.id, createdAt: existing.createdAt },
},
{ status: 409 }
);
}
// Pull traceability fields (companyName, agentName) from the original
// provision request. The schema marks these NOT NULL, so we have to
// populate them; copying from the provision row keeps the resume
// row navigable in the admin UI without making up values.
const provision = await getTenantRequestByTenantName(name);
try {
const resumeRequest = await createResumeRequest({
tenantName: name,
zitadelOrgId: tenant.metadata.labels?.[
"pieced.ch/zitadel-org-id"
] ?? user.zitadelOrgId,
zitadelUserId: user.id,
contactName: user.name ?? user.email ?? "Unknown",
contactEmail: user.email ?? "unknown@example.invalid",
companyName: provision?.companyName ?? tenant.spec.displayName ?? name,
agentName: provision?.agentName ?? "Assistant",
});
// Stamp the annotation so the operator pauses its TTL. If this
// fails the request still exists; surface the error so admin
// tooling can re-stamp if needed, but don't roll back.
try {
await setTenantAnnotation(
name,
"pieced.ch/resume-request-pending",
resumeRequest.id
);
} catch (e) {
console.warn(
"resume request created but annotation could not be set; operator's 60-day timer will not pause until next reconcile triggered by request transition",
e
);
}
return NextResponse.json(
{
message: "Resume request submitted. An admin will review shortly.",
request: { id: resumeRequest.id, status: resumeRequest.status },
},
{ status: 201 }
);
} catch (e: any) {
// Unique violation (a pending row already exists for this tenant)
// is friendly-handled above; this catches everything else.
if (e.code === "23505") {
return NextResponse.json(
{ error: "A resume request for this tenant is already pending." },
{ status: 409 }
);
}
console.error("Resume request creation failed:", e);
return NextResponse.json(
{ error: safeError(e, "Failed to submit resume request") },
{ status: 500 }
);
}
}

View File

@@ -1,7 +1,7 @@
import { NextRequest, NextResponse } from "next/server";
import { z } from "zod";
import { getSessionUser, canMutate } from "@/lib/session";
import { getTenant, patchTenantSpec } from "@/lib/k8s";
import { getTenant, patchTenantSpec, setTenantAnnotation } from "@/lib/k8s";
import { canUserSeeTenant } from "@/lib/visibility";
import { safeError } from "@/lib/errors";
@@ -12,37 +12,38 @@ const patchSchema = z.object({
/**
* PATCH /api/tenants/[name]/suspend
*
* Customer-side "Cancel subscription" / "Resume" toggle (Bug 31).
* Direct suspend control on the PiecedTenant CR. Sets `spec.suspend`
* to true (cancel) or false (resume).
*
* Sets `spec.suspend` on the PiecedTenant CR. The operator interprets
* this flag as "stop reconciling this tenant" — workloads, packages,
* and channel-user changes are no longer applied. Existing data is
* preserved (namespace, ConfigMaps, OpenBao secrets, CNPG database,
* billing records). Resuming sets the flag back to false and the
* operator picks up reconciliation on the next loop.
* Authorization (Bug 37a)
* -----------------------
* - suspend=true → owners and platform admins may call.
* - suspend=false → platform admins ONLY. Owners must go through the
* resume-request flow (POST /api/tenants/[name]/resume-request),
* which creates a pending request for admin approval. This
* asymmetry is by design: cancellation is self-service (low risk;
* reversible by request); reactivation requires admin oversight
* (e.g. to re-validate billing, confirm intent).
*
* Authorization
* -------------
* - Customer-side: only an `owner` of the tenant's org may call this.
* `canMutate` is the right gate (mirrors the rest of the customer
* API surface). User-role members cannot cancel a subscription.
* - Platform staff: allowed via `canMutate`'s isPlatform branch, but
* in practice they should use admin tooling for this — the action
* is exposed here for the customer's benefit.
* Customer flow:
* - Cancel: PATCH suspend=true here
* - Resume: POST /resume-request — creates a 'resume' tenant_request,
* admin approves via /api/admin/requests/[id]/approve which
* then PATCHes suspend=false here as a platform user.
*
* Visibility check is via `canUserSeeTenant` — same notFound() trick
* as the detail page, so we don't leak existence of tenants the
* caller can't see.
* Workload behaviour
* ------------------
* On suspend=true the operator deletes the OpenClawInstance, stopping
* the pod within seconds. Tenant data — namespace, ConfigMaps,
* OpenBao secrets, CNPG database, LiteLLM team — is retained.
*
* Note on workload teardown
* -------------------------
* As of this writing, the operator's `suspend` handling is "skip
* reconciliation and set status.phase to Suspended". The underlying
* StatefulSet keeps running until next reconciliation, which won't
* happen while suspended. Group D will add scale-to-zero so cancelled
* subscriptions actually stop incurring compute. Until then, an
* operator following up with a `kubectl scale` is the workaround.
* Customer data is preserved either way.
* Suspended tenants enter a 60-day retention window (operator
* constant `retentionAfterSuspend`); after that, the tenant is fully
* deleted unless a pending resume request exists. The operator
* checks the `pieced.ch/resume-request-pending` annotation to know
* about pending requests; we set it here when admin approves the
* resume (transitively, via the admin-approve endpoint), and clear
* it when the request reaches a terminal state.
*/
export async function PATCH(
req: NextRequest,
@@ -76,6 +77,18 @@ export async function PATCH(
}
const { suspend } = parsed.data;
// Bug 37a: resume (suspend=false) is platform-admin only via this
// endpoint. Owners must go through the resume-request flow.
if (!suspend && !user.isPlatform) {
return NextResponse.json(
{
error:
"Resume requires platform-admin approval. Submit a resume request via /api/tenants/[name]/resume-request.",
},
{ status: 403 }
);
}
// No-op early exit. Avoids a needless K8s patch + status churn when
// the user double-clicks the button or the UI is briefly out of sync.
if (Boolean(tenant.spec.suspend) === suspend) {
@@ -87,10 +100,32 @@ export async function PATCH(
try {
await patchTenantSpec(name, { suspend });
// On admin-side resume, also clear the pending-resume-request
// annotation if it exists. Belt-and-suspenders: the admin-approve
// endpoint already clears it on its happy path, but a platform
// user resuming directly via this endpoint shouldn't leave the
// annotation behind. Best-effort: failure to clear the annotation
// is logged but doesn't fail the resume.
if (!suspend) {
try {
await setTenantAnnotation(
name,
"pieced.ch/resume-request-pending",
null
);
} catch (e) {
console.warn(
"failed to clear resume-request-pending annotation; operator will see it stale until next request transition",
e
);
}
}
return NextResponse.json(
{
message: suspend
? "Subscription cancelled. Your data is preserved."
? "Subscription cancelled. Your data is preserved for 60 days."
: "Subscription resumed.",
suspend,
},