diff --git a/deploy/helm/pieced-threema-gateway/Chart.yaml b/deploy/helm/pieced-threema-gateway/Chart.yaml
index 1477b2c..3536b75 100644
--- a/deploy/helm/pieced-threema-gateway/Chart.yaml
+++ b/deploy/helm/pieced-threema-gateway/Chart.yaml
@@ -2,5 +2,5 @@ apiVersion: v2
 name: pieced-threema-gateway
 description: PieCed IT central Threema Gateway relay
 type: application
-version: 0.1.7
-appVersion: "0.1.7"
+version: 0.1.8
+appVersion: "0.1.8"
diff --git a/deploy/helm/pieced-threema-gateway/templates/database-backup-cleanup.yaml b/deploy/helm/pieced-threema-gateway/templates/database-backup-cleanup.yaml
new file mode 100644
index 0000000..945dcd5
--- /dev/null
+++ b/deploy/helm/pieced-threema-gateway/templates/database-backup-cleanup.yaml
@@ -0,0 +1,162 @@
+{{- if and .Values.postgres.enabled .Values.postgres.backup.enabled .Values.postgres.backup.cleanup.enabled }}
+# =============================================================================
+# Backup CR cleanup CronJob.
+#
+# The Cluster has barmanObjectStore.retentionPolicy set, but that only
+# prunes the actual backup data (base + WAL) in the MinIO bucket. CNPG
+# does NOT delete the Kubernetes `Backup` CRs that ScheduledBackup keeps
+# creating, so without this job they accumulate one per day forever and
+# bloat the ArgoCD resource tree under the ScheduledBackup parent.
+#
+# Strategy:
+#   List all Backup CRs for cluster=pieced-threema-gateway-db, sort by
+#   creationTimestamp ascending, drop the last N entries (newest),
+#   delete the rest. Keep ~2x the S3 retention so we never delete a CR
+#   whose data is still on disk.
+#
+# Failure handling:
+#   The listing is captured with a command substitution, so a kubectl
+#   error (API outage, RBAC drift) fails the Job instead of being
+#   reported as "0 backups, nothing to prune".
+#
+# Same shape as apps/litellm-pg-backup-cleanup.yaml in pieced-gitops.
+# =============================================================================
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: pieced-threema-gateway-db-backup-cleanup
+  namespace: {{ .Values.namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: pieced-threema-gateway-db-backup-cleanup
+  namespace: {{ .Values.namespace }}
+rules:
+  - apiGroups: ["postgresql.cnpg.io"]
+    resources: ["backups"]
+    verbs: ["get", "list", "delete"]  # list to enumerate, delete to prune
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: pieced-threema-gateway-db-backup-cleanup
+  namespace: {{ .Values.namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: pieced-threema-gateway-db-backup-cleanup
+subjects:
+  - kind: ServiceAccount
+    name: pieced-threema-gateway-db-backup-cleanup
+    namespace: {{ .Values.namespace }}
+---
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: pieced-threema-gateway-db-backup-cleanup
+  namespace: {{ .Values.namespace }}
+  labels:
+    app.kubernetes.io/name: pieced-threema-gateway-db-backup-cleanup
+    app.kubernetes.io/part-of: pieced-platform
+spec:
+  schedule: {{ .Values.postgres.backup.cleanup.schedule | quote }}
+  concurrencyPolicy: Forbid  # never run two prune passes at once
+  successfulJobsHistoryLimit: 1
+  failedJobsHistoryLimit: 1
+  startingDeadlineSeconds: 600
+  jobTemplate:
+    spec:
+      # Auto-clean the Job object 1h after completion so it doesn't
+      # also pile up in ArgoCD's tree.
+      ttlSecondsAfterFinished: 3600
+      backoffLimit: 1
+      template:
+        metadata:
+          labels:
+            app.kubernetes.io/name: pieced-threema-gateway-db-backup-cleanup
+        spec:
+          serviceAccountName: pieced-threema-gateway-db-backup-cleanup
+          restartPolicy: OnFailure
+          securityContext:
+            runAsNonRoot: true
+            runAsUser: 1001
+            runAsGroup: 1001
+            seccompProfile:
+              type: RuntimeDefault
+          containers:
+            - name: cleanup
+              image: {{ .Values.postgres.backup.cleanup.image | quote }}
+              imagePullPolicy: IfNotPresent
+              securityContext:
+                allowPrivilegeEscalation: false
+                readOnlyRootFilesystem: true  # the script itself writes no files
+                capabilities:
+                  drop: ["ALL"]
+              resources:
+                requests:
+                  cpu: 10m
+                  memory: 32Mi
+                limits:
+                  cpu: 100m
+                  memory: 128Mi
+              env:
+                - name: NAMESPACE
+                  value: {{ .Values.namespace | quote }}
+                - name: CLUSTER
+                  value: pieced-threema-gateway-db
+                - name: KEEP
+                  value: {{ .Values.postgres.backup.cleanup.keep | quote }}
+              command:
+                - /bin/bash
+                - -c
+                - |
+                  set -euo pipefail
+
+                  # KEEP must be a non-negative integer: a negative value would make
+                  # prune_count exceed the total and delete every Backup CR.
+                  if ! [[ "${KEEP}" =~ ^(0|[1-9][0-9]*)$ ]]; then
+                    echo "KEEP must be a non-negative integer, got '${KEEP}'" >&2
+                    exit 1
+                  fi
+
+                  echo "Listing Backup CRs for cluster=${CLUSTER} in ns=${NAMESPACE}"
+                  # Capture the listing with a command substitution so a kubectl
+                  # failure aborts the script under `set -e`. The exit status of a
+                  # process substitution (`< <(kubectl ...)`) is ignored, which
+                  # would turn an API/RBAC error into "Found 0 backup CR(s)".
+                  listing="$(
+                    kubectl -n "${NAMESPACE}" get backups.postgresql.cnpg.io \
+                      -l "cnpg.io/cluster=${CLUSTER}" \
+                      --sort-by=.metadata.creationTimestamp \
+                      -o name
+                  )"
+
+                  # mapfile on an empty string would yield one empty element, so
+                  # only split when kubectl actually returned names.
+                  all=()
+                  if [[ -n "${listing}" ]]; then
+                    mapfile -t all < <(printf '%s\n' "${listing}")
+                  fi
+
+                  total=${#all[@]}
+                  echo "Found ${total} backup CR(s); keeping newest ${KEEP}"
+
+                  if (( total <= KEEP )); then
+                    echo "Nothing to prune."
+                    exit 0
+                  fi
+
+                  prune_count=$(( total - KEEP ))
+                  to_delete=("${all[@]:0:${prune_count}}")
+
+                  echo "Deleting ${prune_count} old backup CR(s):"
+                  printf '  %s\n' "${to_delete[@]}"
+
+                  # Delete in chunks to keep the kubectl command line sane
+                  # even if the historical backlog is in the hundreds.
+                  printf '%s\n' "${to_delete[@]}" \
+                    | xargs -r -n 50 kubectl -n "${NAMESPACE}" delete --ignore-not-found
+
+                  echo "Done."
+{{- end }}
diff --git a/deploy/helm/pieced-threema-gateway/templates/database-backup.yaml b/deploy/helm/pieced-threema-gateway/templates/database-backup.yaml
new file mode 100644
index 0000000..20b0236
--- /dev/null
+++ b/deploy/helm/pieced-threema-gateway/templates/database-backup.yaml
@@ -0,0 +1,59 @@
+{{- if and .Values.postgres.enabled .Values.postgres.backup.enabled }}
+# =============================================================================
+# S3 credentials for the CNPG Cluster's barmanObjectStore.
+#
+# Projects the in-cluster MinIO root credentials out of OpenBao
+# (.Values.postgres.backup.s3.credentialsPath) into a Secret in this
+# namespace. Referenced by spec.backup.barmanObjectStore.s3Credentials
+# on the Cluster CR (see templates/database.yaml).
+#
+# Same shape and convention as the chart's other ExternalSecrets
+# (templates/externalsecret.yaml) — KV v2 path without /data/ segment.
+# =============================================================================
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: cnpg-s3-credentials
+  namespace: {{ .Values.namespace }}
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: openbao-backend
+    kind: ClusterSecretStore
+  target:
+    name: cnpg-s3-credentials  # Secret name referenced by the Cluster's s3Credentials
+    creationPolicy: Owner
+  data:
+    - secretKey: ACCESS_KEY_ID
+      remoteRef:
+        key: {{ .Values.postgres.backup.s3.credentialsPath }}
+        property: {{ .Values.postgres.backup.s3.accessKeyProperty }}
+    - secretKey: ACCESS_SECRET_KEY
+      remoteRef:
+        key: {{ .Values.postgres.backup.s3.credentialsPath }}
+        property: {{ .Values.postgres.backup.s3.secretKeyProperty }}
+---
+# =============================================================================
+# Daily backup of the pieced-threema-gateway-db CNPG cluster.
+#
+# IMPORTANT — cron format:
+#   CNPG ScheduledBackup uses a SIX-field Go-style cron expression
+#   (sec min hour dom mon dow), NOT the 5-field Unix crontab format. The
+#   CNPG controller silently accepts 5-field expressions but reinterprets
+#   them — see https://github.com/cloudnative-pg/cloudnative-pg/issues/5380
+#   Default schedule (.Values.postgres.backup.schedule.cron) is set
+#   accordingly.
+# =============================================================================
+apiVersion: postgresql.cnpg.io/v1
+kind: ScheduledBackup
+metadata:
+  name: pieced-threema-gateway-db-daily
+  namespace: {{ .Values.namespace }}
+spec:
+  schedule: {{ .Values.postgres.backup.schedule.cron | quote }}
+  backupOwnerReference: self  # Backup CRs hang off this ScheduledBackup as their parent
+  cluster:
+    name: pieced-threema-gateway-db
+  method: barmanObjectStore
+  immediate: {{ .Values.postgres.backup.schedule.immediate }}
+{{- end }}
diff --git a/deploy/helm/pieced-threema-gateway/templates/database.yaml b/deploy/helm/pieced-threema-gateway/templates/database.yaml
index 8af766d..0085c22 100644
--- a/deploy/helm/pieced-threema-gateway/templates/database.yaml
+++ b/deploy/helm/pieced-threema-gateway/templates/database.yaml
@@ -17,4 +17,23 @@ spec:
     {{- toYaml .Values.postgres.resources | nindent 4 }}
   monitoring:
     enablePodMonitor: true
+  {{- if .Values.postgres.backup.enabled }}
+  backup:
+    barmanObjectStore:
+      destinationPath: s3://{{ .Values.postgres.backup.s3.bucket }}/pieced-threema-gateway-db/
+      endpointURL: {{ .Values.postgres.backup.s3.endpointURL | quote }}
+      s3Credentials:
+        accessKeyId:
+          name: cnpg-s3-credentials
+          key: ACCESS_KEY_ID
+        secretAccessKey:
+          name: cnpg-s3-credentials
+          key: ACCESS_SECRET_KEY
+      wal:
+        compression: {{ .Values.postgres.backup.wal.compression }}
+        maxParallel: {{ .Values.postgres.backup.wal.maxParallel }}
+      data:
+        compression: {{ .Values.postgres.backup.data.compression }}
+    retentionPolicy: {{ .Values.postgres.backup.retentionPolicy | quote }}
+  {{- end }}
 {{- end }}
diff --git a/deploy/helm/pieced-threema-gateway/values.yaml b/deploy/helm/pieced-threema-gateway/values.yaml
index 0014982..9ab5228 100644
--- a/deploy/helm/pieced-threema-gateway/values.yaml
+++ b/deploy/helm/pieced-threema-gateway/values.yaml
@@ -6,7 +6,7 @@ namespace: threema-gateway
 
 image:
   repository: registry.c5ai.ch/pieced/pieced-threema-gateway
-  tag: "0.1.7"
+  tag: "0.1.8"
   pullPolicy: IfNotPresent
 
 # Pull from registry.c5ai.ch — matches operator + portal pattern.
@@ -49,6 +49,78 @@ postgres:
       cpu: 100m
       memory: 256Mi
 
+  # ---------------------------------------------------------------------------
+  # Daily backup to in-cluster MinIO via barmanObjectStore.
+  #
+  # When enabled, the chart renders:
+  #   - spec.backup on the CNPG Cluster (templates/database.yaml)
+  #   - ExternalSecret "cnpg-s3-credentials" pulling MinIO root creds
+  #     from OpenBao (templates/database-backup.yaml)
+  #   - ScheduledBackup "pieced-threema-gateway-db-daily" (templates/database-backup.yaml)
+  #   - CronJob "pieced-threema-gateway-db-backup-cleanup" that prunes
+  #     old Backup CRs so the ArgoCD resource tree stays tidy
+  #     (templates/database-backup-cleanup.yaml)
+  #
+  # Note on Cilium: this chart's CiliumNetworkPolicy only restricts the
+  # relay pod (endpointSelector matches app.kubernetes.io/name=
+  # pieced-threema-gateway). The CNPG postgres pod is NOT covered by
+  # that policy, so its egress to MinIO works freely as long as there
+  # is no namespace-level default-deny CNP in threema-gateway. If you
+  # later add one, you'll also need to allow egress to
+  # minio.minio-pieced.svc:80 from pods labelled cnpg.io/cluster=
+  # pieced-threema-gateway-db.
+  backup:
+    enabled: true
+
+    # Where backups land. The destinationPath is hard-coded to use the
+    # cluster name so per-cluster paths don't collide in the shared
+    # cnpg-backups bucket (matches portal-db, litellm-pg, etc.).
+    s3:
+      bucket: cnpg-backups
+      endpointURL: http://minio.minio-pieced.svc:80
+      # OpenBao path containing MinIO root_user / root_password.
+      # ESO's openbao-backend ClusterSecretStore rewrites KV v2 paths
+      # automatically, so no `/data/` segment is needed (matches the
+      # convention used by the chart's other ExternalSecrets above).
+      credentialsPath: secret/platform/minio-pieced  # NOTE(review): root creds; a bucket-scoped key would be safer — confirm
+      accessKeyProperty: root_user
+      secretKeyProperty: root_password
+
+    wal:
+      compression: gzip
+      maxParallel: 2
+    data:
+      compression: gzip
+
+    # Barman retains backup *data* in S3 for this many days. The Backup
+    # CR cleanup CronJob below independently keeps roughly twice this
+    # many Backup CRs in Kubernetes so we never delete a CR whose data
+    # is still on disk.
+    retentionPolicy: "7d"
+
+    schedule:
+      # CNPG ScheduledBackup uses a SIX-field Go-style cron expression
+      # (sec min hour dom mon dow), NOT the 5-field Unix crontab format.
+      # See https://github.com/cloudnative-pg/cloudnative-pg/issues/5380
+      # for the silent-misinterpretation footgun.
+      # Slot: 02:45:00 daily — between litellm-pg (02:30) and portal-db
+      # (03:00) so the daily snapshot wave is staggered.
+      cron: "0 45 2 * * *"
+      # Trigger an immediate backup when the ScheduledBackup is created.
+      # Useful on first deploy so the `cnpg_collector_last_available_backup_timestamp`
+      # metric ticks immediately and PieCedCNPGBackupFailed clears.
+      immediate: true
+
+    # Backup CR cleanup CronJob. Same shape as
+    # apps/litellm-pg-backup-cleanup.yaml in pieced-gitops.
+    cleanup:
+      enabled: true
+      # Daily at 04:45 — runs ~2h after the ScheduledBackup so the day's
+      # new CR exists and is preserved in the "newest N" window.
+      schedule: "45 4 * * *"  # standard 5-field CronJob syntax, unlike schedule.cron above
+      keep: 14  # newest Backup CRs to retain; passed to the script as KEEP
+      image: bitnami/kubectl:1.31.6  # must provide bash, kubectl and xargs for the cleanup script
+
 # Secrets sourced from OpenBao via External Secrets Operator.
 # Paths use the same convention as apps/portal/external-secrets.yaml:
 # full key path starting with the KV v2 mount name (`secret/`), no