From b7f85c9bf9e926d4124eae6e804ac2848f42093d Mon Sep 17 00:00:00 2001 From: David Date: Sat, 4 Apr 2026 12:52:56 +0200 Subject: [PATCH] feat(cicd): sync CI/CD pipeline from cicd branch Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/cd-main.yml | 269 ++++++ .github/workflows/cd-preprod.yml | 397 +++++++++ .github/workflows/ci.yml | 241 ++---- .github/workflows/pr-checks.yml | 176 ++++ .github/workflows/rollback.yml | 317 ++++++++ .../hetzner/09-kubernetes-manifests.md | 765 ++++++++++++++++++ .../hetzner/13-backup-disaster-recovery.md | 389 +++++++++ 7 files changed, 2390 insertions(+), 164 deletions(-) create mode 100644 .github/workflows/cd-main.yml create mode 100644 .github/workflows/cd-preprod.yml create mode 100644 .github/workflows/pr-checks.yml create mode 100644 .github/workflows/rollback.yml create mode 100644 docs/deployment/hetzner/09-kubernetes-manifests.md create mode 100644 docs/deployment/hetzner/13-backup-disaster-recovery.md diff --git a/.github/workflows/cd-main.yml b/.github/workflows/cd-main.yml new file mode 100644 index 0000000..5912f40 --- /dev/null +++ b/.github/workflows/cd-main.yml @@ -0,0 +1,269 @@ +name: CD Production (Hetzner k3s) + +# Production deployment pipeline — Hetzner k3s cluster. +# +# Flow: +# 1. Promote: re-tag preprod → latest + prod-SHA within Scaleway (no rebuild, no data transfer) +# 2. Deploy: kubectl set image + rollout status (blocks until pods are healthy) +# 3. Auto-rollback: kubectl rollout undo if rollout fails +# 4. Smoke tests: belt-and-suspenders HTTP health checks +# 5. 
Notify Discord +# +# Required secrets: +# REGISTRY_TOKEN — Scaleway registry token (read + write) +# HETZNER_KUBECONFIG — base64-encoded kubeconfig for xpeditis-prod cluster +# PROD_BACKEND_URL — https://api.xpeditis.com (health check) +# PROD_FRONTEND_URL — https://app.xpeditis.com (health check) +# DISCORD_WEBHOOK_URL — Discord notifications +# +# K8s cluster details (from docs/deployment/hetzner/): +# Namespace: xpeditis-prod +# Deployments: xpeditis-backend (container: backend) +# xpeditis-frontend (container: frontend) +# +on: + push: + branches: [main] + +# Only one prod deployment at a time. Never cancel. +concurrency: + group: cd-production + cancel-in-progress: false + +env: + REGISTRY: rg.fr-par.scw.cloud/weworkstudio + IMAGE_BACKEND: rg.fr-par.scw.cloud/weworkstudio/xpeditis-backend + IMAGE_FRONTEND: rg.fr-par.scw.cloud/weworkstudio/xpeditis-frontend + K8S_NAMESPACE: xpeditis-prod + +jobs: + # ────────────────────────────────────────────────────────────── + # 1. Promote preprod → prod tags within Scaleway + # imagetools create re-tags at manifest level — no layer + # download/upload, instant even for multi-arch images. 
+ # ────────────────────────────────────────────────────────────── + promote-images: + name: Promote Images (preprod → prod) + runs-on: ubuntu-latest + outputs: + short-sha: ${{ steps.sha.outputs.short }} + backend-image: ${{ steps.images.outputs.backend }} + frontend-image: ${{ steps.images.outputs.frontend }} + + steps: + - name: Compute short SHA + id: sha + run: echo "short=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT + + - name: Set image references + id: images + run: | + echo "backend=${{ env.IMAGE_BACKEND }}:prod-${{ steps.sha.outputs.short }}" >> $GITHUB_OUTPUT + echo "frontend=${{ env.IMAGE_FRONTEND }}:prod-${{ steps.sha.outputs.short }}" >> $GITHUB_OUTPUT + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Scaleway Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: nologin + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Promote Backend (preprod → latest + prod-SHA) + run: | + docker buildx imagetools create \ + --tag ${{ env.IMAGE_BACKEND }}:latest \ + --tag ${{ steps.images.outputs.backend }} \ + ${{ env.IMAGE_BACKEND }}:preprod + + - name: Promote Frontend (preprod → latest + prod-SHA) + run: | + docker buildx imagetools create \ + --tag ${{ env.IMAGE_FRONTEND }}:latest \ + --tag ${{ steps.images.outputs.frontend }} \ + ${{ env.IMAGE_FRONTEND }}:preprod + + - name: Verify promoted images + run: | + echo "=== Backend ===" + docker buildx imagetools inspect ${{ steps.images.outputs.backend }} + echo "=== Frontend ===" + docker buildx imagetools inspect ${{ steps.images.outputs.frontend }} + + # ────────────────────────────────────────────────────────────── + # 2. Deploy to Hetzner k3s + # kubectl set image → rollout status waits for pods to be + # healthy (readiness probes pass) before the job succeeds. + # Auto-rollback on failure via kubectl rollout undo. 
+ # ────────────────────────────────────────────────────────────── + deploy: + name: Deploy to k3s (xpeditis-prod) + runs-on: ubuntu-latest + needs: promote-images + environment: + name: production + url: https://app.xpeditis.com + + steps: + - name: Configure kubectl + run: | + mkdir -p ~/.kube + echo "${{ secrets.HETZNER_KUBECONFIG }}" | base64 -d > ~/.kube/config + chmod 600 ~/.kube/config + kubectl cluster-info + kubectl get nodes + + - name: Deploy Backend + run: | + IMAGE="${{ needs.promote-images.outputs.backend-image }}" + echo "Deploying backend: $IMAGE" + kubectl set image deployment/xpeditis-backend \ + backend=$IMAGE \ + -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-backend \ + -n ${{ env.K8S_NAMESPACE }} \ + --timeout=300s + echo "Backend deployed." + + - name: Deploy Frontend + run: | + IMAGE="${{ needs.promote-images.outputs.frontend-image }}" + echo "Deploying frontend: $IMAGE" + kubectl set image deployment/xpeditis-frontend \ + frontend=$IMAGE \ + -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-frontend \ + -n ${{ env.K8S_NAMESPACE }} \ + --timeout=300s + echo "Frontend deployed." + + - name: Auto-rollback on failure + if: failure() + run: | + echo "Deployment failed — rolling back..." + kubectl rollout undo deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} || true + kubectl rollout undo deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} || true + kubectl rollout status deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} --timeout=120s || true + kubectl rollout status deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} --timeout=120s || true + echo "Previous version restored." + + # ────────────────────────────────────────────────────────────── + # 3. Smoke Tests + # kubectl rollout status already verifies pod readiness. + # These confirm the full network path: + # Cloudflare → Hetzner LB → Traefik → pod. 
+ # ────────────────────────────────────────────────────────────── + smoke-tests: + name: Smoke Tests + runs-on: ubuntu-latest + needs: deploy + + steps: + - name: Wait for LB propagation + run: sleep 30 + + - name: Health check — Backend + run: | + for i in {1..12}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \ + "${{ secrets.PROD_BACKEND_URL }}/api/v1/health" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then echo "Backend healthy."; exit 0; fi + sleep 15 + done + echo "CRITICAL: Backend unreachable after rollout." + exit 1 + + - name: Health check — Frontend + run: | + for i in {1..12}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \ + "${{ secrets.PROD_FRONTEND_URL }}" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then echo "Frontend healthy."; exit 0; fi + sleep 15 + done + echo "CRITICAL: Frontend unreachable after rollout." + exit 1 + + # ────────────────────────────────────────────────────────────── + # 4. 
Deployment Summary + # ────────────────────────────────────────────────────────────── + summary: + name: Deployment Summary + runs-on: ubuntu-latest + needs: [promote-images, smoke-tests] + if: success() + + steps: + - name: Write summary + run: | + echo "## Production Deployment" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| | |" >> $GITHUB_STEP_SUMMARY + echo "|---|---|" >> $GITHUB_STEP_SUMMARY + echo "| **Commit** | \`${{ github.sha }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| **Backend** | \`${{ needs.promote-images.outputs.backend-image }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| **Frontend** | \`${{ needs.promote-images.outputs.frontend-image }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| **Source** | Promoted from \`preprod\` tag — no rebuild |" >> $GITHUB_STEP_SUMMARY + echo "| **Cluster** | Hetzner k3s — namespace \`xpeditis-prod\` |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "To rollback: [Run rollback workflow](${{ github.server_url }}/${{ github.repository }}/actions/workflows/rollback.yml)" >> $GITHUB_STEP_SUMMARY + + # ────────────────────────────────────────────────────────────── + # Discord — Success + # ────────────────────────────────────────────────────────────── + notify-success: + name: Notify Success + runs-on: ubuntu-latest + needs: [promote-images, smoke-tests] + if: success() + + steps: + - name: Send Discord notification + run: | + curl -s -H "Content-Type: application/json" -d '{ + "embeds": [{ + "title": "🚀 Production Deployed & Healthy", + "color": 3066993, + "fields": [ + {"name": "Author", "value": "${{ github.actor }}", "inline": true}, + {"name": "Version", "value": "`prod-${{ needs.promote-images.outputs.short-sha }}`", "inline": true}, + {"name": "Commit", "value": "[`${{ github.sha }}`](${{ github.event.head_commit.url }})", "inline": false}, + {"name": "Registry", "value": "Scaleway — promoted from `preprod`, no rebuild", "inline": false}, + {"name": "Cluster", "value": "Hetzner k3s — 
`xpeditis-prod`", "inline": false}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Production"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} + + # ────────────────────────────────────────────────────────────── + # Discord — Failure (CRITICAL) + # ────────────────────────────────────────────────────────────── + notify-failure: + name: Notify Failure + runs-on: ubuntu-latest + needs: [promote-images, deploy, smoke-tests] + if: failure() + + steps: + - name: Send Discord notification + run: | + curl -s -H "Content-Type: application/json" -d '{ + "content": "@here PRODUCTION DEPLOYMENT FAILED", + "embeds": [{ + "title": "🔴 PRODUCTION PIPELINE FAILED", + "description": "Auto-rollback was triggered if deployment failed. Check rollout history.", + "color": 15158332, + "fields": [ + {"name": "Author", "value": "${{ github.actor }}", "inline": true}, + {"name": "Commit", "value": "[`${{ github.sha }}`](${{ github.event.head_commit.url }})", "inline": true}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false}, + {"name": "Manual rollback", "value": "[Run rollback workflow](${{ github.server_url }}/${{ github.repository }}/actions/workflows/rollback.yml)", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Production"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} diff --git a/.github/workflows/cd-preprod.yml b/.github/workflows/cd-preprod.yml new file mode 100644 index 0000000..0d2e359 --- /dev/null +++ b/.github/workflows/cd-preprod.yml @@ -0,0 +1,397 @@ +name: CD Preprod + +# Full pipeline for the preprod branch. 
+# Flow: quality → integration tests → Docker build & push → deploy → smoke tests → notify +# +# Required secrets: +# REGISTRY_TOKEN — Scaleway container registry token +# NEXT_PUBLIC_API_URL — Preprod API URL (e.g. https://api.preprod.xpeditis.com) +# NEXT_PUBLIC_APP_URL — Preprod app URL (e.g. https://preprod.xpeditis.com) +# PORTAINER_WEBHOOK_BACKEND — Portainer webhook for preprod backend service +# PORTAINER_WEBHOOK_FRONTEND — Portainer webhook for preprod frontend service +# PREPROD_BACKEND_URL — Health check URL (e.g. https://api.preprod.xpeditis.com) +# PREPROD_FRONTEND_URL — Health check URL (e.g. https://preprod.xpeditis.com) +# DISCORD_WEBHOOK_URL — Discord deployment notifications + +on: + push: + branches: [preprod] + +# Only one preprod deployment at a time. Never cancel an in-progress deployment. +concurrency: + group: cd-preprod + cancel-in-progress: false + +env: + REGISTRY: rg.fr-par.scw.cloud/weworkstudio + NODE_VERSION: '20' + +jobs: + # ────────────────────────────────────────── + # 1. Lint & Type-check + # ────────────────────────────────────────── + quality: + name: Quality (${{ matrix.app }}) + runs-on: ubuntu-latest + strategy: + fail-fast: true + matrix: + app: [backend, frontend] + + defaults: + run: + working-directory: apps/${{ matrix.app }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Lint + run: npm run lint + + - name: Type-check (frontend only) + if: matrix.app == 'frontend' + run: npm run type-check + + # ────────────────────────────────────────── + # 2. 
Unit Tests + # ────────────────────────────────────────── + unit-tests: + name: Unit Tests (${{ matrix.app }}) + runs-on: ubuntu-latest + needs: quality + strategy: + fail-fast: true + matrix: + app: [backend, frontend] + + defaults: + run: + working-directory: apps/${{ matrix.app }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Run unit tests + run: npm test -- --passWithNoTests --coverage + + # ────────────────────────────────────────── + # 3. Integration Tests + # ────────────────────────────────────────── + integration-tests: + name: Integration Tests + runs-on: ubuntu-latest + needs: unit-tests + + defaults: + run: + working-directory: apps/backend + + services: + postgres: + image: postgres:15-alpine + env: + POSTGRES_USER: xpeditis_test + POSTGRES_PASSWORD: xpeditis_test_password + POSTGRES_DB: xpeditis_test + options: >- + --health-cmd pg_isready + --health-interval 5s + --health-timeout 5s + --health-retries 10 + ports: + - 5432:5432 + + redis: + image: redis:7-alpine + options: >- + --health-cmd "redis-cli ping" + --health-interval 5s + --health-timeout 5s + --health-retries 10 + ports: + - 6379:6379 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/backend/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Run integration tests + env: + NODE_ENV: test + DATABASE_HOST: localhost + DATABASE_PORT: 5432 + DATABASE_USER: xpeditis_test + DATABASE_PASSWORD: xpeditis_test_password + DATABASE_NAME: xpeditis_test + DATABASE_SYNCHRONIZE: false + REDIS_HOST: localhost + REDIS_PORT: 6379 + REDIS_PASSWORD: '' + JWT_SECRET: 
test-secret-key-for-ci-only + SMTP_HOST: localhost + SMTP_PORT: 1025 + SMTP_FROM: test@xpeditis.com + run: npm run test:integration -- --passWithNoTests + + # ────────────────────────────────────────── + # 4a. Docker Build & Push — Backend + # ────────────────────────────────────────── + build-backend: + name: Build & Push Backend + runs-on: ubuntu-latest + needs: integration-tests + outputs: + image-tag: ${{ steps.sha.outputs.short }} + + steps: + - uses: actions/checkout@v4 + + - name: Compute short SHA + id: sha + run: echo "short=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Scaleway Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: nologin + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Build and push Backend image + uses: docker/build-push-action@v5 + with: + context: ./apps/backend + file: ./apps/backend/Dockerfile + push: true + # Tag with branch name AND commit SHA for traceability and prod promotion + tags: | + ${{ env.REGISTRY }}/xpeditis-backend:preprod + ${{ env.REGISTRY }}/xpeditis-backend:preprod-${{ steps.sha.outputs.short }} + cache-from: type=registry,ref=${{ env.REGISTRY }}/xpeditis-backend:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/xpeditis-backend:buildcache,mode=max + platforms: linux/amd64,linux/arm64 + + # ────────────────────────────────────────── + # 4b. 
Docker Build & Push — Frontend + # ────────────────────────────────────────── + build-frontend: + name: Build & Push Frontend + runs-on: ubuntu-latest + needs: integration-tests + outputs: + image-tag: ${{ steps.sha.outputs.short }} + + steps: + - uses: actions/checkout@v4 + + - name: Compute short SHA + id: sha + run: echo "short=$(echo ${{ github.sha }} | cut -c1-7)" >> $GITHUB_OUTPUT + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Scaleway Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: nologin + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Build and push Frontend image + uses: docker/build-push-action@v5 + with: + context: ./apps/frontend + file: ./apps/frontend/Dockerfile + push: true + tags: | + ${{ env.REGISTRY }}/xpeditis-frontend:preprod + ${{ env.REGISTRY }}/xpeditis-frontend:preprod-${{ steps.sha.outputs.short }} + cache-from: type=registry,ref=${{ env.REGISTRY }}/xpeditis-frontend:buildcache + cache-to: type=registry,ref=${{ env.REGISTRY }}/xpeditis-frontend:buildcache,mode=max + platforms: linux/amd64,linux/arm64 + build-args: | + NEXT_PUBLIC_API_URL=${{ secrets.NEXT_PUBLIC_API_URL }} + NEXT_PUBLIC_APP_URL=${{ secrets.NEXT_PUBLIC_APP_URL }} + + # ────────────────────────────────────────── + # 5. Deploy to Preprod via Portainer + # ────────────────────────────────────────── + deploy: + name: Deploy to Preprod + runs-on: ubuntu-latest + needs: [build-backend, build-frontend] + environment: preprod + + steps: + - name: Trigger Backend deployment + run: | + echo "Deploying backend (preprod-${{ needs.build-backend.outputs.image-tag }})..." + curl -sf -X POST \ + -H "Content-Type: application/json" \ + "${{ secrets.PORTAINER_WEBHOOK_BACKEND }}" + echo "Backend webhook triggered." 
+ + - name: Wait for backend to stabilize + run: sleep 20 + + - name: Trigger Frontend deployment + run: | + echo "Deploying frontend (preprod-${{ needs.build-frontend.outputs.image-tag }})..." + curl -sf -X POST \ + -H "Content-Type: application/json" \ + "${{ secrets.PORTAINER_WEBHOOK_FRONTEND }}" + echo "Frontend webhook triggered." + + # ────────────────────────────────────────── + # 6. Smoke Tests — verify preprod is healthy + # ────────────────────────────────────────── + smoke-tests: + name: Smoke Tests + runs-on: ubuntu-latest + needs: deploy + + steps: + - name: Wait for services to start + run: sleep 40 + + - name: Health check — Backend + run: | + echo "Checking backend health..." + for i in {1..10}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ + --max-time 10 \ + "${{ secrets.PREPROD_BACKEND_URL }}/health" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then + echo "Backend is healthy." + exit 0 + fi + sleep 15 + done + echo "Backend health check failed after 10 attempts." + exit 1 + + - name: Health check — Frontend + run: | + echo "Checking frontend health..." + for i in {1..10}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ + --max-time 10 \ + "${{ secrets.PREPROD_FRONTEND_URL }}" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then + echo "Frontend is healthy." + exit 0 + fi + sleep 15 + done + echo "Frontend health check failed after 10 attempts." + exit 1 + + # ────────────────────────────────────────── + # 7. 
Deployment Summary + # ────────────────────────────────────────── + summary: + name: Deployment Summary + runs-on: ubuntu-latest + needs: [build-backend, build-frontend, smoke-tests] + if: success() + + steps: + - name: Write summary + run: | + echo "## Preprod Deployment" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| | |" >> $GITHUB_STEP_SUMMARY + echo "|---|---|" >> $GITHUB_STEP_SUMMARY + echo "| **Commit** | \`${{ github.sha }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| **Backend image** | \`${{ env.REGISTRY }}/xpeditis-backend:preprod-${{ needs.build-backend.outputs.image-tag }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| **Frontend image** | \`${{ env.REGISTRY }}/xpeditis-frontend:preprod-${{ needs.build-frontend.outputs.image-tag }}\` |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "To promote this exact build to production, merge this commit to \`main\`." >> $GITHUB_STEP_SUMMARY + + # ────────────────────────────────────────── + # Discord — Success + # ────────────────────────────────────────── + notify-success: + name: Notify Success + runs-on: ubuntu-latest + needs: [build-backend, build-frontend, smoke-tests] + if: success() + + steps: + - name: Send Discord notification + run: | + curl -s -H "Content-Type: application/json" -d '{ + "embeds": [{ + "title": "✅ Preprod Deployed & Healthy", + "color": 3066993, + "fields": [ + {"name": "Author", "value": "${{ github.actor }}", "inline": true}, + {"name": "Commit", "value": "[`${{ github.sha }}`](${{ github.event.head_commit.url }})", "inline": true}, + {"name": "Backend", "value": "`preprod-${{ needs.build-backend.outputs.image-tag }}`", "inline": false}, + {"name": "Frontend", "value": "`preprod-${{ needs.build-frontend.outputs.image-tag }}`", "inline": false}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Preprod"} + 
}] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} + + # ────────────────────────────────────────── + # Discord — Failure + # ────────────────────────────────────────── + notify-failure: + name: Notify Failure + runs-on: ubuntu-latest + needs: [quality, unit-tests, integration-tests, build-backend, build-frontend, deploy, smoke-tests] + if: failure() + + steps: + - name: Send Discord notification + run: | + curl -s -H "Content-Type: application/json" -d '{ + "embeds": [{ + "title": "❌ Preprod Pipeline Failed", + "description": "Preprod was NOT deployed. Fix the issue before retrying.", + "color": 15158332, + "fields": [ + {"name": "Author", "value": "${{ github.actor }}", "inline": true}, + {"name": "Commit", "value": "[`${{ github.sha }}`](${{ github.event.head_commit.url }})", "inline": true}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Preprod"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b927860..4a13643 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,199 +1,112 @@ -name: CI +name: Dev CI + +# Fast feedback loop for the dev branch. +# Runs lint + unit tests only — no Docker build, no deployment. 
on: push: - branches: [main, dev] + branches: [dev] pull_request: - branches: [main, dev] + branches: [dev] + +concurrency: + group: dev-ci-${{ github.ref }} + cancel-in-progress: true + +env: + NODE_VERSION: '20' jobs: - lint-and-format: - name: Lint & Format Check + # ────────────────────────────────────────── + # Lint & Type-check + # ────────────────────────────────────────── + quality: + name: Quality (${{ matrix.app }}) runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + app: [backend, frontend] + + defaults: + run: + working-directory: apps/${{ matrix.app }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: ${{ env.NODE_VERSION }} cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json - name: Install dependencies - run: npm ci + run: npm ci --legacy-peer-deps - - name: Run Prettier check - run: npm run format:check + - name: Lint + run: npm run lint - - name: Lint backend - run: npm run backend:lint --workspace=apps/backend + - name: Type-check (frontend only) + if: matrix.app == 'frontend' + run: npm run type-check - - name: Lint frontend - run: npm run frontend:lint --workspace=apps/frontend - - test-backend: - name: Test Backend + # ────────────────────────────────────────── + # Unit Tests + # ────────────────────────────────────────── + unit-tests: + name: Unit Tests (${{ matrix.app }}) runs-on: ubuntu-latest + needs: quality + strategy: + fail-fast: false + matrix: + app: [backend, frontend] - services: - postgres: - image: postgres:15-alpine - env: - POSTGRES_USER: xpeditis_test - POSTGRES_PASSWORD: xpeditis_test - POSTGRES_DB: xpeditis_test - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - redis: - image: redis:7-alpine - options: >- - --health-cmd "redis-cli ping" - --health-interval 10s - 
--health-timeout 5s - --health-retries 5 - ports: - - 6379:6379 + defaults: + run: + working-directory: apps/${{ matrix.app }} steps: - - name: Checkout code - uses: actions/checkout@v4 + - uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: ${{ env.NODE_VERSION }} cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json - name: Install dependencies - run: npm ci + run: npm ci --legacy-peer-deps - - name: Run backend unit tests - working-directory: apps/backend - env: - NODE_ENV: test - DATABASE_HOST: localhost - DATABASE_PORT: 5432 - DATABASE_USER: xpeditis_test - DATABASE_PASSWORD: xpeditis_test - DATABASE_NAME: xpeditis_test - REDIS_HOST: localhost - REDIS_PORT: 6379 - REDIS_PASSWORD: '' - JWT_SECRET: test-jwt-secret - run: npm run test + - name: Run unit tests + run: npm test -- --passWithNoTests --coverage - - name: Run backend E2E tests - working-directory: apps/backend - env: - NODE_ENV: test - DATABASE_HOST: localhost - DATABASE_PORT: 5432 - DATABASE_USER: xpeditis_test - DATABASE_PASSWORD: xpeditis_test - DATABASE_NAME: xpeditis_test - REDIS_HOST: localhost - REDIS_PORT: 6379 - REDIS_PASSWORD: '' - JWT_SECRET: test-jwt-secret - run: npm run test:e2e - - - name: Upload backend coverage - uses: codecov/codecov-action@v3 - with: - files: ./apps/backend/coverage/lcov.info - flags: backend - name: backend-coverage - - test-frontend: - name: Test Frontend + # ────────────────────────────────────────── + # Discord notification on failure + # ────────────────────────────────────────── + notify-failure: + name: Notify Failure runs-on: ubuntu-latest + needs: [quality, unit-tests] + if: failure() steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - cache: 'npm' - - - name: Install dependencies - run: npm ci - - - name: Run frontend tests - working-directory: apps/frontend - run: npm run 
test - - - name: Upload frontend coverage - uses: codecov/codecov-action@v3 - with: - files: ./apps/frontend/coverage/lcov.info - flags: frontend - name: frontend-coverage - - build-backend: - name: Build Backend - runs-on: ubuntu-latest - needs: [lint-and-format, test-backend] - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - cache: 'npm' - - - name: Install dependencies - run: npm ci - - - name: Build backend - working-directory: apps/backend - run: npm run build - - - name: Upload build artifacts - uses: actions/upload-artifact@v4 - with: - name: backend-dist - path: apps/backend/dist - - build-frontend: - name: Build Frontend - runs-on: ubuntu-latest - needs: [lint-and-format, test-frontend] - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: '20' - cache: 'npm' - - - name: Install dependencies - run: npm ci - - - name: Build frontend - working-directory: apps/frontend - env: - NEXT_PUBLIC_API_URL: ${{ secrets.NEXT_PUBLIC_API_URL || 'http://localhost:4000' }} - run: npm run build - - - name: Upload build artifacts - uses: actions/upload-artifact@v4 - with: - name: frontend-build - path: apps/frontend/.next + - name: Send Discord notification + run: | + curl -s -H "Content-Type: application/json" -d '{ + "embeds": [{ + "title": "❌ Dev CI Failed", + "description": "Fix the issues before merging to preprod.", + "color": 15158332, + "fields": [ + {"name": "Branch", "value": "`${{ github.ref_name }}`", "inline": true}, + {"name": "Author", "value": "${{ github.actor }}", "inline": true}, + {"name": "Commit", "value": "[`${{ github.sha }}`](${{ github.event.head_commit.url }})", "inline": false}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": 
{"text": "Xpeditis CI/CD"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} diff --git a/.github/workflows/pr-checks.yml b/.github/workflows/pr-checks.yml new file mode 100644 index 0000000..6b894d3 --- /dev/null +++ b/.github/workflows/pr-checks.yml @@ -0,0 +1,176 @@ +name: PR Checks + +# Validation gate for pull requests. +# PRs to preprod → lint + unit tests + integration tests +# PRs to main → lint + unit tests only (code was integration-tested in preprod already) +# +# Configure these as required status checks in GitHub branch protection rules. + +on: + pull_request: + branches: [preprod, main] + +concurrency: + group: pr-${{ github.event.pull_request.number }} + cancel-in-progress: true + +env: + NODE_VERSION: '20' + +jobs: + # ────────────────────────────────────────── + # Lint & Type-check (both apps, parallel) + # ────────────────────────────────────────── + quality: + name: Quality (${{ matrix.app }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + app: [backend, frontend] + + defaults: + run: + working-directory: apps/${{ matrix.app }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Lint + run: npm run lint + + - name: Type-check (frontend only) + if: matrix.app == 'frontend' + run: npm run type-check + + # ────────────────────────────────────────── + # Unit Tests (both apps, parallel) + # ────────────────────────────────────────── + unit-tests: + name: Unit Tests (${{ matrix.app }}) + runs-on: ubuntu-latest + needs: quality + strategy: + fail-fast: false + matrix: + app: [backend, frontend] + + defaults: + run: + working-directory: apps/${{ matrix.app }} + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ 
env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/${{ matrix.app }}/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Run unit tests + run: npm test -- --passWithNoTests --coverage + + # ────────────────────────────────────────── + # Integration Tests — PRs to preprod only + # ────────────────────────────────────────── + integration-tests: + name: Integration Tests + runs-on: ubuntu-latest + needs: unit-tests + if: github.base_ref == 'preprod' + + defaults: + run: + working-directory: apps/backend + + services: + postgres: + image: postgres:15-alpine + env: + POSTGRES_USER: xpeditis_test + POSTGRES_PASSWORD: xpeditis_test_password + POSTGRES_DB: xpeditis_test + options: >- + --health-cmd pg_isready + --health-interval 5s + --health-timeout 5s + --health-retries 10 + ports: + - 5432:5432 + + redis: + image: redis:7-alpine + options: >- + --health-cmd "redis-cli ping" + --health-interval 5s + --health-timeout 5s + --health-retries 10 + ports: + - 6379:6379 + + steps: + - uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: ${{ env.NODE_VERSION }} + cache: 'npm' + cache-dependency-path: apps/backend/package-lock.json + + - name: Install dependencies + run: npm ci --legacy-peer-deps + + - name: Run integration tests + env: + NODE_ENV: test + DATABASE_HOST: localhost + DATABASE_PORT: 5432 + DATABASE_USER: xpeditis_test + DATABASE_PASSWORD: xpeditis_test_password + DATABASE_NAME: xpeditis_test + DATABASE_SYNCHRONIZE: false + REDIS_HOST: localhost + REDIS_PORT: 6379 + REDIS_PASSWORD: '' + JWT_SECRET: test-secret-key-for-ci-only + SMTP_HOST: localhost + SMTP_PORT: 1025 + SMTP_FROM: test@xpeditis.com + run: npm run test:integration -- --passWithNoTests + + # ────────────────────────────────────────── + # PR Summary + # ────────────────────────────────────────── + pr-summary: + name: PR Summary + runs-on: ubuntu-latest + needs: [quality, unit-tests] + if: 
always() + + steps: + - name: Write job summary + run: | + echo "## PR Check Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Check | Status |" >> $GITHUB_STEP_SUMMARY + echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| Quality (lint + type-check) | ${{ needs.quality.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Unit Tests | ${{ needs.unit-tests.result }} |" >> $GITHUB_STEP_SUMMARY + echo "| Target Branch | \`${{ github.base_ref }}\` |" >> $GITHUB_STEP_SUMMARY + echo "| Author | ${{ github.actor }} |" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/rollback.yml b/.github/workflows/rollback.yml new file mode 100644 index 0000000..ea82bd9 --- /dev/null +++ b/.github/workflows/rollback.yml @@ -0,0 +1,317 @@ +name: Rollback + +# Emergency rollback for production (Hetzner k3s) and preprod (Portainer). +# All images are on Scaleway registry. +# +# Production (k3s): +# Option A — "previous": kubectl rollout undo (instant, reverts to last ReplicaSet) +# Option B — "specific-version": kubectl set image to a Scaleway prod-SHA tag +# +# Preprod (Portainer): +# Re-tags a Scaleway preprod-SHA image back to :preprod, triggers Portainer webhook. +# +# Run from: GitHub Actions → Workflows → Rollback → Run workflow + +on: + workflow_dispatch: + inputs: + environment: + description: 'Target environment' + required: true + type: choice + options: + - production + - preprod + strategy: + description: 'Rollback strategy ("previous" = kubectl rollout undo, prod only)' + required: true + type: choice + options: + - previous + - specific-version + version_tag: + description: 'Tag for specific-version (e.g. 
prod-a1b2c3d or preprod-a1b2c3d)'
+        required: false
+        type: string
+      reason:
+        description: 'Reason for rollback (audit trail)'
+        required: true
+        type: string
+
+env:
+  REGISTRY: rg.fr-par.scw.cloud/weworkstudio
+  IMAGE_BACKEND: rg.fr-par.scw.cloud/weworkstudio/xpeditis-backend
+  IMAGE_FRONTEND: rg.fr-par.scw.cloud/weworkstudio/xpeditis-frontend
+  K8S_NAMESPACE: xpeditis-prod
+
+jobs:
+  # ──────────────────────────────────────────
+  # Validate inputs
+  # ──────────────────────────────────────────
+  validate:
+    name: Validate Inputs
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Validate
+        run: |
+          ENV="${{ github.event.inputs.environment }}"
+          STRATEGY="${{ github.event.inputs.strategy }}"
+          TAG="${{ github.event.inputs.version_tag }}"
+
+          if [ "$STRATEGY" = "specific-version" ] && [ -z "$TAG" ]; then
+            echo "ERROR: version_tag is required for specific-version strategy."
+            exit 1
+          fi
+
+          # "previous" relies on kubectl rollout undo, which only exists for the
+          # k3s production deployments — preprod (Portainer) must pin a tag.
+          if [ "$STRATEGY" = "previous" ] && [ "$ENV" = "preprod" ]; then
+            echo "ERROR: 'previous' strategy is production-only (kubectl rollout undo)."
+            echo "       For preprod, use specific-version with a preprod-SHA tag."
+            exit 1
+          fi
+
+          if [ "$STRATEGY" = "specific-version" ] && [ "$ENV" = "production" ]; then
+            if [[ ! "$TAG" =~ ^prod- ]]; then
+              echo "ERROR: Production tag must start with 'prod-' (got: $TAG)"
+              exit 1
+            fi
+          fi
+
+          if [ "$STRATEGY" = "specific-version" ] && [ "$ENV" = "preprod" ]; then
+            if [[ ! "$TAG" =~ ^preprod- ]]; then
+              echo "ERROR: Preprod tag must start with 'preprod-' (got: $TAG)"
+              exit 1
+            fi
+          fi
+
+          echo "Validation passed." 
+ echo " Environment : $ENV" + echo " Strategy : $STRATEGY" + echo " Version : ${TAG:-N/A (previous)}" + echo " Reason : ${{ github.event.inputs.reason }}" + + # ────────────────────────────────────────── + # PRODUCTION ROLLBACK — k3s via kubectl + # ────────────────────────────────────────── + rollback-production: + name: Rollback Production (k3s) + runs-on: ubuntu-latest + needs: validate + if: github.event.inputs.environment == 'production' + environment: + name: production + url: https://app.xpeditis.com + + steps: + - name: Configure kubectl + run: | + mkdir -p ~/.kube + echo "${{ secrets.HETZNER_KUBECONFIG }}" | base64 -d > ~/.kube/config + chmod 600 ~/.kube/config + kubectl cluster-info + + # ── Strategy A: kubectl rollout undo (fastest) + - name: Rollback to previous version + if: github.event.inputs.strategy == 'previous' + run: | + echo "Rolling back backend..." + kubectl rollout undo deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} --timeout=180s + + echo "Rolling back frontend..." 
+ kubectl rollout undo deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} --timeout=180s + + kubectl get pods -n ${{ env.K8S_NAMESPACE }} + + # ── Strategy B: kubectl set image to specific Scaleway tag + - name: Set up Docker Buildx (for image inspect) + if: github.event.inputs.strategy == 'specific-version' + uses: docker/setup-buildx-action@v3 + + - name: Login to Scaleway Registry + if: github.event.inputs.strategy == 'specific-version' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: nologin + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Rollback to specific version + if: github.event.inputs.strategy == 'specific-version' + run: | + TAG="${{ github.event.inputs.version_tag }}" + BACKEND_IMAGE="${{ env.IMAGE_BACKEND }}:${TAG}" + FRONTEND_IMAGE="${{ env.IMAGE_FRONTEND }}:${TAG}" + + echo "Verifying images exist in Scaleway..." + docker buildx imagetools inspect "$BACKEND_IMAGE" || \ + { echo "ERROR: Backend image not found: $BACKEND_IMAGE"; exit 1; } + docker buildx imagetools inspect "$FRONTEND_IMAGE" || \ + { echo "ERROR: Frontend image not found: $FRONTEND_IMAGE"; exit 1; } + + echo "Deploying backend: $BACKEND_IMAGE" + kubectl set image deployment/xpeditis-backend backend="$BACKEND_IMAGE" -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} --timeout=180s + + echo "Deploying frontend: $FRONTEND_IMAGE" + kubectl set image deployment/xpeditis-frontend frontend="$FRONTEND_IMAGE" -n ${{ env.K8S_NAMESPACE }} + kubectl rollout status deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} --timeout=180s + + kubectl get pods -n ${{ env.K8S_NAMESPACE }} + + - name: Show rollout history + if: always() + run: | + echo "=== Backend rollout history ===" + kubectl rollout history deployment/xpeditis-backend -n ${{ env.K8S_NAMESPACE }} || true + echo "=== Frontend rollout history ===" + 
kubectl rollout history deployment/xpeditis-frontend -n ${{ env.K8S_NAMESPACE }} || true + + # ────────────────────────────────────────── + # PREPROD ROLLBACK — Scaleway re-tag + Portainer + # ────────────────────────────────────────── + rollback-preprod: + name: Rollback Preprod (Portainer) + runs-on: ubuntu-latest + needs: validate + if: github.event.inputs.environment == 'preprod' + + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Scaleway Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: nologin + password: ${{ secrets.REGISTRY_TOKEN }} + + - name: Verify target images exist + run: | + TAG="${{ github.event.inputs.version_tag }}" + docker buildx imagetools inspect "${{ env.IMAGE_BACKEND }}:${TAG}" || \ + { echo "ERROR: Backend image not found: $TAG"; exit 1; } + docker buildx imagetools inspect "${{ env.IMAGE_FRONTEND }}:${TAG}" || \ + { echo "ERROR: Frontend image not found: $TAG"; exit 1; } + + - name: Re-tag target version as preprod + run: | + TAG="${{ github.event.inputs.version_tag }}" + echo "Re-tagging $TAG → preprod..." + docker buildx imagetools create \ + --tag ${{ env.IMAGE_BACKEND }}:preprod \ + ${{ env.IMAGE_BACKEND }}:${TAG} + docker buildx imagetools create \ + --tag ${{ env.IMAGE_FRONTEND }}:preprod \ + ${{ env.IMAGE_FRONTEND }}:${TAG} + echo "Re-tag complete." + + - name: Trigger Backend deployment (Portainer) + run: | + curl -sf -X POST -H "Content-Type: application/json" \ + "${{ secrets.PORTAINER_WEBHOOK_BACKEND }}" + echo "Backend webhook triggered." + + - name: Wait for backend + run: sleep 20 + + - name: Trigger Frontend deployment (Portainer) + run: | + curl -sf -X POST -H "Content-Type: application/json" \ + "${{ secrets.PORTAINER_WEBHOOK_FRONTEND }}" + echo "Frontend webhook triggered." 
+ + # ────────────────────────────────────────── + # Smoke Tests + # ────────────────────────────────────────── + smoke-tests: + name: Smoke Tests + runs-on: ubuntu-latest + needs: [rollback-production, rollback-preprod] + if: always() && (needs.rollback-production.result == 'success' || needs.rollback-preprod.result == 'success') + + steps: + - name: Set health check URLs + id: urls + run: | + if [ "${{ github.event.inputs.environment }}" = "production" ]; then + echo "backend=${{ secrets.PROD_BACKEND_URL }}/api/v1/health" >> $GITHUB_OUTPUT + echo "frontend=${{ secrets.PROD_FRONTEND_URL }}" >> $GITHUB_OUTPUT + echo "wait=30" >> $GITHUB_OUTPUT + else + echo "backend=${{ secrets.PREPROD_BACKEND_URL }}/api/v1/health" >> $GITHUB_OUTPUT + echo "frontend=${{ secrets.PREPROD_FRONTEND_URL }}" >> $GITHUB_OUTPUT + echo "wait=60" >> $GITHUB_OUTPUT + fi + + - name: Wait for services + run: sleep ${{ steps.urls.outputs.wait }} + + - name: Health check — Backend + run: | + for i in {1..12}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \ + "${{ steps.urls.outputs.backend }}" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then echo "Backend healthy."; exit 0; fi + sleep 15 + done + echo "CRITICAL: Backend unhealthy after rollback." + exit 1 + + - name: Health check — Frontend + run: | + for i in {1..12}; do + STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \ + "${{ steps.urls.outputs.frontend }}" 2>/dev/null || echo "000") + echo " Attempt $i: HTTP $STATUS" + if [ "$STATUS" = "200" ]; then echo "Frontend healthy."; exit 0; fi + sleep 15 + done + echo "CRITICAL: Frontend unhealthy after rollback." 
+ exit 1 + + # ────────────────────────────────────────── + # Discord notification + # ────────────────────────────────────────── + notify: + name: Notify Rollback Result + runs-on: ubuntu-latest + needs: [rollback-production, rollback-preprod, smoke-tests] + if: always() + + steps: + - name: Notify success + if: needs.smoke-tests.result == 'success' + run: | + curl -s -H "Content-Type: application/json" -d '{ + "embeds": [{ + "title": "↩️ Rollback Successful", + "color": 16776960, + "fields": [ + {"name": "Environment", "value": "`${{ github.event.inputs.environment }}`", "inline": true}, + {"name": "Strategy", "value": "`${{ github.event.inputs.strategy }}`", "inline": true}, + {"name": "Version", "value": "`${{ github.event.inputs.version_tag || 'previous' }}`", "inline": true}, + {"name": "Triggered by", "value": "${{ github.actor }}", "inline": true}, + {"name": "Reason", "value": "${{ github.event.inputs.reason }}", "inline": false}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Rollback"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} + + - name: Notify failure + if: needs.smoke-tests.result != 'success' + run: | + curl -s -H "Content-Type: application/json" -d '{ + "content": "@here ROLLBACK FAILED — MANUAL INTERVENTION REQUIRED", + "embeds": [{ + "title": "🔴 Rollback Failed", + "description": "Service may be degraded. 
Escalate immediately.", + "color": 15158332, + "fields": [ + {"name": "Environment", "value": "`${{ github.event.inputs.environment }}`", "inline": true}, + {"name": "Attempted version", "value": "`${{ github.event.inputs.version_tag || 'previous' }}`", "inline": true}, + {"name": "Triggered by", "value": "${{ github.actor }}", "inline": true}, + {"name": "Reason", "value": "${{ github.event.inputs.reason }}", "inline": false}, + {"name": "Workflow", "value": "[${{ github.workflow }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})", "inline": false} + ], + "footer": {"text": "Xpeditis CI/CD • Rollback"} + }] + }' ${{ secrets.DISCORD_WEBHOOK_URL }} diff --git a/docs/deployment/hetzner/09-kubernetes-manifests.md b/docs/deployment/hetzner/09-kubernetes-manifests.md new file mode 100644 index 0000000..d2a9508 --- /dev/null +++ b/docs/deployment/hetzner/09-kubernetes-manifests.md @@ -0,0 +1,765 @@ +# 09 — Manifests Kubernetes complets + +Tous les fichiers YAML de déploiement de Xpeditis. Créez un dossier `k8s/` à la racine du projet. + +--- + +## Structure des fichiers + +``` +k8s/ +├── 00-namespaces.yaml +├── 01-secrets.yaml # ← À remplir avec vos valeurs (ne pas committer) +├── 02-configmaps.yaml +├── 03-backend-deployment.yaml +├── 04-backend-service.yaml +├── 05-frontend-deployment.yaml +├── 06-frontend-service.yaml +├── 07-ingress.yaml +├── 08-hpa.yaml +└── 09-pdb.yaml +``` + +--- + +## 00 — Namespaces + +```yaml +# k8s/00-namespaces.yaml +--- +apiVersion: v1 +kind: Namespace +metadata: + name: xpeditis-prod + labels: + environment: production + app.kubernetes.io/managed-by: hetzner-k3s +``` + +```bash +kubectl apply -f k8s/00-namespaces.yaml +``` + +--- + +## 01 — Secrets (⚠️ ne jamais committer ce fichier dans Git) + +Ajoutez `k8s/01-secrets.yaml` à votre `.gitignore`. 
+ +```yaml +# k8s/01-secrets.yaml ← AJOUTER AU .gitignore +--- +apiVersion: v1 +kind: Secret +metadata: + name: backend-secrets + namespace: xpeditis-prod +type: Opaque +stringData: + # Application + NODE_ENV: "production" + PORT: "4000" + API_PREFIX: "api/v1" + APP_URL: "https://app.xpeditis.com" + FRONTEND_URL: "https://app.xpeditis.com" + + # Base de données (choisir Option A ou B) + # === Option A : Neon.tech === + DATABASE_HOST: "ep-xxx.eu-central-1.aws.neon.tech" + DATABASE_PORT: "5432" + DATABASE_USER: "xpeditis" + DATABASE_PASSWORD: "" + DATABASE_NAME: "xpeditis" + DATABASE_SSL: "true" + DATABASE_SYNC: "false" + DATABASE_LOGGING: "false" + # === Option B : Self-hosted === + # DATABASE_HOST: "10.0.1.100" # IP privée Hetzner du serveur PG + # DATABASE_PORT: "6432" # PgBouncer + # DATABASE_USER: "xpeditis" + # DATABASE_PASSWORD: "" + # DATABASE_NAME: "xpeditis_prod" + # DATABASE_SYNC: "false" + # DATABASE_LOGGING: "false" + + # Redis (choisir Option A ou B) + # === Option A : Upstash === + REDIS_HOST: "your-redis.upstash.io" + REDIS_PORT: "6379" + REDIS_PASSWORD: "" + REDIS_DB: "0" + # === Option B : Self-hosted === + # REDIS_HOST: "redis.xpeditis-prod.svc.cluster.local" + # REDIS_PORT: "6379" + # REDIS_PASSWORD: "" + # REDIS_DB: "0" + + # JWT + JWT_SECRET: "" + JWT_ACCESS_EXPIRATION: "15m" + JWT_REFRESH_EXPIRATION: "7d" + + # OAuth2 Google + GOOGLE_CLIENT_ID: "" + GOOGLE_CLIENT_SECRET: "" + GOOGLE_CALLBACK_URL: "https://api.xpeditis.com/api/v1/auth/google/callback" + + # OAuth2 Microsoft + MICROSOFT_CLIENT_ID: "" + MICROSOFT_CLIENT_SECRET: "" + MICROSOFT_CALLBACK_URL: "https://api.xpeditis.com/api/v1/auth/microsoft/callback" + + # Email (Brevo SMTP — remplace SendGrid) + SMTP_HOST: "smtp-relay.brevo.com" + SMTP_PORT: "587" + SMTP_SECURE: "false" + SMTP_USER: "" + SMTP_PASS: "" + SMTP_FROM: "noreply@xpeditis.com" + + # Hetzner Object Storage (remplace MinIO) + AWS_S3_ENDPOINT: "https://fsn1.your-objectstorage.com" + AWS_ACCESS_KEY_ID: "" + 
AWS_SECRET_ACCESS_KEY: "" + AWS_REGION: "eu-central-1" + AWS_S3_BUCKET: "xpeditis-prod" + + # Carrier APIs + MAERSK_API_KEY: "" + MAERSK_API_URL: "https://api.maersk.com/v1" + MSC_API_KEY: "" + MSC_API_URL: "https://api.msc.com/v1" + CMACGM_API_URL: "https://api.cma-cgm.com/v1" + CMACGM_CLIENT_ID: "" + CMACGM_CLIENT_SECRET: "" + HAPAG_API_URL: "https://api.hapag-lloyd.com/v1" + HAPAG_API_KEY: "" + ONE_API_URL: "https://api.one-line.com/v1" + ONE_USERNAME: "" + ONE_PASSWORD: "" + + # Stripe + STRIPE_SECRET_KEY: "sk_live_<...>" + STRIPE_WEBHOOK_SECRET: "whsec_<...>" + STRIPE_SILVER_MONTHLY_PRICE_ID: "price_<...>" + STRIPE_SILVER_YEARLY_PRICE_ID: "price_<...>" + STRIPE_GOLD_MONTHLY_PRICE_ID: "price_<...>" + STRIPE_GOLD_YEARLY_PRICE_ID: "price_<...>" + STRIPE_PLATINIUM_MONTHLY_PRICE_ID: "price_<...>" + STRIPE_PLATINIUM_YEARLY_PRICE_ID: "price_<...>" + + # Sécurité + BCRYPT_ROUNDS: "12" + SESSION_TIMEOUT_MS: "7200000" + RATE_LIMIT_TTL: "60" + RATE_LIMIT_MAX: "100" + + # Monitoring + SENTRY_DSN: "" +--- +apiVersion: v1 +kind: Secret +metadata: + name: frontend-secrets + namespace: xpeditis-prod +type: Opaque +stringData: + NEXT_PUBLIC_API_URL: "https://api.xpeditis.com" + NEXT_PUBLIC_APP_URL: "https://app.xpeditis.com" + NEXT_PUBLIC_API_PREFIX: "api/v1" + NEXTAUTH_URL: "https://app.xpeditis.com" + NEXTAUTH_SECRET: "" + GOOGLE_CLIENT_ID: "" + GOOGLE_CLIENT_SECRET: "" + MICROSOFT_CLIENT_ID: "" + MICROSOFT_CLIENT_SECRET: "" + NODE_ENV: "production" +``` + +```bash +# Générer les secrets aléatoires +echo "JWT_SECRET=$(openssl rand -base64 48)" +echo "NEXTAUTH_SECRET=$(openssl rand -base64 24)" + +# Appliquer (après avoir rempli les valeurs) +kubectl apply -f k8s/01-secrets.yaml + +# Vérifier (sans voir les valeurs) +kubectl get secret backend-secrets -n xpeditis-prod -o jsonpath='{.data}' | jq 'keys' +``` + +--- + +## 02 — ConfigMaps (variables non-sensibles) + +```yaml +# k8s/02-configmaps.yaml +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: backend-config + 
namespace: xpeditis-prod +data: + # Ces valeurs ne sont pas sensibles + LOG_LEVEL: "info" + TZ: "Europe/Paris" + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: frontend-config + namespace: xpeditis-prod +data: + TZ: "Europe/Paris" +``` + +--- + +## 03 — Deployment Backend NestJS + +```yaml +# k8s/03-backend-deployment.yaml +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: xpeditis-backend + namespace: xpeditis-prod + labels: + app: xpeditis-backend + version: "latest" +spec: + replicas: 2 + selector: + matchLabels: + app: xpeditis-backend + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 # Zero downtime deployment + template: + metadata: + labels: + app: xpeditis-backend + version: "latest" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "4000" + prometheus.io/path: "/api/v1/health" + spec: + # Anti-affinité : pods sur nœuds différents + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - xpeditis-backend + topologyKey: kubernetes.io/hostname + + # Temps de grâce pour les connexions WebSocket + terminationGracePeriodSeconds: 60 + + containers: + - name: backend + # L'image est mise à jour par le CI/CD (doc 11) + image: ghcr.io//xpeditis-backend:latest + imagePullPolicy: Always + ports: + - containerPort: 4000 + name: http + protocol: TCP + + # Variables d'environnement depuis les Secrets + envFrom: + - secretRef: + name: backend-secrets + - configMapRef: + name: backend-config + + # Resources (MVP — ajuster selon les métriques réelles) + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "2000m" + memory: "1.5Gi" + + # Health checks + startupProbe: + httpGet: + path: /api/v1/health + port: 4000 + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 12 # 60 secondes max au démarrage + + readinessProbe: + httpGet: + 
path: /api/v1/health + port: 4000 + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + failureThreshold: 3 + + livenessProbe: + httpGet: + path: /api/v1/health + port: 4000 + initialDelaySeconds: 60 + periodSeconds: 30 + failureThreshold: 3 + + # Lifecycle hook pour graceful shutdown + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 10"] # Laisse le temps au LB de retirer le pod + + # Pull depuis GHCR (GitHub Container Registry) + imagePullSecrets: + - name: scaleway-registry + + # Redémarrage automatique + restartPolicy: Always +``` + +--- + +## 04 — Service Backend + +```yaml +# k8s/04-backend-service.yaml +--- +apiVersion: v1 +kind: Service +metadata: + name: xpeditis-backend + namespace: xpeditis-prod + labels: + app: xpeditis-backend +spec: + selector: + app: xpeditis-backend + ports: + - name: http + port: 4000 + targetPort: 4000 + protocol: TCP + type: ClusterIP +``` + +--- + +## 05 — Deployment Frontend Next.js + +```yaml +# k8s/05-frontend-deployment.yaml +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: xpeditis-frontend + namespace: xpeditis-prod + labels: + app: xpeditis-frontend +spec: + replicas: 1 + selector: + matchLabels: + app: xpeditis-frontend + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + app: xpeditis-frontend + spec: + terminationGracePeriodSeconds: 30 + + containers: + - name: frontend + image: ghcr.io//xpeditis-frontend:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + name: http + + envFrom: + - secretRef: + name: frontend-secrets + - configMapRef: + name: frontend-config + + resources: + requests: + cpu: "250m" + memory: "256Mi" + limits: + cpu: "1000m" + memory: "768Mi" + + startupProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 12 + + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 5 + periodSeconds: 10 + 
failureThreshold: 3
+
+          livenessProbe:
+            httpGet:
+              path: /
+              port: 3000
+            initialDelaySeconds: 30
+            periodSeconds: 30
+            failureThreshold: 3
+
+          lifecycle:
+            preStop:
+              exec:
+                command: ["/bin/sh", "-c", "sleep 5"]
+
+      imagePullSecrets:
+        - name: scaleway-registry
+
+      restartPolicy: Always
+```
+
+---
+
+## 06 — Service Frontend
+
+```yaml
+# k8s/06-frontend-service.yaml
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: xpeditis-frontend
+  namespace: xpeditis-prod
+  labels:
+    app: xpeditis-frontend
+spec:
+  selector:
+    app: xpeditis-frontend
+  ports:
+    - name: http
+      port: 3000
+      targetPort: 3000
+      protocol: TCP
+  type: ClusterIP
+```
+
+---
+
+## 07 — Ingress (Traefik + TLS)
+
+```yaml
+# k8s/07-ingress.yaml
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: xpeditis-ingress
+  namespace: xpeditis-prod
+  annotations:
+    # TLS via cert-manager
+    cert-manager.io/cluster-issuer: "letsencrypt-prod"
+
+    # Traefik config
+    traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+    traefik.ingress.kubernetes.io/router.tls: "true"
+
+    # Sticky sessions pour WebSocket Socket.IO
+    traefik.ingress.kubernetes.io/service.sticky.cookie: "true"
+    traefik.ingress.kubernetes.io/service.sticky.cookie.name: "XPEDITIS_BACKEND"
+    traefik.ingress.kubernetes.io/service.sticky.cookie.secure: "true"
+    traefik.ingress.kubernetes.io/service.sticky.cookie.httponly: "true"
+
+    # Middlewares Traefik : rate limiting + headers de sécurité.
+    # NB : une seule clé router.middlewares avec une liste séparée par des
+    # virgules — une clé YAML dupliquée serait silencieusement écrasée
+    # (le dernier doublon gagne) et le rate limiting ne s'appliquerait pas.
+    traefik.ingress.kubernetes.io/router.middlewares: "xpeditis-prod-ratelimit@kubernetescrd,xpeditis-prod-headers@kubernetescrd"
+
+spec:
+  ingressClassName: traefik
+  tls:
+    - hosts:
+        - api.xpeditis.com
+        - app.xpeditis.com
+      secretName: xpeditis-tls-prod
+
+  rules:
+    # API Backend NestJS
+    - host: api.xpeditis.com
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: xpeditis-backend
+                port:
+                  number: 4000
+
+    # Frontend 
Next.js + - host: app.xpeditis.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: xpeditis-frontend + port: + number: 3000 +--- +# Middleware : headers de sécurité +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: headers + namespace: xpeditis-prod +spec: + headers: + customRequestHeaders: + X-Forwarded-Proto: "https" + customResponseHeaders: + X-Frame-Options: "SAMEORIGIN" + X-Content-Type-Options: "nosniff" + X-XSS-Protection: "1; mode=block" + Referrer-Policy: "strict-origin-when-cross-origin" + Permissions-Policy: "geolocation=(), microphone=(), camera=()" + contentSecurityPolicy: "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline';" + stsSeconds: 31536000 + stsIncludeSubdomains: true + stsPreload: true +--- +# Middleware : rate limiting Traefik (en plus du rate limiting NestJS) +apiVersion: traefik.io/v1alpha1 +kind: Middleware +metadata: + name: ratelimit + namespace: xpeditis-prod +spec: + rateLimit: + average: 100 + burst: 50 + period: 1m + sourceCriterion: + ipStrategy: + depth: 1 +``` + +--- + +## 08 — Horizontal Pod Autoscaler + +```yaml +# k8s/08-hpa.yaml +--- +# HPA Backend +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: backend-hpa + namespace: xpeditis-prod +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: xpeditis-backend + minReplicas: 2 + maxReplicas: 15 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + behavior: + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Pods + value: 2 + periodSeconds: 60 + scaleDown: + stabilizationWindowSeconds: 300 # 5 min avant de réduire + policies: + - type: Pods + value: 1 + periodSeconds: 120 +--- +# HPA Frontend +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + 
name: frontend-hpa + namespace: xpeditis-prod +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: xpeditis-frontend + minReplicas: 1 + maxReplicas: 8 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 +``` + +--- + +## 09 — PodDisruptionBudget + +```yaml +# k8s/09-pdb.yaml +--- +# Garantit qu'au moins 1 pod backend est toujours disponible pendant les maintenances +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: backend-pdb + namespace: xpeditis-prod +spec: + minAvailable: 1 + selector: + matchLabels: + app: xpeditis-backend +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: frontend-pdb + namespace: xpeditis-prod +spec: + minAvailable: 1 + selector: + matchLabels: + app: xpeditis-frontend +``` + +--- + +## Secret Scaleway Container Registry + +Pour que Kubernetes puisse pull les images depuis le registry Scaleway : + +```bash +# REGISTRY_TOKEN = token Scaleway (Settings → API Keys → Container Registry) + +kubectl create secret docker-registry scaleway-registry \ + --namespace xpeditis-prod \ + --docker-server=rg.fr-par.scw.cloud \ + --docker-username=nologin \ + --docker-password= +``` + +--- + +## Déploiement complet + +```bash +# Appliquer tous les manifests dans l'ordre +kubectl apply -f k8s/00-namespaces.yaml +kubectl apply -f k8s/01-secrets.yaml # Après avoir rempli les valeurs +kubectl apply -f k8s/02-configmaps.yaml +kubectl apply -f k8s/03-backend-deployment.yaml +kubectl apply -f k8s/04-backend-service.yaml +kubectl apply -f k8s/05-frontend-deployment.yaml +kubectl apply -f k8s/06-frontend-service.yaml +kubectl apply -f k8s/07-ingress.yaml +kubectl apply -f k8s/08-hpa.yaml +kubectl apply -f k8s/09-pdb.yaml + +# Ou tout d'un coup +kubectl apply -f k8s/ + +# Suivre le déploiement +kubectl rollout status deployment/xpeditis-backend -n xpeditis-prod +kubectl rollout status 
deployment/xpeditis-frontend -n xpeditis-prod + +# Voir les pods +kubectl get pods -n xpeditis-prod -w + +# Voir les logs +kubectl logs -f deployment/xpeditis-backend -n xpeditis-prod +kubectl logs -f deployment/xpeditis-frontend -n xpeditis-prod + +# Vérifier le certificat TLS +kubectl get certificate -n xpeditis-prod +# NAME READY SECRET AGE +# xpeditis-tls-prod True xpeditis-tls-prod 2m +``` + +--- + +## Migration des jobs TypeORM + +Le déploiement inclut automatiquement les migrations via le `startup.js` dans le Dockerfile. Si vous avez besoin de lancer les migrations manuellement : + +```bash +# Job de migration one-shot +cat > /tmp/migration-job.yaml << 'EOF' +apiVersion: batch/v1 +kind: Job +metadata: + name: xpeditis-migrations + namespace: xpeditis-prod +spec: + template: + spec: + restartPolicy: OnFailure + containers: + - name: migrations + image: ghcr.io//xpeditis-backend:latest + command: ["node", "dist/migration-runner.js"] + envFrom: + - secretRef: + name: backend-secrets + imagePullSecrets: + - name: scaleway-registry +EOF + +kubectl apply -f /tmp/migration-job.yaml +kubectl wait --for=condition=complete job/xpeditis-migrations -n xpeditis-prod --timeout=300s +kubectl logs job/xpeditis-migrations -n xpeditis-prod +kubectl delete job xpeditis-migrations -n xpeditis-prod +``` diff --git a/docs/deployment/hetzner/13-backup-disaster-recovery.md b/docs/deployment/hetzner/13-backup-disaster-recovery.md new file mode 100644 index 0000000..3ac5599 --- /dev/null +++ b/docs/deployment/hetzner/13-backup-disaster-recovery.md @@ -0,0 +1,389 @@ +# 13 — Backups et reprise après sinistre + +--- + +## Stratégie de backup + +| Composant | Méthode | Fréquence | Rétention | Destination | +|---|---|---|---|---| +| PostgreSQL | `pg_dump` via CronJob | Quotidien 3h00 | 30 jours | Hetzner Object Storage | +| PostgreSQL WAL | Streaming (si self-hosted) | Continue | 7 jours | Object Storage | +| Redis | RDB snapshot + AOF | Chaque 5 min | 24h | Volume local | +| Secrets 
Kubernetes | Export manuel chiffré | Avant chaque changement | Illimité | Hors-cluster (coffre) | +| Fichiers S3 | Versioning objet | Permanent | Voir lifecycle | Object Storage | +| Configs K8s | GitOps dans le repo | À chaque commit | Git history | GitHub | + +**Objectifs :** +- **RPO (Recovery Point Objective) :** 24h max (vous pouvez perdre au plus 24h de données) +- **RTO (Recovery Time Objective) :** 4h max (vous pouvez reconstruire en moins de 4h) + +--- + +## Backup PostgreSQL — Option A (Neon.tech) + +Si vous utilisez Neon.tech, les backups sont **automatiques** : +- Point-in-time recovery (PITR) sur 7 jours (plan Free) ou 30 jours (plan Pro) +- Pas de CronJob à gérer + +Pour créer un backup manuel : +```bash +# Installer la CLI Neon +npm install -g neonctl +neonctl auth + +# Créer un point de restauration (branch) +neonctl branches create \ + --project-id \ + --name "backup-$(date +%Y%m%d)" \ + --parent main +``` + +--- + +## Backup PostgreSQL — Option B (self-hosted) + +### CronJob Kubernetes de backup + +```yaml +# k8s/backup-postgres-cronjob.yaml +--- +apiVersion: v1 +kind: Secret +metadata: + name: backup-credentials + namespace: xpeditis-prod +type: Opaque +stringData: + # Même credentials que le backend pour Object Storage + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" + AWS_S3_ENDPOINT: "https://fsn1.your-objectstorage.com" + AWS_S3_BUCKET: "xpeditis-prod" + # Credentials PostgreSQL + PGPASSWORD: "" +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: postgres-backup + namespace: xpeditis-prod +spec: + schedule: "0 3 * * *" # 3h00 chaque nuit + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 5 + jobTemplate: + spec: + template: + spec: + restartPolicy: OnFailure + containers: + - name: backup + image: postgres:15-alpine + command: + - /bin/sh + - -c + - | + set -e + echo "=== Démarrage backup PostgreSQL $(date) ===" + + # Variables + TIMESTAMP=$(date +%Y%m%d_%H%M%S) + 
BACKUP_FILE="/tmp/xpeditis_${TIMESTAMP}.sql.gz" + S3_KEY="backups/postgres/$(date +%Y/%m)/xpeditis_${TIMESTAMP}.sql.gz" + + # Dump PostgreSQL compressé + pg_dump \ + -h ${PGHOST} \ + -p ${PGPORT:-5432} \ + -U ${PGUSER} \ + -d ${PGDATABASE} \ + --no-password \ + --clean \ + --if-exists \ + --format=custom \ + | gzip > ${BACKUP_FILE} + + BACKUP_SIZE=$(du -sh ${BACKUP_FILE} | cut -f1) + echo "Dump créé: ${BACKUP_FILE} (${BACKUP_SIZE})" + + # Upload vers Hetzner Object Storage + apk add --no-cache aws-cli 2>/dev/null || pip install awscli 2>/dev/null + + AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \ + AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \ + aws s3 cp ${BACKUP_FILE} s3://${AWS_S3_BUCKET}/${S3_KEY} \ + --endpoint-url ${AWS_S3_ENDPOINT} + + echo "✅ Backup uploadé: s3://${AWS_S3_BUCKET}/${S3_KEY}" + + # Nettoyage local + rm ${BACKUP_FILE} + + # Vérifier les anciens backups (garder 30 jours) + echo "Backups existants:" + AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \ + AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \ + aws s3 ls s3://${AWS_S3_BUCKET}/backups/postgres/ \ + --endpoint-url ${AWS_S3_ENDPOINT} \ + --recursive | tail -10 + + echo "=== Backup terminé $(date) ===" + env: + - name: PGHOST + value: "10.0.1.100" # IP privée serveur PostgreSQL + - name: PGPORT + value: "5432" + - name: PGUSER + value: "xpeditis" + - name: PGDATABASE + value: "xpeditis_prod" + envFrom: + - secretRef: + name: backup-credentials + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi +``` + +```bash +# Appliquer +kubectl apply -f k8s/backup-postgres-cronjob.yaml + +# Tester manuellement (créer un Job depuis le CronJob) +kubectl create job --from=cronjob/postgres-backup test-backup -n xpeditis-prod +kubectl logs -l job-name=test-backup -n xpeditis-prod -f + +# Vérifier que le fichier est arrivé dans S3 +aws s3 ls s3://xpeditis-prod/backups/postgres/ \ + --profile hetzner \ + --endpoint-url https://fsn1.your-objectstorage.com \ + --recursive +``` + +--- + 

## Procédure de restauration PostgreSQL

### Restauration complète (catastrophe totale)

```bash
# Étape 1 : Lister les backups disponibles
aws s3 ls s3://xpeditis-prod/backups/postgres/ \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com \
  --recursive | sort -r | head -10

# Étape 2 : Télécharger le backup le plus récent
aws s3 cp \
  s3://xpeditis-prod/backups/postgres/2026/03/xpeditis_20260323_030001.sql.gz \
  /tmp/restore.sql.gz \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com

# Étape 3 : Décompresser et restaurer
# ⚠️ Cette commande EFFACE les données existantes
# 10.0.1.100 = IP privée du serveur PostgreSQL (voir le CronJob de backup)
gunzip -c /tmp/restore.sql.gz | pg_restore \
  -h 10.0.1.100 \
  -U xpeditis \
  -d xpeditis_prod \
  --clean \
  --if-exists \
  --no-privileges \
  --no-owner

# Étape 4 : Vérifier l'intégrité
psql -h 10.0.1.100 -U xpeditis -d xpeditis_prod \
  -c "SELECT COUNT(*) as bookings FROM bookings;"

psql -h 10.0.1.100 -U xpeditis -d xpeditis_prod \
  -c "SELECT COUNT(*) as users FROM users;"

# Étape 5 : Redémarrer les pods pour reconnecter
kubectl rollout restart deployment/xpeditis-backend -n xpeditis-prod
```

---

## Backup des Secrets Kubernetes

Les secrets ne sont pas dans Git (intentionnel). Sauvegardez-les chiffrés.

```bash
#!/bin/bash
# scripts/backup-secrets.sh

set -e
BACKUP_DIR="$HOME/.xpeditis-secrets-backup"
mkdir -p "$BACKUP_DIR"
DATE=$(date +%Y%m%d_%H%M%S)

# Exporter les secrets (encodés base64)
kubectl get secret backend-secrets -n xpeditis-prod -o yaml > /tmp/backend-secrets-${DATE}.yaml
kubectl get secret frontend-secrets -n xpeditis-prod -o yaml > /tmp/frontend-secrets-${DATE}.yaml
kubectl get secret ghcr-credentials -n xpeditis-prod -o yaml > /tmp/ghcr-creds-${DATE}.yaml

# Chiffrer avec GPG (ou utiliser un password)
# NB : tar supprime le "/" initial — les membres de l'archive sont "tmp/...".
tar czf - /tmp/*-${DATE}.yaml | gpg --symmetric --cipher-algo AES256 \
  > "${BACKUP_DIR}/k8s-secrets-${DATE}.tar.gz.gpg"

# Nettoyage des fichiers temporaires
rm /tmp/*-${DATE}.yaml

echo "✅ Secrets sauvegardés dans ${BACKUP_DIR}/k8s-secrets-${DATE}.tar.gz.gpg"

# Lister les backups existants
ls -la "$BACKUP_DIR"/
```

```bash
# Restaurer les secrets depuis un backup
# -C / : les membres de l'archive sont "tmp/..." (tar a supprimé le "/"),
# on extrait donc depuis la racine pour retrouver les fichiers dans /tmp
gpg --decrypt "${BACKUP_DIR}/k8s-secrets-20260323_120000.tar.gz.gpg" | tar xzf - -C /
kubectl apply -f /tmp/backend-secrets-20260323_120000.yaml
```

---

## Runbook — Reprise après sinistre complète

Procédure si vous perdez tout le cluster (serveurs détruits) :

### Étape 1 : Recréer l'infrastructure (30 min)

```bash
# 1. Recréer le réseau
hcloud network create --name xpeditis-network --ip-range 10.0.0.0/16
hcloud network add-subnet xpeditis-network --type cloud --network-zone eu-central --ip-range 10.0.1.0/24

# 2. Recréer le firewall
# (répéter les commandes du doc 03)

# 3. Recréer le cluster k3s
hetzner-k3s create --config ~/.xpeditis/cluster.yaml

# 4. 
Configurer kubectl
export KUBECONFIG=~/.kube/kubeconfig-xpeditis-prod
```

### Étape 2 : Restaurer les secrets (15 min)

```bash
# Créer le namespace
kubectl apply -f k8s/00-namespaces.yaml

# Restaurer les secrets depuis le backup chiffré
# -C / : les membres de l'archive sont "tmp/...", on extrait depuis la racine
gpg --decrypt "$HOME/.xpeditis-secrets-backup/k8s-secrets-XXXXXXXX.tar.gz.gpg" | tar xzf - -C /
kubectl apply -f /tmp/backend-secrets-*.yaml
kubectl apply -f /tmp/frontend-secrets-*.yaml
# Fichier produit par scripts/backup-secrets.sh (secret ghcr-credentials)
kubectl apply -f /tmp/ghcr-creds-*.yaml

# Recréer le secret Scaleway
kubectl create secret docker-registry scaleway-registry \
  --namespace xpeditis-prod \
  --docker-server=rg.fr-par.scw.cloud \
  --docker-username=nologin \
  --docker-password=<REGISTRY_TOKEN>
```

### Étape 3 : Restaurer les services (15 min)

```bash
# Installer cert-manager
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager --create-namespace \
  --version v1.15.3 --set installCRDs=true
kubectl apply -f /tmp/cluster-issuers.yaml

# Déployer l'application
kubectl apply -f k8s/

# Attendre
kubectl rollout status deployment/xpeditis-backend -n xpeditis-prod --timeout=300s
```

### Étape 4 : Restaurer la base de données (30 min)

```bash
# Si PostgreSQL self-hosted :
# (Recréer le serveur PostgreSQL si nécessaire, doc 07)
# Puis restaurer depuis le backup S3

# Télécharger le backup le plus récent
LATEST=$(aws s3 ls s3://xpeditis-prod/backups/postgres/ \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com \
  --recursive | sort -r | head -1 | awk '{print $4}')

aws s3 cp s3://xpeditis-prod/$LATEST /tmp/restore.sql.gz \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com

# Restaurer (10.0.1.100 = IP privée du serveur PostgreSQL)
gunzip -c /tmp/restore.sql.gz | pg_restore \
  -h 10.0.1.100 -U xpeditis -d xpeditis_prod \
  --clean --if-exists --no-privileges --no-owner
```

### Étape 5 : Vérification finale (15 min)

```bash
# Health checks
curl https://api.xpeditis.com/api/v1/health
curl https://app.xpeditis.com/

# Test 
login
curl -X POST https://api.xpeditis.com/api/v1/auth/login \
  -H "Content-Type: application/json" \
  -d '{"email":"admin@test.com","password":"test"}' | jq .

# Vérifier les données
kubectl exec -it deployment/xpeditis-backend -n xpeditis-prod -- \
  node -e "console.log('Database OK')"

echo "✅ Système opérationnel. RTO: $(date)"
```

---

## Test régulier des backups (mensuel)

```bash
#!/bin/bash
# scripts/test-backup-restore.sh
# À exécuter en environnement de test, JAMAIS en production
# Remplacer <TEST_DB_HOST> par l'hôte PostgreSQL de l'environnement de test

echo "🧪 Test de restauration du backup PostgreSQL"

# 1. Créer une DB de test
psql -h <TEST_DB_HOST> -U postgres -c "CREATE DATABASE xpeditis_restore_test;"

# 2. Télécharger le dernier backup
LATEST=$(aws s3 ls s3://xpeditis-prod/backups/postgres/ \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com \
  --recursive | sort -r | head -1 | awk '{print $4}')

aws s3 cp s3://xpeditis-prod/$LATEST /tmp/test-restore.sql.gz \
  --profile hetzner \
  --endpoint-url https://fsn1.your-objectstorage.com

# 3. Restaurer dans la DB de test
gunzip -c /tmp/test-restore.sql.gz | pg_restore \
  -h <TEST_DB_HOST> -U postgres -d xpeditis_restore_test

# 4. Vérifier
BOOKING_COUNT=$(psql -h <TEST_DB_HOST> -U postgres -d xpeditis_restore_test \
  -t -c "SELECT COUNT(*) FROM bookings;" | xargs)

echo "✅ Restauration réussie. Nombre de bookings: $BOOKING_COUNT"

# 5. Nettoyage
psql -h <TEST_DB_HOST> -U postgres -c "DROP DATABASE xpeditis_restore_test;"
rm /tmp/test-restore.sql.gz

echo "✅ Test de backup/restore réussi le $(date)"
```