veylant/deploy/scripts/blue-green.sh
2026-02-23 13:35:04 +01:00

163 lines
6.9 KiB
Bash

#!/usr/bin/env bash
# deploy/scripts/blue-green.sh
#
# Atomic blue/green deployment for Veylant IA proxy.
# Rollback time: < 5s (single kubectl patch on the Istio VirtualService).
#
# Strategy:
# 1. Detect which slot is currently active (blue|green) from the VirtualService.
# 2. Deploy the new image tag to the INACTIVE slot via helm upgrade.
# 3. Wait for the inactive slot's rollout to complete.
# 4. Smoke-test the inactive slot via a temp port-forward.
# 5. Switch 100% traffic to the new slot (patch VirtualService).
# 6. Verify health post-switch; roll back if verification fails.
# 7. Scale down the old slot to 0 replicas to free resources.
#
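# Required env vars:
# IMAGE_TAG — Docker image tag to deploy (e.g. sha-abc123); the script aborts if unset
# NAMESPACE — Kubernetes namespace (optional; default: veylant)
# KUBECONFIG — path to kubeconfig (optional; uses default if not set)
#
# Optional env vars:
# ROLLOUT_TIMEOUT — timeout for helm --wait and kubectl rollout status (default: 5m)
# SMOKE_RETRIES — health check retries after switch (default: 5)
# VEYLANT_URL — base URL for the post-switch health check (default: http://localhost:8090)
# DRY_RUN — set to "true" to print the helm/kubectl change commands instead of
#           executing them (the active-slot lookup still runs; health checks are skipped)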
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
# IMAGE_TAG is mandatory; every other setting falls back to its default when
# unset or empty.
: "${IMAGE_TAG:?IMAGE_TAG is required}"
: "${NAMESPACE:=veylant}"
: "${ROLLOUT_TIMEOUT:=5m}"
: "${SMOKE_RETRIES:=5}"
: "${DRY_RUN:=false}"

# Chart location, relative to the directory the script is invoked from.
CHART_PATH="deploy/helm/veylant-proxy"
# ── Helpers ───────────────────────────────────────────────────────────────────
# Print an informational message on stdout, prefixed with the script tag.
# Arguments: $* - message words (joined with spaces)
log() {
  printf '[blue-green] %s\n' "$*"
}
# Print an error message on stderr and terminate the script with status 1.
# Arguments: $* - message words (joined with spaces)
die() {
  printf '[blue-green] ERROR: %s\n' "$*" >&2
  exit 1
}
# Execute a command, or only print it when DRY_RUN is "true".
# Globals:   DRY_RUN (read)
# Arguments: $@ - command and its arguments
# Returns:   the command's exit status (0 in dry-run mode)
run() {
  if [[ "$DRY_RUN" != "true" ]]; then
    "$@"
    return
  fi
  echo "[dry-run] $*"
}
# ── Step 1: Detect active slot ────────────────────────────────────────────────
# Read the subset that the first route of the VirtualService points at.
# If the kubectl lookup itself fails, "blue" is assumed to be active so that a
# first deployment lands on green.
# NOTE(review): that fallback fires on ANY kubectl failure (auth, network),
# not only on a missing VirtualService — confirm this is acceptable.
log "Detecting active slot from VirtualService..."
ACTIVE_SLOT=$(kubectl get virtualservice veylant-proxy -n "$NAMESPACE" \
  -o jsonpath='{.spec.http[0].route[0].destination.subset}' 2>/dev/null || echo "blue")

# Refuse to guess: an empty or unknown subset would otherwise be carried into
# the rollback patch and the scale-down target further down.
case "$ACTIVE_SLOT" in
  blue)
    INACTIVE_SLOT="green"
    ;;
  green)
    INACTIVE_SLOT="blue"
    ;;
  *)
    die "Cannot determine active slot: VirtualService veylant-proxy reports subset '${ACTIVE_SLOT}' (expected blue|green)."
    ;;
esac
log "Active slot: ${ACTIVE_SLOT} → deploying to INACTIVE slot: ${INACTIVE_SLOT}"

# Per-slot Helm release name and values file.
HELM_RELEASE="veylant-proxy-${INACTIVE_SLOT}"
VALUES_FILE="${CHART_PATH}/values-${INACTIVE_SLOT}.yaml"
# ── Step 2: Deploy to inactive slot ──────────────────────────────────────────
# Install or upgrade the inactive slot's release with the new image tag.
# helm blocks (--wait) until the release is ready or ROLLOUT_TIMEOUT expires.
log "Deploying image tag '${IMAGE_TAG}' to slot '${INACTIVE_SLOT}' (release: ${HELM_RELEASE})..."
HELM_ARGS=(
  upgrade --install "$HELM_RELEASE" "$CHART_PATH"
  -f "$VALUES_FILE"
  --namespace "$NAMESPACE"
  --create-namespace
  --set image.tag="$IMAGE_TAG"
  --set slot="$INACTIVE_SLOT"
  --wait
  --timeout "$ROLLOUT_TIMEOUT"
)
run helm "${HELM_ARGS[@]}"
log "Helm deploy complete for slot '${INACTIVE_SLOT}'."
# ── Step 3: Wait for rollout ──────────────────────────────────────────────────
# Block until the deployment named after the Helm release reports a finished
# rollout, or fail once ROLLOUT_TIMEOUT elapses.
log "Waiting for deployment rollout (timeout: ${ROLLOUT_TIMEOUT})..."
ROLLOUT_TARGET="deployment/${HELM_RELEASE}"
run kubectl rollout status "$ROLLOUT_TARGET" -n "$NAMESPACE" --timeout "$ROLLOUT_TIMEOUT"
log "Rollout complete."
# ── Step 4: Smoke test on inactive slot ──────────────────────────────────────
# Probe /healthz on the freshly deployed slot through a temporary port-forward
# (local PF_PORT → container port 8090) before any traffic is switched.
# The whole probe is skipped in dry-run mode.
log "Smoke-testing inactive slot via port-forward..."
PF_PORT=19090
SMOKE_ATTEMPTS=5

if [[ "$DRY_RUN" != "true" ]]; then
  # Private, unpredictable log file for the port-forward output.
  PF_LOG=$(mktemp "${TMPDIR:-/tmp}/veylant-pf.XXXXXX")
  PF_PID=""

  # Stop the background port-forward (if still running) and drop its log file.
  stop_port_forward() {
    if [[ -n "$PF_PID" ]]; then
      kill "$PF_PID" 2>/dev/null || true
      wait "$PF_PID" 2>/dev/null || true
      PF_PID=""
    fi
    rm -f -- "$PF_LOG"
  }
  # Guarantee the port-forward never outlives the script, even on an
  # unexpected exit in the middle of the probe loop.
  trap stop_port_forward EXIT

  # Start port-forward in background; capture PID for cleanup.
  kubectl port-forward \
    "deployment/${HELM_RELEASE}" \
    "${PF_PORT}:8090" \
    -n "$NAMESPACE" &>"$PF_LOG" &
  PF_PID=$!
  # Give it 3s to establish.
  sleep 3

  SMOKE_OK=false
  for ((i = 1; i <= SMOKE_ATTEMPTS; i++)); do
    # No -f here: with -f, curl prints the status via -w AND exits non-zero,
    # so a "|| echo 000" fallback would be appended to it (e.g. "503000").
    # --max-time keeps a stuck connection from hanging the deployment.
    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
      "http://localhost:${PF_PORT}/healthz" 2>/dev/null) || HTTP_STATUS="000"
    if [[ "$HTTP_STATUS" == "200" ]]; then
      SMOKE_OK=true
      break
    fi
    log " Smoke attempt ${i}/5: HTTP ${HTTP_STATUS} — retrying..."
    sleep 2
  done

  # On failure, surface whatever kubectl port-forward reported before the
  # log file is removed.
  if [[ "$SMOKE_OK" != "true" && -s "$PF_LOG" ]]; then
    log " port-forward output:"
    cat -- "$PF_LOG" >&2
  fi

  stop_port_forward
  trap - EXIT

  if [[ "$SMOKE_OK" != "true" ]]; then
    die "Smoke test failed on inactive slot '${INACTIVE_SLOT}'. Deployment ABORTED — active slot unchanged."
  fi
fi
log "Smoke test passed."
# ── Step 5: Switch traffic to new slot ───────────────────────────────────────
# Point the VirtualService's route at the newly deployed subset with weight 100.
# NOTE(review): a JSON merge patch replaces the whole spec.http list, so any
# additional routes/match rules on the VirtualService would presumably be
# dropped by this patch — confirm the VirtualService only carries this route.
# (The log line uses a single '%': log is echo-based, so "%%" printed literally.)
log "Switching 100% traffic from '${ACTIVE_SLOT}' → '${INACTIVE_SLOT}'..."
printf -v SWITCH_PATCH \
  '{"spec":{"http":[{"route":[{"destination":{"host":"veylant-proxy","subset":"%s"},"weight":100}]}]}}' \
  "$INACTIVE_SLOT"
run kubectl patch virtualservice veylant-proxy -n "$NAMESPACE" --type merge \
  -p "$SWITCH_PATCH"
log "Traffic switched."
# ── Step 6: Verify post-switch ────────────────────────────────────────────────
# Poll ${VEYLANT_URL}/healthz up to SMOKE_RETRIES times. If no attempt returns
# HTTP 200, patch the VirtualService back to the previous slot and abort.
# Dry-run mode skips the probes and treats verification as passed.
# NOTE(review): the default URL is http://localhost:8090 and nothing in this
# script forwards that port at this point — presumably callers set VEYLANT_URL
# to the real endpoint; confirm.
log "Verifying health post-switch (${SMOKE_RETRIES} attempts)..."
VEYLANT_URL="${VEYLANT_URL:-http://localhost:8090}"
POST_SWITCH_OK=false
if [[ "$DRY_RUN" == "true" ]]; then
  POST_SWITCH_OK=true
else
  for i in $(seq 1 "$SMOKE_RETRIES"); do
    # No -f here: with -f, curl prints the status via -w AND exits non-zero,
    # so a "|| echo 000" fallback would be appended to it (e.g. "503000").
    # --max-time keeps a stuck connection from delaying the rollback decision.
    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 \
      "${VEYLANT_URL}/healthz" 2>/dev/null) || HTTP_STATUS="000"
    if [[ "$HTTP_STATUS" == "200" ]]; then
      POST_SWITCH_OK=true
      break
    fi
    log " Post-switch check ${i}/${SMOKE_RETRIES}: HTTP ${HTTP_STATUS} — retrying..."
    sleep 2
  done
fi

if [[ "$POST_SWITCH_OK" != "true" ]]; then
  log "Post-switch verification FAILED. Rolling back to '${ACTIVE_SLOT}'..."
  # Report a failed rollback explicitly; under `set -e` it would otherwise
  # terminate the script without saying which slot is still receiving traffic.
  kubectl patch virtualservice veylant-proxy -n "$NAMESPACE" --type merge \
    -p "{\"spec\":{\"http\":[{\"route\":[{\"destination\":{\"host\":\"veylant-proxy\",\"subset\":\"${ACTIVE_SLOT}\"},\"weight\":100}]}]}}" \
    || die "ROLLBACK FAILED: VirtualService still routes to '${INACTIVE_SLOT}'. Manual intervention required."
  die "Rollback complete. Active slot reverted to '${ACTIVE_SLOT}'."
fi
log "Post-switch verification passed."
# ── Step 7: Scale down old slot ───────────────────────────────────────────────
# Best-effort: free the previous slot's resources. A failed scale-down (for
# example, no deployment for the old slot yet) is logged and does not fail the
# deployment; kubectl's stderr is intentionally suppressed.
log "Scaling down old slot '${ACTIVE_SLOT}' to 0 replicas..."
OLD_RELEASE="veylant-proxy-${ACTIVE_SLOT}"
if run kubectl scale deployment "$OLD_RELEASE" --replicas=0 -n "$NAMESPACE" 2>/dev/null; then
  OLD_SLOT_STATE="scaled to 0"
else
  log " (scale-down skipped — release ${OLD_RELEASE} not found)"
  # Keep the summary truthful when nothing was actually scaled down.
  OLD_SLOT_STATE="scale-down skipped"
fi

# Final summary, including the command to revert to the previous slot.
log ""
log "✓ Blue/green deployment complete."
log " Previous slot : ${ACTIVE_SLOT} (${OLD_SLOT_STATE})"
log " Active slot : ${INACTIVE_SLOT} (image: ${IMAGE_TAG})"
log " Rollback : make deploy-rollback ACTIVE_SLOT=${ACTIVE_SLOT} NAMESPACE=${NAMESPACE}"