#!/usr/bin/env bash
# deploy/scripts/blue-green.sh
#
# Atomic blue/green deployment for Veylant IA proxy.
# Rollback time: < 5s (single kubectl patch on the Istio VirtualService).
#
# Strategy:
#   1. Detect which slot is currently active (blue|green) from the VirtualService.
#   2. Deploy the new image tag to the INACTIVE slot via helm upgrade.
#   3. Wait for the inactive slot's rollout to complete.
#   4. Smoke-test the inactive slot via a temp port-forward.
#   5. Switch 100% traffic to the new slot (patch VirtualService).
#   6. Verify health post-switch; roll back if verification fails.
#   7. Scale down the old slot to 0 replicas to free resources.
#
# Required env vars:
#   IMAGE_TAG    — Docker image tag to deploy (e.g. sha-abc123)
#
# Optional env vars:
#   NAMESPACE       — Kubernetes namespace (default: veylant)
#   KUBECONFIG      — path to kubeconfig (uses default if not set)
#   ROLLOUT_TIMEOUT — timeout for helm --wait and kubectl rollout status (default: 5m)
#   SMOKE_RETRIES   — number of health check attempts after switch (default: 5)
#   VEYLANT_URL     — base URL probed at /healthz after switch (default: http://localhost:8090)
#   DRY_RUN         — set to "true" to print commands without executing

# Strict mode: abort on an unhandled command failure (-e), treat expansion of
# an unset variable as an error (-u), and make a pipeline fail when any of its
# stages fails (pipefail).
set -euo pipefail

# ── Config ────────────────────────────────────────────────────────────────────
# IMAGE_TAG is mandatory: the ':?' expansion aborts the script with the given
# message when the variable is unset or empty. Every other setting falls back
# to a default when unset or empty.
IMAGE_TAG="${IMAGE_TAG:?IMAGE_TAG is required}"
NAMESPACE="${NAMESPACE:-veylant}"
ROLLOUT_TIMEOUT="${ROLLOUT_TIMEOUT:-5m}"
SMOKE_RETRIES="${SMOKE_RETRIES:-5}"
DRY_RUN="${DRY_RUN:-false}"
# Relative path: it is resolved against the current working directory.
CHART_PATH="deploy/helm/veylant-proxy"

# Fail fast on a bad SMOKE_RETRIES. With 0 or a non-numeric value the
# post-switch verification loop would run zero times and a healthy deployment
# would be rolled back automatically. Checked here, before anything is
# deployed.
if ! [[ "$SMOKE_RETRIES" =~ ^[1-9][0-9]*$ ]]; then
  echo "[blue-green] ERROR: SMOKE_RETRIES must be a positive integer (got '${SMOKE_RETRIES}')" >&2
  exit 1
fi

# ── Helpers ───────────────────────────────────────────────────────────────────
# Print all arguments, joined by single spaces, as one line on stdout
# prefixed with the script tag.
log() {
  printf '%s\n' "[blue-green] $*"
}
# Print an error line on stderr (arguments joined by single spaces) and
# terminate the script with exit status 1.
die() {
  printf '%s\n' "[blue-green] ERROR: $*" >&2
  exit 1
}

#######################################
# Execute a command, or only print it when dry-run mode is active.
# Globals:   DRY_RUN (read) - "true" enables dry-run mode
# Arguments: the command and its arguments, one word each
# Outputs:   in dry-run mode, one "[dry-run] ..." line on stdout; otherwise
#            whatever the command itself writes
# Returns:   0 in dry-run mode; otherwise the command's exit status
#######################################
run() {
  if [[ "$DRY_RUN" == "true" ]]; then
    # Render each word with %q so that arguments containing spaces, quotes or
    # braces (e.g. the JSON patch payloads) are shown shell-quoted and the
    # printed line can be pasted back into a shell unchanged.
    local rendered=""
    if (( $# > 0 )); then
      printf -v rendered '%q ' "$@"
    fi
    echo "[dry-run] ${rendered% }"
  else
    "$@"
  fi
}

# ── Step 1: Detect active slot ────────────────────────────────────────────────
log "Detecting active slot from VirtualService..."
# Read the subset of the first destination of the first http route.
# --ignore-not-found makes kubectl exit 0 with empty output when the
# VirtualService does not exist, so a non-zero status here means a real query
# failure (auth, network, missing CRD, ...). Guessing the slot in that case
# could deploy the new image onto the slot that is serving live traffic, so
# the script aborts instead. kubectl's stderr is left visible on purpose.
if ! ACTIVE_SLOT=$(kubectl get virtualservice veylant-proxy -n "$NAMESPACE" \
    --ignore-not-found \
    -o jsonpath='{.spec.http[0].route[0].destination.subset}'); then
  if [[ "$DRY_RUN" == "true" ]]; then
    # Dry-run changes nothing in the cluster, so a guess is harmless here.
    log "  WARNING: could not query the VirtualService; assuming 'blue' for dry-run."
    ACTIVE_SLOT="blue"
  else
    die "Unable to query VirtualService 'veylant-proxy' in namespace '${NAMESPACE}' — refusing to guess the active slot."
  fi
fi

case "$ACTIVE_SLOT" in
  blue)
    INACTIVE_SLOT="green"
    ;;
  green)
    INACTIVE_SLOT="blue"
    ;;
  "")
    # No VirtualService, or no subset on its first route: treat this as a
    # first deployment, with blue as the nominal active slot.
    log "  No active subset found — assuming 'blue'."
    ACTIVE_SLOT="blue"
    INACTIVE_SLOT="green"
    ;;
  *)
    # ACTIVE_SLOT is reused later as the rollback target and to name the
    # deployment that gets scaled down, so an unknown value is not safe.
    die "Unexpected active subset '${ACTIVE_SLOT}' in VirtualService (expected 'blue' or 'green')."
    ;;
esac

log "Active slot: ${ACTIVE_SLOT} → deploying to INACTIVE slot: ${INACTIVE_SLOT}"

# Per-slot Helm release name and values file.
HELM_RELEASE="veylant-proxy-${INACTIVE_SLOT}"
VALUES_FILE="${CHART_PATH}/values-${INACTIVE_SLOT}.yaml"

# ── Step 2: Deploy to inactive slot ──────────────────────────────────────────
log "Deploying image tag '${IMAGE_TAG}' to slot '${INACTIVE_SLOT}' (release: ${HELM_RELEASE})..."

# Arguments for `helm`, collected in an array so each option sits on its own
# line without backslash continuations. Installs the release if it does not
# exist yet, overrides the image tag and slot values, and waits (bounded by
# ROLLOUT_TIMEOUT) before returning.
helm_upgrade_args=(
  upgrade --install "$HELM_RELEASE" "$CHART_PATH"
  -f "$VALUES_FILE"
  --namespace "$NAMESPACE"
  --create-namespace
  --set "image.tag=${IMAGE_TAG}"
  --set "slot=${INACTIVE_SLOT}"
  --wait
  --timeout "$ROLLOUT_TIMEOUT"
)
run helm "${helm_upgrade_args[@]}"

log "Helm deploy complete for slot '${INACTIVE_SLOT}'."

# ── Step 3: Wait for rollout ──────────────────────────────────────────────────
log "Waiting for deployment rollout (timeout: ${ROLLOUT_TIMEOUT})..."
# The Deployment is addressed by the Helm release name.
# NOTE(review): this presumes the chart names its Deployment after the
# release — confirm against the chart templates.
rollout_target="deployment/${HELM_RELEASE}"
run kubectl rollout status "$rollout_target" -n "$NAMESPACE" --timeout "$ROLLOUT_TIMEOUT"

log "Rollout complete."

# ── Step 4: Smoke test on inactive slot ──────────────────────────────────────
log "Smoke-testing inactive slot via port-forward..."
# Local port used for the temporary port-forward to container port 8090.
PF_PORT=19090
if [[ "$DRY_RUN" != "true" ]]; then
  # Unpredictable log file name (mktemp) instead of a fixed path in /tmp.
  PF_LOG=$(mktemp "${TMPDIR:-/tmp}/veylant-pf.XXXXXX") \
    || die "Unable to create a temporary log file for the port-forward."

  # Start port-forward in background; capture PID for cleanup.
  kubectl port-forward \
    "deployment/${HELM_RELEASE}" \
    "${PF_PORT}:8090" \
    -n "$NAMESPACE" &>"$PF_LOG" &
  PF_PID=$!

  # Stop the background port-forward and remove its log. Registered as an
  # EXIT trap so the port-forward is not leaked when the script dies or is
  # interrupted during the smoke test.
  _stop_port_forward() {
    kill "$PF_PID" 2>/dev/null || true
    wait "$PF_PID" 2>/dev/null || true
    rm -f -- "$PF_LOG"
  }
  trap _stop_port_forward EXIT

  # Give it 3s to establish.
  sleep 3

  SMOKE_OK=false
  for i in {1..5}; do
    # No -f here: with -f, curl still prints the status via -w and then exits
    # non-zero, so an "|| echo 000" fallback gets appended to the real status
    # (e.g. "503000"). Without -f, curl exits 0 for any HTTP response and
    # fails only on transport errors, where the status defaults to 000.
    # --max-time keeps a hung connection from blocking the deployment.
    HTTP_STATUS=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 \
      "http://localhost:${PF_PORT}/healthz" 2>/dev/null) || true
    HTTP_STATUS="${HTTP_STATUS:-000}"
    if [[ "$HTTP_STATUS" == "200" ]]; then
      SMOKE_OK=true
      break
    fi
    log "  Smoke attempt ${i}/5: HTTP ${HTTP_STATUS} — retrying..."
    sleep 2
  done

  if [[ "$SMOKE_OK" != "true" ]]; then
    # Show what the port-forward printed; it usually explains a failure to
    # connect (port in use, pod not found, ...).
    if [[ -s "$PF_LOG" ]]; then
      log "  port-forward output:"
      cat -- "$PF_LOG" >&2 || true
    fi
    # die exits; the EXIT trap stops the port-forward.
    die "Smoke test failed on inactive slot '${INACTIVE_SLOT}'. Deployment ABORTED — active slot unchanged."
  fi

  _stop_port_forward
  trap - EXIT
  log "Smoke test passed."
else
  # Nothing was tested, so do not claim success.
  log "Smoke test skipped (dry-run)."
fi

# ── Step 5: Switch traffic to new slot ───────────────────────────────────────
# log uses echo, not printf, so a literal percent sign needs no escaping
# ("100%%" was printed verbatim).
log "Switching 100% traffic from '${ACTIVE_SLOT}' → '${INACTIVE_SLOT}'..."
# Point the VirtualService at the new slot with a single route of weight 100.
# NOTE(review): '--type merge' is a JSON merge patch, which replaces lists
# wholesale. Any additional http entries or per-route settings (match,
# timeout, retries) on the existing VirtualService would be dropped — confirm
# the VirtualService carries nothing beyond this single route.
run kubectl patch virtualservice veylant-proxy -n "$NAMESPACE" --type merge \
  -p "{\"spec\":{\"http\":[{\"route\":[{\"destination\":{\"host\":\"veylant-proxy\",\"subset\":\"${INACTIVE_SLOT}\"},\"weight\":100}]}]}}"

log "Traffic switched."

# ── Step 6: Verify post-switch ────────────────────────────────────────────────
log "Verifying health post-switch (${SMOKE_RETRIES} attempts)..."
# Base URL probed at /healthz. It must reach the service through the routed
# path for this check to be meaningful.
# NOTE(review): the default points at localhost:8090 — confirm that something
# is listening there in the environments where this script runs.
VEYLANT_URL="${VEYLANT_URL:-http://localhost:8090}"
POST_SWITCH_OK=false
if [[ "$DRY_RUN" != "true" ]]; then
  # seq (not an arithmetic for-loop) is kept on purpose: under 'set -u' a
  # non-numeric SMOKE_RETRIES in an arithmetic context could abort the script
  # here, after the traffic switch and before the rollback below.
  for i in $(seq 1 "$SMOKE_RETRIES"); do
    # No -f here: with -f, curl still prints the status via -w and then exits
    # non-zero, so an "|| echo 000" fallback gets appended to the real status
    # (e.g. "503000"). A transport failure leaves curl's own 000, or an empty
    # string that is defaulted to 000. --max-time bounds each probe.
    HTTP_STATUS=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 \
      "${VEYLANT_URL}/healthz" 2>/dev/null) || true
    HTTP_STATUS="${HTTP_STATUS:-000}"
    if [[ "$HTTP_STATUS" == "200" ]]; then
      POST_SWITCH_OK=true
      break
    fi
    log "  Post-switch check ${i}/${SMOKE_RETRIES}: HTTP ${HTTP_STATUS} — retrying..."
    sleep 2
  done
else
  # Dry-run: nothing was switched, so there is nothing to verify.
  POST_SWITCH_OK=true
fi

if [[ "$POST_SWITCH_OK" != "true" ]]; then
  log "Post-switch verification FAILED. Rolling back to '${ACTIVE_SLOT}'..."
  # The rollback bypasses run() deliberately: this branch is never reached in
  # dry-run. A failed rollback patch is reported explicitly rather than
  # letting 'set -e' end the script with only kubectl's own error.
  kubectl patch virtualservice veylant-proxy -n "$NAMESPACE" --type merge \
    -p "{\"spec\":{\"http\":[{\"route\":[{\"destination\":{\"host\":\"veylant-proxy\",\"subset\":\"${ACTIVE_SLOT}\"},\"weight\":100}]}]}}" \
    || die "ROLLBACK FAILED — VirtualService may still route to '${INACTIVE_SLOT}'. Manual intervention required."
  die "Rollback complete. Active slot reverted to '${ACTIVE_SLOT}'."
fi

log "Post-switch verification passed."

# ── Step 7: Scale down old slot ───────────────────────────────────────────────
log "Scaling down old slot '${ACTIVE_SLOT}' to 0 replicas..."
OLD_RELEASE="veylant-proxy-${ACTIVE_SLOT}"
# Best effort: a failure here does not abort the deployment (e.g. on a first
# deployment the old slot does not exist). kubectl's stderr is left visible
# so the real cause is shown, and the outcome is recorded so the summary
# below does not claim a scale-down that never happened.
if run kubectl scale deployment "$OLD_RELEASE" --replicas=0 -n "$NAMESPACE"; then
  OLD_SLOT_STATE="scaled to 0"
else
  log "  (scale-down skipped — deployment ${OLD_RELEASE} not found or scale failed)"
  OLD_SLOT_STATE="scale-down skipped"
fi

log ""
log "✓ Blue/green deployment complete."
log "  Previous slot : ${ACTIVE_SLOT} (${OLD_SLOT_STATE})"
log "  Active slot   : ${INACTIVE_SLOT} (image: ${IMAGE_TAG})"
# NOTE(review): once the previous slot is at 0 replicas, routing traffic back
# to it only helps if the rollback target also scales it up — confirm that
# 'make deploy-rollback' does so.
log "  Rollback      : make deploy-rollback ACTIVE_SLOT=${ACTIVE_SLOT} NAMESPACE=${NAMESPACE}"