// k6 load test for Veylant IA Proxy (E10-10).
//
// Targets:
//   - p99 latency < 300ms
//   - error rate < 1%
//   - 1 000 VU sustained for 8 minutes
//
// Run (requires a running proxy + mock Ollama):
//   k6 run test/k6/load_test.js
//
// Environment variables:
//   BASE_URL   — proxy base URL (default: http://localhost:8090)
//   AUTH_TOKEN — Bearer token (default: dev-token)
//   MODEL      — LLM model name (default: llama3.2, routed to local Ollama)
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';

// ── Custom metrics ────────────────────────────────────────────────────────────
// Application-level error rate (failed checks), kept separate from k6's
// built-in http_req_failed, which only counts transport/HTTP-status failures.
const errorRate = new Rate('custom_error_rate');
// Per-endpoint latency trends; `true` marks the values as time durations (ms).
const chatLatency = new Trend('chat_latency_ms', true);
const healthLatency = new Trend('health_latency_ms', true);
// ── Test configuration ────────────────────────────────────────────────────────
export const options = {
  // Ramp up to 100 VUs, hold 1 000 VUs for 8 minutes, then ramp back down.
  stages: [
    { duration: '1m', target: 100 }, // ramp-up
    { duration: '8m', target: 1000 }, // sustained load
    { duration: '1m', target: 0 }, // ramp-down
  ],
  // SLA targets — k6 exits non-zero if any threshold is breached.
  thresholds: {
    http_req_duration: ['p(99)<300'], // p99 < 300ms
    http_req_failed: ['rate<0.01'], // < 1% HTTP errors
    custom_error_rate: ['rate<0.01'], // < 1% application errors
    chat_latency_ms: ['p(99)<300'],
  },
};
// ── Helpers ───────────────────────────────────────────────────────────────────
// `||` (not `??`) is deliberate: an env var set to '' also falls back.
const BASE_URL = __ENV.BASE_URL || 'http://localhost:8090';
const AUTH_TOKEN = __ENV.AUTH_TOKEN || 'dev-token';
const MODEL = __ENV.MODEL || 'llama3.2';

// Shared request parameters for chat-completion calls.
const chatParams = {
  headers: {
    'Content-Type': 'application/json',
    'Authorization': `Bearer ${AUTH_TOKEN}`,
  },
  timeout: '5s',
};

// Fixed, non-streaming prompt so every iteration exercises the same code path.
const chatBody = JSON.stringify({
  model: MODEL,
  messages: [{ role: 'user', content: 'Dis-moi bonjour en une phrase.' }],
  stream: false,
});
// ── Default scenario ──────────────────────────────────────────────────────────
export default function () {
  // 90% chat completions, 10% health checks (mirrors production traffic mix).
  if (Math.random() >= 0.9) {
    // Health-probe path (10% of iterations).
    const res = http.get(`${BASE_URL}/healthz`, { timeout: '2s' });
    check(res, { 'health: status 200': (r) => r.status === 200 });
    healthLatency.add(res.timings.duration);
  } else {
    // Chat-completion path (90% of iterations).
    const res = http.post(`${BASE_URL}/v1/chat/completions`, chatBody, chatParams);

    const passed = check(res, {
      'chat: status 200': (r) => r.status === 200,
      'chat: has choices': (r) => {
        let parsed;
        try {
          parsed = JSON.parse(r.body);
        } catch (_) {
          return false; // non-JSON body counts as a failed check
        }
        return Array.isArray(parsed.choices) && parsed.choices.length > 0;
      },
    });

    chatLatency.add(res.timings.duration);
    errorRate.add(!passed);
  }

  // Think time: 0–200ms random (simulates realistic inter-request spacing).
  sleep(Math.random() * 0.2);
}
// ── Setup — verify proxy is reachable before starting ────────────────────────
// Runs once before the VUs start; aborts the whole test if the probe fails.
export function setup() {
  const probe = http.get(`${BASE_URL}/healthz`);
  if (probe.status !== 200) {
    throw new Error(`Proxy not reachable at ${BASE_URL}/healthz — status ${probe.status}`);
  }
  console.log(`Load test starting. Target: ${BASE_URL}, model: ${MODEL}`);
}
// ── Teardown — summary ────────────────────────────────────────────────────────
// `data` is whatever setup() returned (currently nothing); the parameter is
// kept to match k6's teardown signature.
export function teardown(data) {
  console.log('Load test complete. Check thresholds in the summary above.');
}