Compare commits

..

2 Commits

Author SHA1 Message Date
borja de9bfba9aa feat: añadir tests de health-check y documentar métricas/ENV
Co-authored-by: aider (openrouter/openai/gpt-5) <aider@aider.chat>
2 months ago
borja a432ee5a4a feat: reducir logs a cambios de estado y exponer métricas de Evolution
Co-authored-by: aider (openrouter/openai/gpt-5) <aider@aider.chat>
2 months ago

@ -88,6 +88,12 @@ ONBOARDING_FALLBACK_MIN_DIGITS=8 # A2: longitud mínima para conservar núme
# METRICS_ENABLED=true
# METRICS_FORMAT=prom # prom|json
# Health check de Evolution API (opcional)
# Intervalo en milisegundos para consultar estado (por defecto 60000 = 60s).
# HEALTH_CHECK_INTERVAL_MS=60000
# Cooldown en milisegundos entre intentos de reinicio (por defecto 900000 = 15 min).
# HEALTH_CHECK_RESTART_COOLDOWN_MS=900000
# Migrador (opcional)
# MIGRATIONS_LOG_LEVEL="silent" # Silencia logs del migrador (en test ya se silencian automáticamente)

@ -80,6 +80,7 @@ Variables clave:
- TZ (por defecto Europe/Madrid).
- REMINDERS_GRACE_MINUTES (ventana de gracia tras la hora; por defecto 60).
- ALLOWED_GROUPS (semilla inicial), NOTIFY_ADMINS_ON_DISCOVERY.
- HEALTH_CHECK_INTERVAL_MS (ms, por defecto 60000) y HEALTH_CHECK_RESTART_COOLDOWN_MS (ms, por defecto 900000).
- METRICS_ENABLED, PORT.
- WEB_BASE_URL (host público de la web para generar enlaces absolutos; usado por /t web).
- Rate limit: RATE_LIMIT_PER_MIN, RATE_LIMIT_BURST.
@ -93,7 +94,13 @@ Consulta:
## Operación y mantenimiento
- /metrics expone contadores y gauges; puede deshabilitarse por configuración.
- /metrics expone contadores y gauges; puede deshabilitarse por configuración. Principales series:
- evolution_instance_state{instance, state} (gauge): vale 1 para el estado actual de Evolution (open/connecting/closed/unreachable…); en cada transición, la serie del estado anterior se pone a 0.
- evolution_instance_last_state_change_ts{instance} (gauge): timestamp epoch (s) del último cambio de estado.
- evolution_instance_state_changes_total{instance} (counter): número de transiciones de estado observadas.
- evolution_instance_restart_attempts_total{instance} (counter): intentos de reinicio cuando el estado no es 'open'.
- evolution_instance_restart_success_total{instance} (counter): reinicios exitosos.
- evolution_health_check_errors_total{instance} (counter): errores HTTP/red al consultar estado.
- Schedulers configurables; se evitan en entornos de test.
- Migraciones up-only al arranque; logging de eventos de migración.
- Copias de seguridad: respaldar el directorio data/ y planificar retención.

@ -1,11 +1,14 @@
import type { Database } from 'bun:sqlite';
import { getDb } from '../db/locator';
import { toIsoSqlUTC } from '../utils/datetime';
import { Metrics } from './metrics';
export class MaintenanceService {
private static _timer: any = null;
private static _healthCheckTimer: any = null;
private static _lastRestartAttempt: number = 0;
private static _lastEvolutionState: string | null = null;
private static _lastStateChangeTs: number = 0;
private static get retentionDays(): number {
const v = Number(process.env.GROUP_MEMBERS_INACTIVE_RETENTION_DAYS);
@ -18,7 +21,7 @@ export class MaintenanceService {
url: process.env.EVOLUTION_API_URL,
instance: process.env.EVOLUTION_API_INSTANCE,
apiKey: process.env.EVOLUTION_API_KEY,
intervalMs: Number(process.env.HEALTH_CHECK_INTERVAL_MS || '120000'), // 2 min por defecto
intervalMs: Number(process.env.HEALTH_CHECK_INTERVAL_MS || '60000'), // 1 min por defecto
restartCooldownMs: Number(process.env.HEALTH_CHECK_RESTART_COOLDOWN_MS || '900000'), // 15 min por defecto
};
}
@ -142,27 +145,63 @@ export class MaintenanceService {
const restartUrl = `${url}/instance/restart/${instance}`;
const headers: HeadersInit = { apikey: String(apiKey || '') };
/**
 * Records the most recently sampled state of the Evolution instance.
 *
 * - Sets the `evolution_instance_state` gauge to 1 for `newState` and, when the
 *   state differs from the previously recorded one, to 0 for the previous state.
 * - Logs only on the first sample or when the state changes (no per-interval noise).
 * - On a change, updates the last-change timestamp gauge (epoch seconds) and
 *   increments the state-changes counter; the first sample only sets the timestamp.
 * - Stores `newState` as the last known state for the next call.
 *
 * Metrics calls are wrapped in best-effort try/catch so a metrics failure never
 * interrupts the health check itself.
 *
 * @param newState State label to record (e.g. the value reported by the API, or 'unreachable').
 */
const recordState = (newState: string) => {
  const prev = this._lastEvolutionState;
  const nowSec = Math.floor(Date.now() / 1000);
  // Label value shared by every series emitted here.
  const instanceLabel = String(instance || '');

  // State gauge: 1 for the current state; 0 for the previous one if it changed.
  try {
    Metrics.set('evolution_instance_state', 1, { instance: instanceLabel, state: newState });
    if (prev && prev !== newState) {
      Metrics.set('evolution_instance_state', 0, { instance: instanceLabel, state: prev });
    }
  } catch {}

  if (!prev) {
    // First sample: log the initial state and stamp the change time (not counted as a transition).
    console.log(`[HealthCheck] Estado inicial de la instancia '${instance}': ${newState}`);
    this._lastStateChangeTs = nowSec;
    try {
      Metrics.set('evolution_instance_last_state_change_ts', nowSec, { instance: instanceLabel });
    } catch {}
  } else if (prev !== newState) {
    // Transition: the separator between both states keeps the log line readable
    // (previously the two state names were printed glued together).
    console.log(`[HealthCheck] Cambio de estado en instancia '${instance}': ${prev} → ${newState}`);
    this._lastStateChangeTs = nowSec;
    try {
      Metrics.set('evolution_instance_last_state_change_ts', nowSec, { instance: instanceLabel });
      Metrics.inc('evolution_instance_state_changes_total', 1, { instance: instanceLabel });
    } catch {}
  }

  this._lastEvolutionState = newState;
};
try {
const response = await fetch(stateUrl, { method: 'GET', headers });
if (!response.ok) {
console.error(`[HealthCheck] Error al consultar estado de Evolution API: ${response.status} ${response.statusText}`);
try { Metrics.inc('evolution_health_check_errors_total', 1, { instance: String(instance || '') }); } catch {}
// Registrar estado como 'unreachable' (sin intentar reinicio aquí)
recordState('unreachable');
return;
}
const data = await response.json();
const currentState = data?.instance?.state;
const currentState = String(data?.instance?.state ?? 'unknown');
console.log(`[HealthCheck] Estado de la instancia '${instance}': ${currentState}`);
// Registrar estado y métricas (sin spam de logs si no cambia)
recordState(currentState);
// Intentar reinicio si no está 'open' y ha pasado el cooldown
if (currentState !== 'open') {
const now = Date.now();
if (now - this._lastRestartAttempt > restartCooldownMs) {
console.warn(`[HealthCheck] La instancia no está 'open'. Estado actual: ${currentState}. Intentando reiniciar...`);
console.warn(`[HealthCheck] La instancia no está 'open' (estado: ${currentState}). Intentando reiniciar...`);
try {
try { Metrics.inc('evolution_instance_restart_attempts_total', 1, { instance: String(instance || '') }); } catch {}
const restartResponse = await fetch(restartUrl, { method: 'PUT', headers });
if (restartResponse.ok) {
console.log(`[HealthCheck] Petición de reinicio para '${instance}' enviada exitosamente.`);
console.log(`[HealthCheck] Petición de reinicio enviada exitosamente para '${instance}'.`);
try { Metrics.inc('evolution_instance_restart_success_total', 1, { instance: String(instance || '') }); } catch {}
this._lastRestartAttempt = now;
} else {
console.error(`[HealthCheck] Fallo al reiniciar la instancia. Status: ${restartResponse.status} ${restartResponse.statusText}`);
@ -171,11 +210,13 @@ export class MaintenanceService {
console.error('[HealthCheck] Error de red al intentar reiniciar la instancia:', restartError);
}
} else {
console.log(`[HealthCheck] La instancia no está 'open', pero esperando cooldown de ${Math.round(restartCooldownMs / 60000)} minutos para no sobrecargar la API.`);
// Reducir ruido: no loguear en cada intervalo si seguimos en el mismo estado
}
}
} catch (error) {
console.error('[HealthCheck] Error de red o inesperado al verificar el estado de la Evolution API:', error);
try { Metrics.inc('evolution_health_check_errors_total', 1, { instance: String(instance || '') }); } catch {}
recordState('unreachable');
}
}
}

@ -1,8 +1,9 @@
import { beforeEach, describe, expect, it } from 'bun:test';
import { beforeEach, describe, expect, it, afterEach } from 'bun:test';
import Database from 'bun:sqlite';
import { initializeDatabase } from '../../../src/db';
import { MaintenanceService } from '../../../src/services/maintenance';
import { toIsoSqlUTC } from '../../../src/utils/datetime';
import { Metrics } from '../../../src/services/metrics';
function makeMem(): any {
const db = new Database(':memory:');
@ -90,3 +91,160 @@ describe('MaintenanceService', () => {
expect(merged).toBe(0);
});
});
// Suite covering the Evolution health check: state gauges, transition counters,
// error accounting and restart attempt/success counters. Network access is
// replaced by a stubbed global fetch and metrics are read back as JSON.
describe('MaintenanceService - Evolution health check', () => {
  // Environment variables touched by the suite; originals are captured before each test.
  const ENV_KEYS = ['METRICS_ENABLED', 'EVOLUTION_API_URL', 'EVOLUTION_API_INSTANCE', 'EVOLUTION_API_KEY', 'HEALTH_CHECK_RESTART_COOLDOWN_MS', 'HEALTH_CHECK_INTERVAL_MS'];
  const savedEnv: Record<string, string | undefined> = {};
  let originalFetch: any;

  // Builds a 200 response shaped like the connection-state payload the service reads.
  const stateOk = (state: string) =>
    new Response(JSON.stringify({ instance: { state } }), { status: 200 });

  // Installs a fetch stub: connection-state requests get `stateResponse()`,
  // restart requests get `restartStatus` (when provided), anything else gets 404.
  const stubFetch = (stateResponse: () => Response, restartStatus?: number) => {
    globalThis.fetch = async (url: any, _init?: any) => {
      const target = String(url);
      if (target.includes('/instance/connectionState/')) {
        return stateResponse();
      }
      if (restartStatus !== undefined && target.includes('/instance/restart/')) {
        return new Response('', { status: restartStatus });
      }
      return new Response('', { status: 404 });
    };
  };

  // Runs one health-check cycle through the (private) static method.
  const runHealthCheck = () => (MaintenanceService as any).performEvolutionHealthCheck();

  // Parses the JSON metrics dump and exposes the labeled gauges/counters maps.
  const readMetrics = () => {
    const stats = JSON.parse(Metrics.render('json'));
    return {
      gauges: stats.labeledGauges || {},
      counters: stats.labeledCounters || {},
    };
  };

  beforeEach(() => {
    // Remember the current environment, then set the minimum needed for metrics + health check.
    for (const key of ENV_KEYS) {
      savedEnv[key] = process.env[key];
    }
    process.env.METRICS_ENABLED = 'true';
    process.env.EVOLUTION_API_URL = 'http://evo';
    process.env.EVOLUTION_API_INSTANCE = 'inst';
    process.env.EVOLUTION_API_KEY = 'key';
    process.env.HEALTH_CHECK_RESTART_COOLDOWN_MS = '0'; // makes restart attempts easy to trigger in tests

    // Start from clean metrics and clean internal health-check state.
    Metrics.reset();
    const service = MaintenanceService as any;
    service._lastEvolutionState = null;
    service._lastStateChangeTs = 0;
    service._lastRestartAttempt = 0;

    // Keep the real fetch so it can be restored afterwards.
    originalFetch = globalThis.fetch;
  });

  afterEach(() => {
    // Put every environment variable back (removing those that were unset).
    for (const key of ENV_KEYS) {
      const previous = savedEnv[key];
      if (previous != null) {
        process.env[key] = previous;
      } else {
        delete (process.env as any)[key];
      }
    }
    // Restore the real fetch and drop metrics recorded by the test.
    globalThis.fetch = originalFetch;
    Metrics.reset();
  });

  it('registra y actualiza métricas en transición de estado (open → closed)', async () => {
    let sampledState = 'open';
    stubFetch(() => stateOk(sampledState), 200);

    // First sample: open.
    await runHealthCheck();
    // Second sample: closed (must register a transition).
    sampledState = 'closed';
    await runHealthCheck();

    const { gauges, counters } = readMetrics();
    expect(gauges.evolution_instance_state['instance="inst",state="closed"']).toBe(1);
    expect(gauges.evolution_instance_state['instance="inst",state="open"']).toBe(0);
    expect(typeof gauges.evolution_instance_last_state_change_ts['instance="inst"']).toBe('number');
    expect(counters.evolution_instance_state_changes_total['instance="inst"']).toBe(1);
  });

  it('no incrementa cambios si el estado se repite (open → open)', async () => {
    stubFetch(() => stateOk('open'), 200);

    await runHealthCheck();
    await runHealthCheck();

    const { gauges, counters } = readMetrics();
    expect(gauges.evolution_instance_state['instance="inst",state="open"']).toBe(1);
    // Either the counter was never created or all of its series sum to zero.
    const changes = counters.evolution_instance_state_changes_total;
    const noChanges =
      !changes || Object.values(changes).reduce((a: number, b: any) => a + Number(b || 0), 0) === 0;
    expect(noChanges).toBe(true);
  });

  it('registra error y marca estado "unreachable" ante HTTP no OK', async () => {
    // No restart handler here: any non-state request falls through to 404.
    stubFetch(() => new Response('err', { status: 500 }));

    await runHealthCheck();

    const { gauges, counters } = readMetrics();
    expect(gauges.evolution_instance_state['instance="inst",state="unreachable"']).toBe(1);
    expect(counters.evolution_health_check_errors_total['instance="inst"']).toBe(1);
  });

  it('incrementa attempts y success al reiniciar cuando el estado no es open', async () => {
    stubFetch(() => stateOk('closed'), 200);

    await runHealthCheck();

    const { counters } = readMetrics();
    expect(counters.evolution_instance_restart_attempts_total['instance="inst"']).toBe(1);
    expect(counters.evolution_instance_restart_success_total['instance="inst"']).toBe(1);
  });

  it('no incrementa success si el reinicio falla', async () => {
    stubFetch(() => stateOk('closed'), 500);

    await runHealthCheck();

    const { counters } = readMetrics();
    expect(counters.evolution_instance_restart_attempts_total['instance="inst"']).toBe(1);
    // The success counter must be absent or have no series for this instance.
    const success = counters.evolution_instance_restart_success_total;
    expect(!success || success['instance="inst"'] == null).toBe(true);
  });
});

Loading…
Cancel
Save