agenticSystem/evals/driver.py

#!/usr/bin/env python3
"""Evaluation driver for the acai-code agent (agentic chat).

Sends ONE user message to the chat, consumes the SSE stream, logs every
tool call/result/error LIVE and summarizes the turn. Pass the session_id back
in to keep the SAME conversation going across several turns.

Usage (inside the docker network; hits `app` directly using the internal
X-Acai-User auth header):

    docker run --rm --network <proj>_acai-net \\
      -v "$PWD/agenticSystem/evals:/data" -v "$PWD/agenticSystem/evals/logs:/logs" \\
      -w /data acai-vscode-plugin-agentic \\
      python /data/driver.py "<mensaje del usuario>" "<session_id opcional>"

The first argument is the user message; the second (optional) argument is the
session_id of the conversation to continue.

Optional environment variables:
    EVAL_PROJECT  project slug (default empleo.cocosolution.com)
    EVAL_USER     value sent in the X-Acai-User header (default superadmin)
    EVAL_APP      base URL of the app (default http://app:9091)
    EVAL_LOG      path of the log file appended to (default /logs/session.log)

Useful for comparing the behavior/errors of the SAME flow across different
models (switch the active model in the admin panel and rerun). See README.md.
"""
import http.client
import json
import os
import sys
import time
import urllib.request

# Runtime configuration; every value can be overridden through the environment.
APP = os.environ.get("EVAL_APP", "http://app:9091")  # base URL of the app
USER = os.environ.get("EVAL_USER", "superadmin")  # sent as X-Acai-User
PROJECT = os.environ.get("EVAL_PROJECT", "empleo.cocosolution.com")  # project slug
LOG = os.environ.get("EVAL_LOG", "/logs/session.log")  # log file appended to

# Command line: <user message> [session_id].
# Fail with a readable usage line instead of an IndexError traceback when the
# message is missing. sys.exit(str) prints to stderr and exits with status 1,
# the same status the uncaught IndexError produced.
if len(sys.argv) < 2:
    sys.exit('usage: driver.py "<user message>" [session_id]')

msg = sys.argv[1]
# Empty string means "start a new conversation".
session_id = sys.argv[2] if len(sys.argv) > 2 else ""


def log(s):
    """Append ``s`` followed by a newline to the log file at ``LOG``.

    The file is opened in append mode and closed again on every call, so each
    entry is handed to the OS before the call returns (closing the file
    flushes it; no explicit flush is needed).

    The encoding is pinned to UTF-8: the script logs JSON dumped with
    ``ensure_ascii=False`` and raw user messages, which would raise
    ``UnicodeEncodeError`` under an ASCII/C locale if the platform default
    encoding were used.

    Args:
        s: Text to write; a trailing newline is added.
    """
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(s + "\n")


# Build the chat request. `session_id` is only included when one was given on
# the command line, i.e. when continuing an existing conversation.
request_body = dict(project=PROJECT, message=msg, agent_id="acai", plan_mode="off")
if session_id:
    request_body.update(session_id=session_id)

request_headers = {"Content-Type": "application/json", "X-Acai-User": USER}
chat_request = urllib.request.Request(
    APP + "/api/agentic/chat",
    data=json.dumps(request_body).encode(),
    headers=request_headers,
    method="POST",
)

# Per-turn state filled in while the stream is consumed and read by the final
# summary.
sid = session_id                                # replaced if the server sends a session id
text_parts, tool_calls, errors = [], [], []     # text chunks / (name, input, id) / error strings
tool_results, usage, seen = {}, {}, {}          # by tool_use_id / token usage / signature -> count
thinking_chars = 0
# IMPORTANT: the agentic backend re-emits the `assistant` snapshot with ALL the
# blocks accumulated so far after every tool (reconciliation,
# claude_format.py). Tool calls must be deduplicated by `tool_use` id,
# otherwise the same tool gets counted dozens of times.
seen_ids = set()

# Turn header in the log, written before the request is sent.
banner = "\n" + "=" * 80
log(banner)
stamp = time.strftime("%H:%M:%S")
log("[{}] >>> USER: {}".format(stamp, msg))

# Open the streaming response (timeout of 1200 seconds). Any failure here is
# logged, echoed on stdout and ends the run with exit status 1.
try:
    resp = urllib.request.urlopen(chat_request, timeout=1200)
except Exception as exc:
    log("!!! HTTP ERROR: {}".format(exc))
    print("HTTP_ERROR", exc)
    sys.exit(1)

# Consume the SSE response line by line, logging tool calls/results/errors as
# they arrive and accumulating the state used by the final summary.
#
# The whole read is wrapped so that a transport failure in the middle of the
# stream (timeout, reset, truncated chunk) is recorded as an error and the
# summary -- including the session id needed to resume -- is still produced.
# The response is always closed.
try:
    for raw in resp:
        line = raw.decode("utf-8", "replace").rstrip("\r\n")
        # In SSE the space after "data:" is optional; accept both forms.
        if not line.startswith("data:"):
            continue
        payload = line[5:].strip()
        if not payload:
            continue
        try:
            ev = json.loads(payload)
        except ValueError:
            continue  # data line that is not JSON: ignore it
        if not isinstance(ev, dict):
            continue  # valid JSON but not an event object
        t = ev.get("type")
        if t == "session":
            sid = ev.get("session_id") or sid
        elif t == "stream_event":
            inner = ev.get("event") or {}
            if inner.get("type") == "content_block_delta":
                d = inner.get("delta") or {}
                # `or ""` guards against explicit null values in the delta.
                if d.get("type") == "text_delta" or "text" in d:
                    text_parts.append(d.get("text") or "")
                elif d.get("type") == "thinking_delta":
                    thinking_chars += len(d.get("thinking") or "")
        elif t == "assistant":
            for blk in (ev.get("message") or {}).get("content") or []:
                if not isinstance(blk, dict) or blk.get("type") != "tool_use":
                    continue
                bid = blk.get("id") or ""
                if bid and bid in seen_ids:
                    continue  # reconciliation snapshot re-emits blocks already seen
                if bid:
                    seen_ids.add(bid)
                name = blk.get("name", "?")
                inp = json.dumps(blk.get("input", {}), ensure_ascii=False)
                # Signature = tool name + first 200 chars of its input.
                sig = name + "|" + inp[:200]
                # REAL repetitions: same tool + input under a different id.
                seen[sig] = seen.get(sig, 0) + 1
                tool_calls.append((name, inp, bid))
                rep = "  [REPETIDA x{}]".format(seen[sig]) if seen[sig] >= 2 else ""
                log("  [{}] TOOL {} {}{}".format(time.strftime("%H:%M:%S"), name, inp[:300], rep))
        elif t == "tool_result":
            tid = ev.get("tool_use_id")
            content = ev.get("content")
            cstr = content if isinstance(content, str) else json.dumps(content, ensure_ascii=False)
            # A result counts as an error when flagged explicitly or when its
            # text contains a `"success": false` marker.
            is_err = bool(ev.get("is_error")) or ('"success": false' in cstr) or ('"success":false' in cstr)
            tool_results[tid] = (is_err, cstr[:500])
            log("       ->{} {}".format(" [ERROR]" if is_err else " ok", cstr[:300]))
            if is_err:
                errors.append("TOOL_ERROR: " + cstr[:300])
        elif t == "result":
            usage = ev.get("usage", {}) or {}
            final = ev.get("content")
            # Only used as the answer when no text deltas were streamed.
            if final and not text_parts:
                # Keep text_parts joinable even if content is structured.
                text_parts.append(final if isinstance(final, str) else json.dumps(final, ensure_ascii=False))
        elif t == "error":
            errors.append("STREAM_ERROR: " + str(ev.get("error")))
            log("  !! STREAM_ERROR: " + str(ev.get("error"))[:300])
        elif t == "done":
            break
except (OSError, http.client.HTTPException) as exc:
    # Transport-level failure while reading; keep what was collected so far.
    detail = "{}: {}".format(type(exc).__name__, exc)
    errors.append("STREAM_ERROR: " + detail)
    log("  !! STREAM_ERROR: " + detail[:300])
finally:
    resp.close()

# Turn summary: one condensed entry in the log file plus a fuller report on
# stdout (session id, per-tool outcome, errors, answer text and token usage).
answer = "".join(text_parts)

# Signatures (tool name + input prefix) that were invoked two or more times.
repeated_sigs = {}
for signature, count in seen.items():
    if count >= 2:
        repeated_sigs[signature] = count

n_tools = len(tool_calls)
n_errors = len(errors)
n_repeated = len(repeated_sigs)
tokens_in = usage.get("input_tokens")
tokens_out = usage.get("output_tokens")

log("[ASSISTANT] " + answer[:1500])
log("[resumen] tools={} errores={} repetidas={} thinking~{}c usage in={} out={}".format(
    n_tools, n_errors, n_repeated, thinking_chars, tokens_in, tokens_out))

print("SESSION_ID={}".format(sid))
print("TOOLS={} ERRORS={} REPEATED={}".format(n_tools, n_errors, n_repeated))
for call_name, call_input, call_id in tool_calls:
    # A call is marked [ERR] only when a result for its id arrived and was
    # flagged as an error.
    outcome = tool_results.get(call_id)
    marker = " [ERR]" if (outcome and outcome[0]) else ""
    print("  - {}{} {}".format(call_name, marker, call_input[:110]))
for problem in errors:
    print("  !! " + problem[:220])
print("ASSISTANT:", answer[:1400])
print("USAGE in={} out={}".format(tokens_in, tokens_out))