#!/usr/bin/env python3
"""Evaluation driver for the acai-code agent (agentic chat).

Sends ONE user message to the chat endpoint, consumes the SSE stream, logs
each tool call / tool result / error live, and summarizes the turn. Passing
the session id printed by a previous run keeps the SAME conversation going
across several turns.

Usage (inside the docker network, hitting `app` directly with the internal
X-Acai-User auth header):
    docker run --rm --network _acai-net \\
      -v "$PWD/agenticSystem/evals:/data" -v "$PWD/agenticSystem/evals/logs:/logs" \\
      -w /data acai-vscode-plugin-agentic \\
      python /data/driver.py "MESSAGE" "SESSION_ID"

SESSION_ID is optional; omit it (or pass "") to start a new conversation.

Optional environment variables: EVAL_PROJECT (slug), EVAL_USER (default
superadmin), EVAL_APP (base URL, default http://app:9091), EVAL_LOG (log file
path, default /logs/session.log).

Useful for comparing the behaviour/errors of the SAME flow across different
models (switch the active model in the admin panel and repeat). See README.md.
"""
import collections
import http.client
import json
import os
import sys
import time
import urllib.request

APP = os.environ.get("EVAL_APP", "http://app:9091")
USER = os.environ.get("EVAL_USER", "superadmin")
PROJECT = os.environ.get("EVAL_PROJECT", "empleo.cocosolution.com")
LOG = os.environ.get("EVAL_LOG", "/logs/session.log")

# Seconds allowed for connecting and for each blocking read on the stream.
HTTP_TIMEOUT_S = 1200

# Transport-level failures that can surface while opening or reading the stream.
_TRANSPORT_ERRORS = (OSError, http.client.HTTPException)


def log(s):
    """Append one line to the session log file (path taken from LOG)."""
    # Explicit UTF-8: tool inputs/results are dumped with ensure_ascii=False,
    # so the log must not depend on the container's locale.
    with open(LOG, "a", encoding="utf-8") as f:
        f.write(s + "\n")
        f.flush()


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_text(value):
    """Return *value* unchanged if it is a str, otherwise its JSON dump."""
    if isinstance(value, str):
        return value
    return json.dumps(value, ensure_ascii=False)


class TurnState:
    """Everything observed while consuming the stream of a single turn."""

    def __init__(self, session_id=""):
        self.sid = session_id       # last session id announced by the server
        self.text_parts = []        # assistant text fragments, in order
        self.thinking_chars = 0     # total characters of thinking deltas
        self.tool_calls = []        # (name, input_json, tool_use_id)
        self.tool_results = {}      # tool_use_id -> (is_error, content[:500])
        self.errors = []            # "TOOL_ERROR: ..." / "STREAM_ERROR: ..."
        self.usage = {}             # usage dict from the `result` event
        # Count of each tool+input signature: REAL repetitions (same
        # tool+input under a different id).
        self.seen = collections.Counter()
        # IMPORTANT: the agentic backend re-emits the `assistant` snapshot
        # with ALL accumulated blocks after each tool (reconciliation,
        # claude_format.py). Deduplicate by `tool_use` id, otherwise the same
        # tool is counted dozens of times.
        self.seen_ids = set()


def build_request(msg, session_id=""):
    """Build the POST request that sends *msg* to the agentic chat endpoint.

    *session_id* is included in the JSON body only when non-empty.
    """
    body = {"project": PROJECT, "message": msg, "agent_id": "acai", "plan_mode": "off"}
    if session_id:
        body["session_id"] = session_id
    return urllib.request.Request(
        APP + "/api/agentic/chat",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json", "X-Acai-User": USER},
        method="POST",
    )


def parse_sse_line(raw):
    """Decode one raw stream line into an event dict.

    Returns None for anything that is not a `data: ` line carrying a JSON
    object: other SSE fields, blank payloads, malformed JSON, and valid JSON
    that is not an object (which would otherwise break the `.get` calls in
    the event handler).
    """
    line = raw.decode("utf-8", "replace").rstrip("\r\n")
    if not line.startswith("data: "):
        return None
    payload = line[6:].strip()
    if not payload:
        return None
    try:
        ev = json.loads(payload)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return ev if isinstance(ev, dict) else None


def _record_tool_uses(state, ev, emit):
    """Record the not-yet-seen `tool_use` blocks of an `assistant` snapshot."""
    content = _as_dict(ev.get("message")).get("content")
    if not isinstance(content, list):
        return
    for blk in content:
        if not isinstance(blk, dict) or blk.get("type") != "tool_use":
            continue
        bid = blk.get("id") or ""
        if bid:
            if bid in state.seen_ids:
                continue  # reconciliation snapshot re-emits blocks already seen
            state.seen_ids.add(bid)
        name = str(blk.get("name") or "?")
        inp = json.dumps(blk.get("input", {}), ensure_ascii=False)
        sig = name + "|" + inp[:200]
        state.seen[sig] += 1
        state.tool_calls.append((name, inp, bid))
        count = state.seen[sig]
        rep = " [REPETIDA x{}]".format(count) if count >= 2 else ""
        emit(" [{}] TOOL {} {}{}".format(time.strftime("%H:%M:%S"), name, inp[:300], rep))


def _record_tool_result(state, ev, emit):
    """Record a `tool_result` event and flag it when it reports a failure."""
    cstr = _as_text(ev.get("content"))
    # A result is an error when flagged explicitly or when its payload text
    # contains a JSON `"success": false` marker.
    is_err = (
        bool(ev.get("is_error"))
        or '"success": false' in cstr
        or '"success":false' in cstr
    )
    state.tool_results[ev.get("tool_use_id")] = (is_err, cstr[:500])
    emit(" ->{} {}".format(" [ERROR]" if is_err else " ok", cstr[:300]))
    if is_err:
        state.errors.append("TOOL_ERROR: " + cstr[:300])


def handle_event(state, ev, emit=log):
    """Fold one decoded event into *state*.

    *emit* receives the live log lines (defaults to the file logger).
    Returns True when the event is `done`, i.e. the caller should stop
    reading the stream; False otherwise.
    """
    kind = ev.get("type")
    if kind == "done":
        return True
    if kind == "session":
        state.sid = ev.get("session_id") or state.sid
    elif kind == "stream_event":
        inner = _as_dict(ev.get("event"))
        if inner.get("type") == "content_block_delta":
            delta = _as_dict(inner.get("delta"))
            if delta.get("type") == "text_delta" or "text" in delta:
                text = delta.get("text")
                if isinstance(text, str):
                    state.text_parts.append(text)
            elif delta.get("type") == "thinking_delta":
                thinking = delta.get("thinking")
                if isinstance(thinking, str):
                    state.thinking_chars += len(thinking)
    elif kind == "assistant":
        _record_tool_uses(state, ev, emit)
    elif kind == "tool_result":
        _record_tool_result(state, ev, emit)
    elif kind == "result":
        state.usage = _as_dict(ev.get("usage"))
        content = ev.get("content")
        # The final content is only a fallback for when no text was streamed.
        if content and not state.text_parts:
            state.text_parts.append(_as_text(content))
    elif kind == "error":
        err = str(ev.get("error"))
        state.errors.append("STREAM_ERROR: " + err)
        emit(" !! STREAM_ERROR: " + err[:300])
    return False


def report(state, emit=log, out=print):
    """Write the turn summary: two lines to *emit*, the rest to *out*."""
    full_text = "".join(state.text_parts)
    repeated = sum(1 for count in state.seen.values() if count >= 2)
    tokens_in = state.usage.get("input_tokens")
    tokens_out = state.usage.get("output_tokens")

    emit("[ASSISTANT] " + full_text[:1500])
    emit("[resumen] tools={} errores={} repetidas={} thinking~{}c usage in={} out={}".format(
        len(state.tool_calls), len(state.errors), repeated,
        state.thinking_chars, tokens_in, tokens_out))

    out("SESSION_ID={}".format(state.sid))
    out("TOOLS={} ERRORS={} REPEATED={}".format(
        len(state.tool_calls), len(state.errors), repeated))
    for name, inp, tid in state.tool_calls:
        res = state.tool_results.get(tid)
        out(" - {}{} {}".format(name, " [ERR]" if (res and res[0]) else "", inp[:110]))
    for err in state.errors:
        out(" !! " + err[:220])
    out("ASSISTANT: " + full_text[:1400])
    out("USAGE in={} out={}".format(tokens_in, tokens_out))


def main(argv=None):
    """Run one chat turn; return the process exit status.

    0 when the stream was consumed, 1 on an HTTP/transport failure, 2 when
    the message argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("usage: {} MESSAGE [SESSION_ID]".format(argv[0] if argv else "driver.py"),
              file=sys.stderr)
        return 2
    msg = argv[1]
    session_id = argv[2] if len(argv) > 2 else ""

    req = build_request(msg, session_id)
    log("\n" + "=" * 80)
    log("[{}] >>> USER: {}".format(time.strftime("%H:%M:%S"), msg))

    try:
        resp = urllib.request.urlopen(req, timeout=HTTP_TIMEOUT_S)
    except _TRANSPORT_ERRORS + (ValueError,) as exc:
        log("!!! HTTP ERROR: {}".format(exc))
        print("HTTP_ERROR", exc)
        return 1

    state = TurnState(session_id)
    status = 0
    with resp:  # always release the connection, even if the stream breaks
        try:
            for raw in resp:
                ev = parse_sse_line(raw)
                if ev is not None and handle_event(state, ev):
                    break
        except _TRANSPORT_ERRORS as exc:
            # Keep whatever was collected so far: a broken stream is itself
            # a result worth reporting, not a reason to lose the summary.
            state.errors.append("STREAM_ERROR: " + str(exc))
            log(" !! STREAM_ERROR: " + str(exc)[:300])
            status = 1

    report(state)
    return status


if __name__ == "__main__":
    sys.exit(main())