Files
agenticSystem/tests/test_context_budget.py
Jordan 651d61b096 P0 contexto: ventana por modelo + recuperación ante overflow + self-heal del catálogo
Que las conversaciones largas no se rompan ni gasten de más:

Ventana de contexto por modelo (antes: budget estático 120k/200k para todos):
- cost.resolve_context_window: lee context_length del catálogo OpenRouter/DeepSeek
  en Redis, con fallback a litellm. config.budget_for_window deriva el budget de
  la ventana real (window - max_output - reserve). build_context lo aplica por
  turno (param model_id) en vez del fijo de settings.
- Self-heal del catálogo OpenRouter: el admin panel lo cachea con TTL 1h y solo lo
  repuebla al abrir su ventana de IA → en runtime caducaba y se perdían ventana y
  precio. Ahora cost._get_catalog lo refresca solo (fetch público, mismo shape,
  cooldown 5min, TTL 24h). Arregla también el coste (caía al fijo).

Recuperación ante overflow:
- adapters.base.ContextOverflowError; openai_adapter traduce el error de
  context-length del proveedor (init e iteración del stream).
- base.py: retry proactivo que recompacta hasta caber en la ventana ANTES de
  llamar al LLM; si ni así cabe → error accionable (no rompe la sesión).
- engine.py: mensaje user-facing claro (modelo + ventana).

Tests: ventana/budget, self-heal (mockeado), overflow, y sesión REAL de Redis. 106 verdes.

evals/: harness para evaluar al agente acai-code (driver + README + resultados).
Comparativa kimi vs deepseek vs glm (deepseek-v4-pro high = mejor calidad/precio).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-20 13:48:19 +01:00

620 lines
24 KiB
Python

"""Tests para budget efectivo de contexto e integracion del ContextEngine."""
from __future__ import annotations
import asyncio
import enum
import sys
import types
import pytest
if not hasattr(enum, "StrEnum"):
    # Interpreters older than 3.11 ship no ``enum.StrEnum``; register a
    # backport on the ``enum`` module so ``enum.StrEnum`` resolves here too.
    class _CompatStrEnum(str, enum.Enum):
        """Backport of ``enum.StrEnum`` for Python versions before 3.11."""

        # The stdlib StrEnum renders as its value. Without this, a plain
        # (str, Enum) mixin renders as "ClassName.MEMBER".
        __str__ = str.__str__

        # The stdlib StrEnum makes ``enum.auto()`` yield the lower-cased
        # member name instead of an incrementing integer.
        def _generate_next_value_(name, start, count, last_values):
            return name.lower()

    enum.StrEnum = _CompatStrEnum
if "anthropic" not in sys.modules:
anthropic_stub = types.ModuleType("anthropic")
class _AsyncAnthropic:
pass
anthropic_stub.AsyncAnthropic = _AsyncAnthropic
sys.modules["anthropic"] = anthropic_stub
if "openai" not in sys.modules:
openai_stub = types.ModuleType("openai")
class _AsyncOpenAI:
pass
openai_stub.AsyncOpenAI = _AsyncOpenAI
sys.modules["openai"] = openai_stub
from src.config import Settings, settings
from src.context.compactor import ContextCompactor
from src.context.engine import ContextEngine
from src.models.agent import AgentProfile
from src.models.artifacts import ArtifactSummary
from src.models.session import SessionState
from src.orchestrator.engine import OrchestratorEngine
from src.orchestrator.agents.base import BaseAgent
class TestSettingsBudget:
    """Checks for the token-budget arithmetic exposed by ``Settings``."""

    def test_effective_budget_uses_explicit_override(self):
        """A non-zero ``context_max_tokens`` is reported as the effective budget."""
        overrides = {
            "context_max_tokens": 120_000,
            "model_context_window": 200_000,
            "model_max_output_tokens": 8_192,
        }
        conf = Settings(_env_file=None, **overrides)
        assert conf.effective_context_budget == 120_000

    def test_effective_budget_uses_model_window_when_no_override(self):
        """With ``context_max_tokens=0`` the budget is derived from the model window."""
        conf = Settings(
            _env_file=None,
            context_max_tokens=0,
            model_context_window=200_000,
            model_max_output_tokens=8_000,
            context_reserve_ratio=0.10,
        )
        # 20_000 is 10% of the window; 172_000 = 200_000 - 8_000 - 20_000.
        expected = {
            "reserve_tokens": 20_000,
            "effective_context_budget": 172_000,
            "effective_compaction_threshold": 137_600,
        }
        for attr, value in expected.items():
            assert getattr(conf, attr) == value, attr

    def test_budget_for_window_small_and_large(self):
        """``budget_for_window`` scales with the window and falls back on a bad one."""
        max_output = 4_096
        conf = Settings(
            _env_file=None,
            context_max_tokens=0,
            model_max_output_tokens=max_output,
            context_reserve_ratio=0.10,
        )
        # 32k: window - max_output - 10% reserve
        small = conf.budget_for_window(32_000)
        assert small == 32_000 - max_output - 3_200
        # 1M: a much larger budget (no needless compaction)
        large = conf.budget_for_window(1_000_000)
        assert large == 1_000_000 - max_output - 100_000
        # invalid window -> falls back to the static budget
        fallback = conf.budget_for_window(0)
        assert fallback == conf.effective_context_budget

    def test_compaction_threshold_for_uses_ratio(self):
        """With no absolute threshold configured, the ratio is applied to the budget."""
        conf = Settings(
            _env_file=None,
            compaction_threshold_tokens=0,
            compaction_threshold_ratio=0.80,
        )
        threshold = conf.compaction_threshold_for(100_000)
        assert threshold == 80_000
class TestContextWindowResolution:
    """Tests for ``cost.resolve_context_window`` against a faked config Redis."""

    def test_resolve_window_from_catalog(self, monkeypatch):
        """The window is read from the catalog entry, then served from the cache."""
        import json
        from src.orchestrator import cost

        cost._window_cache.clear()

        def _redis_reporting(kimi_window):
            """Build a fake Redis whose catalog lists ``kimi_window`` for the model."""

            class _FakeRedis:
                async def get(self, key):
                    return json.dumps([
                        {"id": "kimi-k2.7-code", "context_length": kimi_window},
                        {"id": "otro", "context_length": 32_000},
                    ])

            return _FakeRedis()

        monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _redis_reporting(256_000))
        w = asyncio.run(cost.resolve_context_window("openrouter/kimi-k2.7-code"))
        assert w == 256_000

        # Swap the fake for one that reports a different window. The second
        # lookup is expected to be answered from the cache, so the first value
        # must still come back. Leaving the fake unchanged would make this
        # assertion pass even without any caching.
        monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _redis_reporting(1_000))
        assert asyncio.run(cost.resolve_context_window("openrouter/kimi-k2.7-code")) == 256_000

    def test_resolve_window_miss_is_none_or_int(self, monkeypatch):
        """A model missing from the catalog resolves to ``None`` or an integer."""
        from src.orchestrator import cost

        cost._window_cache.clear()

        class _FakeRedis:
            async def get(self, key):
                return None

            async def set(self, key, val, ex=None):
                # Accept a repopulation write without storing anything.
                return None

        monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _FakeRedis())
        # An empty Redis can trigger the catalog self-heal; stub the fetch so
        # this unit test never depends on the real catalog endpoint.
        monkeypatch.setattr(cost, "_fetch_openrouter_catalog_sync", lambda: [])
        w = asyncio.run(cost.resolve_context_window("openrouter/modelo-inexistente-xyz"))
        assert w is None or isinstance(w, int)

    def test_resolve_window_ignores_non_litellm_ids(self):
        """An id without a provider prefix, or ``None``, resolves to ``None``."""
        from src.orchestrator import cost

        cost._window_cache.clear()
        assert asyncio.run(cost.resolve_context_window("sin-prefijo")) is None
        assert asyncio.run(cost.resolve_context_window(None)) is None

    def test_resolve_window_self_heals_when_catalog_missing(self, monkeypatch):
        """If the OpenRouter catalog expired, it is repopulated at runtime (self-heal)."""
        from src.orchestrator import cost

        cost._window_cache.clear()
        cost._or_last_refresh[0] = 0.0  # disable the cooldown for this test
        store = {}

        class _FakeRedis:
            async def get(self, key):
                return store.get(key)

            async def set(self, key, val, ex=None):
                store[key] = val

        monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _FakeRedis())
        monkeypatch.setattr(
            cost, "_fetch_openrouter_catalog_sync",
            lambda: [{"id": "moonshotai/kimi-x", "context_length": 262_144,
                      "price_in_1m": 0.6, "price_out_1m": 3.0}],
        )
        w = asyncio.run(cost.resolve_context_window("openrouter/moonshotai/kimi-x"))
        assert w == 262_144
        # the catalog was written back to the cache for later reads
        assert "acai:config:ai:models_cache:openrouter" in store
class TestModelAwareBudget:
    """``build_context`` budget selection: per-model window vs. explicit override."""

    @staticmethod
    def _session_and_agent():
        """Build the minimal session/agent pair used by both tests."""
        session = SessionState(immutable_rules=["No romper"])
        session.begin_task("hola")
        agent = AgentProfile(role="acai", name="Acai", system_prompt="Haz el trabajo.")
        return session, agent

    def test_build_context_uses_model_window_budget(self, monkeypatch):
        """With a ``model_id``, the budget matches ``budget_for_window`` of its window."""
        from src.orchestrator import cost

        async def _fixed_window(model_id):
            return 40_000

        monkeypatch.setattr(cost, "resolve_context_window", _fixed_window)
        session, agent = self._session_and_agent()
        build = ContextEngine().build_context(
            session=session, agent=agent, model_id="openrouter/m"
        )
        pkg = asyncio.run(build)
        assert pkg.budget_tokens == settings.budget_for_window(40_000)

    def test_budget_override_wins(self):
        """An explicit ``budget_override`` is reported unchanged as the budget."""
        session, agent = self._session_and_agent()
        build = ContextEngine().build_context(
            session=session, agent=agent, budget_override=12_345
        )
        pkg = asyncio.run(build)
        assert pkg.budget_tokens == 12_345
class TestContextEngine:
    """Tests for ``ContextEngine`` prompt assembly and follow-up handling."""

    def test_build_context_keeps_task_history_and_current_task(self):
        """The system prompt carries both the history and current-task headings."""
        previous_task = {
            "task_id": "prev1",
            "objective": "Crear banner",
            "status": "completed",
            "summary": "User: Crear banner → Agent: Banner creado",
            "facts": ["Section: home"],
            "key_data": {"sections": ["home"]},
            "tools_used": ["create_module"],
        }
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            project_profile={"name": "demo"},
            task_history=[previous_task],
        )
        session.begin_task("Actualizar hero")
        agent = AgentProfile(role="acai", name="Acai", system_prompt="Haz el trabajo.")
        ctx = asyncio.run(ContextEngine().build_context(session=session, agent=agent))
        for heading in ("# Session History", "# Current Task"):
            assert heading in ctx.system_prompt

    def test_build_context_includes_artifact_memory_with_task_state_agents(self):
        """Artifact summaries appear in the prompt of an agent with ``task_state``."""
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            project_profile={"name": "demo"},
        )
        task = session.begin_task("Revisar modulo")
        sections = ["immutable_rules", "project_profile", "task_state"]
        agent = AgentProfile(
            role="acai",
            name="Acai",
            system_prompt="Haz el trabajo.",
            context_sections=sections,
        )
        read_file_artifact = ArtifactSummary(
            artifact_id="art-1",
            session_id=session.session_id,
            task_id=task.task_id,
            artifact_type="code",
            title="Output of read_file",
            summary="Resumen del archivo",
            facts=["Status: ok"],
            source_tool="read_file",
            char_count=120,
        )
        ctx = asyncio.run(
            ContextEngine().build_context(
                session=session,
                agent=agent,
                artifacts=[read_file_artifact],
            )
        )
        for fragment in ("## Artifacts", "Resumen del archivo"):
            assert fragment in ctx.system_prompt

    def test_build_messages_prefers_recent_raw_conversation_over_synthetic_history(self):
        """Stored raw turns lead the message list instead of a synthetic history block."""
        question = "Revísame la home y dime qué módulo ves más flojo"
        answer = "El módulo más flojo es Desplegables."
        previous_task = {
            "task_id": "prev1",
            "objective": question,
            "status": "completed",
            "summary": "User: home → Agent: el módulo más flojo es Desplegables",
            "facts": [],
            "key_data": {"sections": ["u30mz"]},
            "tools_used": ["get_module_config_vars"],
        }
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            task_history=[previous_task],
            recent_messages=[
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ],
        )
        session.begin_task("Céntrate solo en ese módulo y dime qué cambiarías")
        built = ContextEngine()._build_messages(session)
        assert built[0]["content"] == question
        assert built[1]["content"] == answer
        assert "[HISTORIAL DE CONVERSACIÓN ANTERIOR" not in built[0]["content"]
        assert "Desplegables" in built[-1]["content"]

    def test_build_context_keeps_recent_raw_conversation_across_tasks(self):
        """Raw turns survive into the next task and the follow-up context is resolved."""
        question = "Revísame la home y dime qué módulo ves más flojo"
        answer = "El módulo más flojo es Desplegables."
        focus = {
            "type": "module",
            "label": "Desplegables",
            "id": "u30mz",
            "role": "primary_focus",
        }
        previous_task = {
            "task_id": "prev1",
            "objective": question,
            "status": "completed",
            "summary": "User: home → Agent: el módulo más flojo es Desplegables",
            "facts": [],
            "key_data": {"sections": ["u30mz"]},
            "tools_used": ["get_module_config_vars"],
            "outcomes": [answer],
            "focus_refs": [focus],
        }
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            recent_messages=[
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ],
            task_history=[previous_task],
        )
        session.begin_task("Céntrate solo en ese módulo y dime qué cambiarías")
        agent = AgentProfile(
            role="acai",
            name="Acai",
            system_prompt="Haz el trabajo.",
            context_sections=["immutable_rules", "task_state"],
        )
        ctx = asyncio.run(ContextEngine().build_context(session=session, agent=agent))
        assert ctx.messages[0]["content"] == question
        assert ctx.messages[1]["content"] == answer
        assert "Resolved Follow-up Context" in ctx.system_prompt
        assert "Desplegables" in ctx.messages[-1]["content"]

    def test_classify_followup_mode_detects_transform_requests(self):
        """A request to rewrite an existing proposal is classified as ``transform``."""
        request = "Hazme una segunda versión más comercial, pero sin cambiar el foco."
        assert ContextEngine._classify_followup_mode(request) == "transform"

    def test_classify_followup_mode_detects_fetch_requests(self):
        """A request to review the current configuration is classified as ``fetch_more``."""
        request = "Céntrate en ese módulo y revisa la configuración actual."
        assert ContextEngine._classify_followup_mode(request) == "fetch_more"

    def test_build_context_sets_transform_followup_mode_in_task_state(self):
        """A transform follow-up is flagged in the prompt together with its guidance."""
        earlier_turns = [
            {"role": "user", "content": "Dame una propuesta para ese módulo"},
            {"role": "assistant", "content": "La propuesta actual es esta."},
        ]
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            recent_messages=earlier_turns,
        )
        session.begin_task("Hazme una segunda versión más comercial, pero sin cambiar el foco.")
        agent = AgentProfile(
            role="acai",
            name="Acai",
            system_prompt="Haz el trabajo.",
            context_sections=["immutable_rules", "task_state"],
        )
        ctx = asyncio.run(ContextEngine().build_context(session=session, agent=agent))
        expected_fragments = (
            "**Follow-up Mode**: transform",
            "No llames herramientas salvo que falte un dato factual critico",
        )
        for fragment in expected_fragments:
            assert fragment in ctx.system_prompt
class TestTaskHistoryTrim:
    """Tests for the ``OrchestratorEngine`` history and recent-message helpers."""

    def test_trim_respects_entry_limit_and_token_budget(self, monkeypatch):
        """Trimming keeps at most three entries, drops the oldest, keeps the newest."""
        monkeypatch.setattr(settings, "task_history_max_entries", 3)
        monkeypatch.setattr(settings, "task_history_max_tokens", 60)

        def _entry(objective, summary):
            # Every history entry in this test has the same empty extras.
            return {
                "objective": objective,
                "summary": summary,
                "facts": [],
                "tools_used": [],
                "key_data": {},
            }

        history = [
            _entry("old", "muy antiguo"),
            _entry("medio", "contenido " * 20),
            _entry("nuevo", "corto"),
            _entry("final", "ultimo"),
        ]
        kept = OrchestratorEngine._trim_task_history(history)
        kept_objectives = [item["objective"] for item in kept]
        assert len(kept) <= 3
        assert kept[-1]["objective"] == "final"
        assert "old" not in kept_objectives

    def test_append_recent_messages_keeps_user_and_raw_turn_messages(self):
        """The new user message and the raw turn are appended after the stored turns."""
        stored = [
            {"role": "user", "content": "Pregunta anterior"},
            {"role": "assistant", "content": "Respuesta anterior"},
        ]
        turn = [
            {"role": "assistant", "content": "Voy a revisarlo."},
            {"role": "tool", "tool_call_id": "tool-1", "content": "resultado tool"},
            {"role": "assistant", "content": "Respuesta final"},
        ]
        merged = OrchestratorEngine._append_recent_messages(
            existing=stored,
            message="Nueva pregunta",
            conversation=turn,
        )
        expected_roles = ["user", "assistant", "user", "assistant", "tool", "assistant"]
        assert [msg["role"] for msg in merged] == expected_roles
        assert merged[2]["content"] == "Nueva pregunta"
        assert merged[4]["tool_call_id"] == "tool-1"
class TestConversationCompaction:
    """Tests for conversation compaction in ``ContextCompactor`` and ``ContextEngine``."""

    @staticmethod
    def _tool_call(call_id):
        """Return the tool-call record that pairs an assistant turn with a tool result."""
        return {
            "id": call_id,
            "type": "function",
            "function": {"name": "t", "arguments": "{}"},
        }

    def test_compactor_preserves_last_user_and_compacts_old_tool_results(self):
        """Only the older tool result is compacted; everything else stays verbatim."""
        compactor = ContextCompactor(max_tokens=999999)
        # The assistant turns carry their tool_calls: without them the
        # `role: tool` messages would be orphans and `_enforce_tool_pairing`
        # would convert them to user messages.
        first_assistant = {
            "role": "assistant",
            "content": "Voy a revisar el modulo ahora mismo. " * 6,
            "tool_calls": [self._tool_call("tool-1")],
        }
        second_assistant = {
            "role": "assistant",
            "content": "He visto el resultado anterior. " * 6,
            "tool_calls": [self._tool_call("tool-2")],
        }
        messages = [
            {"role": "user", "content": "Contexto anterior " * 10},
            first_assistant,
            {"role": "tool", "tool_call_id": "tool-1", "content": "resultado antiguo\n" * 80},
            second_assistant,
            {"role": "tool", "tool_call_id": "tool-2", "content": "resultado reciente\n" * 80},
            {"role": "user", "content": "Este es el ultimo mensaje del usuario y debe quedar intacto."},
        ]
        compacted, meta = compactor.compact_conversation(
            messages,
            max_tokens=420,
            recent_raw_limit=1,
            raw_char_limit=120,
        )
        assert compacted[-1]["content"] == messages[-1]["content"]
        assert compacted[2]["content"].startswith("[TOOL RESULT COMPACTADO]")
        assert compacted[4]["content"].startswith("resultado reciente")
        assert compacted[1]["content"] == messages[1]["content"]
        assert meta["messages_compacted"] > 0
        assert meta["raw_tool_results_kept"] == 1
        assert meta["tool_messages_compacted"] > 0
        assert meta["assistant_messages_compacted"] == 0
        assert meta["user_messages_compacted"] == 0

    def test_engine_reports_conversation_compaction_when_budget_is_small(self, monkeypatch):
        """With a tiny budget the engine's debug record shows compaction happened."""
        tight_settings = {
            "context_max_tokens": 1400,
            "compaction_threshold_tokens": 1,
            "knowledge_base_max_tokens": 0,
            "tool_raw_output_max_chars": 120,
            "conversation_recent_raw_limit": 1,
        }
        for name, value in tight_settings.items():
            monkeypatch.setattr(settings, name, value)
        session = SessionState(immutable_rules=["No romper el proyecto"])
        session.begin_task("Revisar modulo")
        agent = AgentProfile(
            role="acai",
            name="Acai",
            system_prompt="Haz el trabajo.",
            context_sections=["immutable_rules", "task_state"],
        )
        conversation = [
            {"role": "assistant", "content": "Respuesta intermedia " * 25},
            {"role": "tool", "tool_call_id": "tool-1", "content": "resultado antiguo\n" * 80},
            {"role": "assistant", "content": "Segunda respuesta " * 25},
            {"role": "tool", "tool_call_id": "tool-2", "content": "resultado reciente\n" * 80},
        ]
        engine = ContextEngine()
        build = engine.build_context(
            session=session,
            agent=agent,
            conversation=conversation,
        )
        asyncio.run(build)
        debug = engine.get_last_context_debug(session.session_id)
        assert debug is not None
        assert debug["conversation_compaction"]["messages_compacted"] > 0
        assert debug["message_tokens"] <= debug["message_tokens_before_compaction"]

    def test_compactor_only_touches_user_messages_as_last_resort(self):
        """User and assistant text stay verbatim when compacting the tool result suffices."""
        compactor = ContextCompactor(max_tokens=999999)
        # tool_calls on the assistant so the `role: tool` message is not an
        # orphan (the `_enforce_tool_pairing` invariant would convert an
        # orphan to a user message).
        assistant_turn = {
            "role": "assistant",
            "content": "Respuesta previa del asistente " * 6,
            "tool_calls": [self._tool_call("tool-1")],
        }
        messages = [
            {"role": "user", "content": "Contexto previo del usuario " * 8},
            assistant_turn,
            {"role": "tool", "tool_call_id": "tool-1", "content": "resultado viejo\n" * 80},
            {"role": "user", "content": "Ultimo mensaje del usuario"},
        ]
        compacted, meta = compactor.compact_conversation(
            messages,
            max_tokens=420,
            recent_raw_limit=0,
            raw_char_limit=120,
        )
        assert compacted[0]["content"] == messages[0]["content"]
        assert compacted[1]["content"] == messages[1]["content"]
        assert compacted[2]["content"].startswith("[TOOL RESULT COMPACTADO]")
        assert compacted[3]["content"] == messages[3]["content"]
        assert meta["tool_messages_compacted"] > 0
        assert meta["assistant_messages_compacted"] == 0
        assert meta["user_messages_compacted"] == 0
class TestStructuredFollowups:
    """Tests for structured follow-up data: outcomes, focus refs, resolved context."""

    def test_history_entry_extracts_outcomes_and_focus_refs(self):
        """The entry built from an answer names the highlighted module in both fields."""
        answer = "\n".join(
            [
                "## El módulo más flojo",
                "Si tuviera que elegir uno, diría que **Desplegables** es el más problemático.",
                "Recomiendo revisarlo primero.",
            ]
        )
        entry = OrchestratorEngine._build_task_history_entry(
            task_id="task-1",
            message="Revísame la home y dime qué módulo ves más flojo",
            content=answer,
            agent_id="acai",
            facts=[],
            key_data={"sections": ["u30mz"]},
            tool_executions=[],
            artifacts_count=0,
        )
        outcome_hits = ["Desplegables" in outcome for outcome in entry["outcomes"]]
        focus_hits = [ref["label"] == "Desplegables" for ref in entry["focus_refs"]]
        assert any(outcome_hits)
        assert any(focus_hits)

    def test_followup_message_includes_resolved_context(self):
        """The last built message embeds the context resolved from the previous turn."""
        focus = {
            "type": "module",
            "label": "Desplegables",
            "id": "u30mz",
            "role": "primary_focus",
        }
        previous_task = {
            "task_id": "prev1",
            "objective": "Revísame la home y dime qué módulo ves más flojo",
            "status": "completed",
            "summary": "User: home → Agent: el módulo más flojo es Desplegables",
            "facts": [],
            "key_data": {"sections": ["u30mz"]},
            "tools_used": ["get_module_config_vars"],
            "outcomes": ["Si tuviera que elegir uno, diría que Desplegables es el más problemático."],
            "focus_refs": [focus],
        }
        session = SessionState(
            immutable_rules=["No romper el proyecto"],
            task_history=[previous_task],
        )
        session.begin_task("Céntrate solo en ese módulo y dime qué cambiarías")
        built = ContextEngine()._build_messages(session)
        final_content = built[-1]["content"]
        assert "[CONTEXTO RESUELTO DEL TURNO ANTERIOR]" in final_content
        assert "Desplegables" in final_content
class _DummyMCP:
    """Minimal MCP-client stand-in: reports as running and lists two tools."""

    is_running = True

    def get_tool_definitions(self):
        """Return a fresh list with the definitions of ``tool_a`` and ``tool_b``."""
        return [{"name": tool_name} for tool_name in ("tool_a", "tool_b")]
class TestToolGating:
    """Tests for how ``BaseAgent._get_allowed_tools`` reacts to the follow-up mode."""

    @staticmethod
    def _make_agent(allowed_tools):
        """Build a ``BaseAgent`` whose only real collaborator is the dummy MCP client."""
        profile = AgentProfile(role="acai", name="Acai", allowed_tools=allowed_tools)
        return BaseAgent(
            profile=profile,
            model_adapter=None,
            context_engine=None,
            mcp_client=_DummyMCP(),
            memory_store=None,
            sse_emitter=None,
        )

    def test_base_agent_disables_tools_for_transform_followups(self):
        """A ``transform`` follow-up yields no tools, even when some are allowed."""
        agent = self._make_agent(["tool_a", "tool_b"])
        assert agent._get_allowed_tools(followup_mode="transform") == []

    def test_base_agent_keeps_tools_for_non_transform_followups(self):
        """A ``fetch_more`` follow-up yields the definition of the allowed tool."""
        agent = self._make_agent(["tool_a"])
        exposed = agent._get_allowed_tools(followup_mode="fetch_more")
        assert exposed == [{"name": "tool_a"}]