P0 contexto: ventana por modelo + recuperación ante overflow + self-heal del catálogo
Que las conversaciones largas no se rompan ni gasten de más: Ventana de contexto por modelo (antes: budget estático 120k/200k para todos): - cost.resolve_context_window: lee context_length del catálogo OpenRouter/DeepSeek en Redis, con fallback a litellm. config.budget_for_window deriva el budget de la ventana real (window - max_output - reserve). build_context lo aplica por turno (param model_id) en vez del fijo de settings. - Self-heal del catálogo OpenRouter: el admin panel lo cachea con TTL 1h y solo lo repuebla al abrir su ventana de IA → en runtime caducaba y se perdían ventana y precio. Ahora cost._get_catalog lo refresca solo (fetch público, mismo shape, cooldown 5min, TTL 24h). Arregla también el coste (caía al fijo). Recuperación ante overflow: - adapters.base.ContextOverflowError; openai_adapter traduce el error de context-length del proveedor (init e iteración del stream). - base.py: retry proactivo que recompacta hasta caber en la ventana ANTES de llamar al LLM; si ni así cabe → error accionable (no rompe la sesión). - engine.py: mensaje user-facing claro (modelo + ventana). Tests: ventana/budget, self-heal (mockeado), overflow, y sesión REAL de Redis. 106 verdes. evals/: harness para evaluar al agente acai-code (driver + README + resultados). Comparativa kimi vs deepseek vs glm (deepseek-v4-pro high = mejor calidad/precio). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -65,6 +65,128 @@ class TestSettingsBudget:
|
||||
assert cfg.effective_context_budget == 172_000
|
||||
assert cfg.effective_compaction_threshold == 137_600
|
||||
|
||||
def test_budget_for_window_small_and_large(self):
|
||||
cfg = Settings(
|
||||
context_max_tokens=0,
|
||||
model_max_output_tokens=4_096,
|
||||
context_reserve_ratio=0.10,
|
||||
_env_file=None,
|
||||
)
|
||||
# 32k: window - max_output - 10% reserve
|
||||
assert cfg.budget_for_window(32_000) == 32_000 - 4_096 - 3_200
|
||||
# 1M: budget mucho mayor (no compacta innecesariamente)
|
||||
assert cfg.budget_for_window(1_000_000) == 1_000_000 - 4_096 - 100_000
|
||||
# ventana inválida → fallback al budget estático
|
||||
assert cfg.budget_for_window(0) == cfg.effective_context_budget
|
||||
|
||||
def test_compaction_threshold_for_uses_ratio(self):
|
||||
cfg = Settings(
|
||||
compaction_threshold_tokens=0,
|
||||
compaction_threshold_ratio=0.80,
|
||||
_env_file=None,
|
||||
)
|
||||
assert cfg.compaction_threshold_for(100_000) == 80_000
|
||||
|
||||
|
||||
class TestContextWindowResolution:
|
||||
def test_resolve_window_from_catalog(self, monkeypatch):
|
||||
import json
|
||||
from src.orchestrator import cost
|
||||
|
||||
cost._window_cache.clear()
|
||||
|
||||
class _FakeRedis:
|
||||
async def get(self, key):
|
||||
return json.dumps([
|
||||
{"id": "kimi-k2.7-code", "context_length": 256_000},
|
||||
{"id": "otro", "context_length": 32_000},
|
||||
])
|
||||
|
||||
monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _FakeRedis())
|
||||
w = asyncio.run(cost.resolve_context_window("openrouter/kimi-k2.7-code"))
|
||||
assert w == 256_000
|
||||
# segunda llamada usa cache (no peta aunque cambie el fake)
|
||||
assert asyncio.run(cost.resolve_context_window("openrouter/kimi-k2.7-code")) == 256_000
|
||||
|
||||
def test_resolve_window_miss_is_none_or_int(self, monkeypatch):
|
||||
from src.orchestrator import cost
|
||||
|
||||
cost._window_cache.clear()
|
||||
|
||||
class _FakeRedis:
|
||||
async def get(self, key):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _FakeRedis())
|
||||
w = asyncio.run(cost.resolve_context_window("openrouter/modelo-inexistente-xyz"))
|
||||
assert w is None or isinstance(w, int)
|
||||
|
||||
def test_resolve_window_ignores_non_litellm_ids(self):
|
||||
from src.orchestrator import cost
|
||||
|
||||
cost._window_cache.clear()
|
||||
assert asyncio.run(cost.resolve_context_window("sin-prefijo")) is None
|
||||
assert asyncio.run(cost.resolve_context_window(None)) is None
|
||||
|
||||
def test_resolve_window_self_heals_when_catalog_missing(self, monkeypatch):
|
||||
"""Si el catálogo OpenRouter caducó, se repuebla en runtime (self-heal)."""
|
||||
from src.orchestrator import cost
|
||||
|
||||
cost._window_cache.clear()
|
||||
cost._or_last_refresh[0] = 0.0 # desactivar cooldown para el test
|
||||
store = {}
|
||||
|
||||
class _FakeRedis:
|
||||
async def get(self, key):
|
||||
return store.get(key)
|
||||
|
||||
async def set(self, key, val, ex=None):
|
||||
store[key] = val
|
||||
|
||||
monkeypatch.setattr(cost, "_get_cfg_redis", lambda: _FakeRedis())
|
||||
monkeypatch.setattr(
|
||||
cost, "_fetch_openrouter_catalog_sync",
|
||||
lambda: [{"id": "moonshotai/kimi-x", "context_length": 262_144,
|
||||
"price_in_1m": 0.6, "price_out_1m": 3.0}],
|
||||
)
|
||||
|
||||
w = asyncio.run(cost.resolve_context_window("openrouter/moonshotai/kimi-x"))
|
||||
assert w == 262_144
|
||||
# quedó repoblado en el cache para futuras lecturas
|
||||
assert "acai:config:ai:models_cache:openrouter" in store
|
||||
|
||||
|
||||
class TestModelAwareBudget:
|
||||
def test_build_context_uses_model_window_budget(self, monkeypatch):
|
||||
from src.orchestrator import cost
|
||||
|
||||
async def _fake_window(model_id):
|
||||
return 40_000
|
||||
|
||||
monkeypatch.setattr(cost, "resolve_context_window", _fake_window)
|
||||
session = SessionState(immutable_rules=["No romper"])
|
||||
session.begin_task("hola")
|
||||
agent = AgentProfile(role="acai", name="Acai", system_prompt="Haz el trabajo.")
|
||||
|
||||
pkg = asyncio.run(
|
||||
ContextEngine().build_context(
|
||||
session=session, agent=agent, model_id="openrouter/m"
|
||||
)
|
||||
)
|
||||
assert pkg.budget_tokens == settings.budget_for_window(40_000)
|
||||
|
||||
def test_budget_override_wins(self):
|
||||
session = SessionState(immutable_rules=["No romper"])
|
||||
session.begin_task("hola")
|
||||
agent = AgentProfile(role="acai", name="Acai", system_prompt="Haz el trabajo.")
|
||||
|
||||
pkg = asyncio.run(
|
||||
ContextEngine().build_context(
|
||||
session=session, agent=agent, budget_override=12_345
|
||||
)
|
||||
)
|
||||
assert pkg.budget_tokens == 12_345
|
||||
|
||||
|
||||
class TestContextEngine:
|
||||
def test_build_context_keeps_task_history_and_current_task(self):
|
||||
|
||||
110
tests/test_context_real_session.py
Normal file
110
tests/test_context_real_session.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Test de integración contra sesiones REALES de Redis (db 1).
|
||||
|
||||
Valida el budget por-ventana y la compactación sobre las conversaciones reales
|
||||
del agentic (las que los usuarios mantienen abiertas), no sobre fixtures
|
||||
sintéticos. Es OPT-IN: se salta si no hay Redis disponible o no hay sesiones,
|
||||
para no acoplar la suite a datos de cliente ni romper en CI.
|
||||
|
||||
Ejecutar contra el Redis real:
|
||||
docker run --rm --network acai-net \\
|
||||
-v "$PWD/agenticSystem/src:/app/src" -v "$PWD/agenticSystem/tests:/app/tests" \\
|
||||
-e AGENTIC_REDIS_HOST=redis -w /app acai-vscode-plugin-agentic \\
|
||||
sh -lc "pip install -q pytest pytest-asyncio; python -m pytest tests/test_context_real_session.py -q"
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import enum
|
||||
import json
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
if not hasattr(enum, "StrEnum"):
|
||||
class _CompatStrEnum(str, enum.Enum):
|
||||
pass
|
||||
|
||||
enum.StrEnum = _CompatStrEnum
|
||||
|
||||
for _name, _attr in (("anthropic", "AsyncAnthropic"), ("openai", "AsyncOpenAI")):
|
||||
if _name not in sys.modules:
|
||||
_stub = types.ModuleType(_name)
|
||||
setattr(_stub, _attr, type("_Stub", (), {}))
|
||||
sys.modules[_name] = _stub
|
||||
|
||||
from src.config import settings
|
||||
from src.context.compactor import estimate_tokens
|
||||
from src.context.engine import ContextEngine
|
||||
from src.models.agent import AgentProfile
|
||||
from src.models.session import SessionState
|
||||
|
||||
|
||||
def _load_largest_real_session():
|
||||
"""Mayor sesión real de Redis db 1, o None si no hay acceso/sesiones."""
|
||||
try:
|
||||
import redis
|
||||
|
||||
r = redis.Redis(
|
||||
host=settings.redis_host,
|
||||
port=settings.redis_port,
|
||||
db=1,
|
||||
password=settings.redis_password or None,
|
||||
decode_responses=True,
|
||||
socket_connect_timeout=2,
|
||||
)
|
||||
keys = [
|
||||
k for k in r.scan_iter("agentic:session:*")
|
||||
if not k.endswith((":events", ":artifacts"))
|
||||
]
|
||||
if not keys:
|
||||
return None
|
||||
biggest = max(keys, key=lambda k: r.strlen(k))
|
||||
raw = r.get(biggest)
|
||||
return json.loads(raw) if raw else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def test_real_session_compacts_under_model_window(monkeypatch):
|
||||
data = _load_largest_real_session()
|
||||
if not data or not data.get("recent_messages"):
|
||||
pytest.skip("sin Redis/sesiones reales disponibles")
|
||||
|
||||
rm = data["recent_messages"]
|
||||
raw_tokens = sum(estimate_tokens(json.dumps(m)) for m in rm)
|
||||
|
||||
from src.orchestrator import cost
|
||||
|
||||
async def _fake_window(model_id):
|
||||
return 32_000
|
||||
|
||||
monkeypatch.setattr(cost, "resolve_context_window", _fake_window)
|
||||
|
||||
session = SessionState(
|
||||
immutable_rules=data.get("immutable_rules") or ["No romper"],
|
||||
project_profile=data.get("project_profile") or {},
|
||||
task_history=data.get("task_history") or [],
|
||||
recent_messages=rm,
|
||||
)
|
||||
session.begin_task("Sigamos con lo anterior")
|
||||
agent = AgentProfile(
|
||||
role="acai",
|
||||
name="Acai",
|
||||
system_prompt="Haz el trabajo.",
|
||||
context_sections=["immutable_rules", "task_state"],
|
||||
)
|
||||
|
||||
pkg = asyncio.run(
|
||||
ContextEngine().build_context(
|
||||
session=session, agent=agent, conversation=rm, model_id="openrouter/x"
|
||||
)
|
||||
)
|
||||
|
||||
# Budget derivado de la ventana REAL del modelo (32k), no del fijo de 120k/200k.
|
||||
assert pkg.budget_tokens == settings.budget_for_window(32_000)
|
||||
# La sesión real se compactó de verdad (no se reenvía cruda).
|
||||
assert pkg.total_token_estimate < raw_tokens
|
||||
# Y el resultado cabe en el budget del modelo → no habría overflow.
|
||||
assert pkg.total_token_estimate <= pkg.budget_tokens
|
||||
93
tests/test_overflow_recovery.py
Normal file
93
tests/test_overflow_recovery.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Tests de recuperación ante overflow de ventana de contexto.
|
||||
|
||||
Cubre: detección del error de context-length del proveedor, y el envoltorio del
|
||||
adapter que lo traduce a `ContextOverflowError` (dominio) tanto si salta al
|
||||
iniciar el stream como durante la iteración.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import enum
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
if not hasattr(enum, "StrEnum"):
|
||||
class _CompatStrEnum(str, enum.Enum):
|
||||
pass
|
||||
|
||||
enum.StrEnum = _CompatStrEnum
|
||||
|
||||
if "anthropic" not in sys.modules:
|
||||
anthropic_stub = types.ModuleType("anthropic")
|
||||
anthropic_stub.AsyncAnthropic = type("_AsyncAnthropic", (), {})
|
||||
sys.modules["anthropic"] = anthropic_stub
|
||||
|
||||
if "openai" not in sys.modules:
|
||||
openai_stub = types.ModuleType("openai")
|
||||
openai_stub.AsyncOpenAI = type("_AsyncOpenAI", (), {})
|
||||
sys.modules["openai"] = openai_stub
|
||||
|
||||
from src.adapters.base import ContextOverflowError
|
||||
from src.adapters.openai_adapter import OpenAIAdapter, _is_context_overflow
|
||||
|
||||
|
||||
class TestOverflowDetection:
|
||||
def test_detects_by_message(self):
|
||||
assert _is_context_overflow(
|
||||
Exception("This model's maximum context length is 8192 tokens, however you requested 9000")
|
||||
)
|
||||
assert _is_context_overflow(Exception("context_length_exceeded"))
|
||||
assert _is_context_overflow(Exception("Please reduce the length of the messages"))
|
||||
|
||||
def test_does_not_flag_unrelated_errors(self):
|
||||
assert not _is_context_overflow(Exception("rate limit exceeded"))
|
||||
assert not _is_context_overflow(Exception("invalid api key"))
|
||||
|
||||
def test_detects_by_type_name(self):
|
||||
class ContextWindowExceededError(Exception):
|
||||
pass
|
||||
|
||||
assert _is_context_overflow(ContextWindowExceededError("boom"))
|
||||
|
||||
|
||||
class TestStreamWrapperMapsOverflow:
|
||||
def _make_adapter(self):
|
||||
# Saltamos __init__ (no necesitamos el cliente AsyncOpenAI: parcheamos
|
||||
# _stream_impl). Así el test no depende del stub de openai.
|
||||
return OpenAIAdapter.__new__(OpenAIAdapter)
|
||||
|
||||
def test_overflow_at_stream_init_becomes_domain_error(self, monkeypatch):
|
||||
adapter = self._make_adapter()
|
||||
|
||||
async def _impl(messages, tools=None, config=None):
|
||||
raise RuntimeError("maximum context length is 32768 tokens")
|
||||
yield # noqa: hace de esto un async generator
|
||||
|
||||
monkeypatch.setattr(adapter, "_stream_impl", _impl)
|
||||
|
||||
async def _run():
|
||||
async for _ in adapter.stream([{"role": "user", "content": "hola"}]):
|
||||
pass
|
||||
|
||||
with pytest.raises(ContextOverflowError):
|
||||
asyncio.run(_run())
|
||||
|
||||
def test_non_overflow_error_propagates_unchanged(self, monkeypatch):
|
||||
adapter = self._make_adapter()
|
||||
|
||||
async def _impl(messages, tools=None, config=None):
|
||||
raise RuntimeError("connection reset by peer")
|
||||
yield
|
||||
|
||||
monkeypatch.setattr(adapter, "_stream_impl", _impl)
|
||||
|
||||
async def _run():
|
||||
async for _ in adapter.stream([{"role": "user", "content": "hola"}]):
|
||||
pass
|
||||
|
||||
with pytest.raises(RuntimeError) as exc:
|
||||
asyncio.run(_run())
|
||||
assert not isinstance(exc.value, ContextOverflowError)
|
||||
Reference in New Issue
Block a user