Que las conversaciones largas no se rompan ni gasten de más: Ventana de contexto por modelo (antes: budget estático 120k/200k para todos): - cost.resolve_context_window: lee context_length del catálogo OpenRouter/DeepSeek en Redis, con fallback a litellm. config.budget_for_window deriva el budget de la ventana real (window - max_output - reserve). build_context lo aplica por turno (param model_id) en vez del fijo de settings. - Self-heal del catálogo OpenRouter: el admin panel lo cachea con TTL 1h y solo lo repuebla al abrir su ventana de IA → en runtime caducaba y se perdían ventana y precio. Ahora cost._get_catalog lo refresca solo (fetch público, mismo shape, cooldown 5min, TTL 24h). Arregla también el coste (caía al fijo). Recuperación ante overflow: - adapters.base.ContextOverflowError; openai_adapter traduce el error de context-length del proveedor (init e iteración del stream). - base.py: retry proactivo que recompacta hasta caber en la ventana ANTES de llamar al LLM; si ni así cabe → error accionable (no rompe la sesión). - engine.py: mensaje user-facing claro (modelo + ventana). Tests: ventana/budget, self-heal (mockeado), overflow, y sesión REAL de Redis. 106 verdes. evals/: harness para evaluar al agente acai-code (driver + README + resultados). Comparativa kimi vs deepseek vs glm (deepseek-v4-pro high = mejor calidad/precio). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
619 lines
28 KiB
Python
619 lines
28 KiB
Python
"""OpenAI model adapter with full streaming support."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from typing import Any, AsyncIterator
|
|
|
|
from openai import AsyncOpenAI
|
|
|
|
from ..config import settings
|
|
from .base import (
|
|
ContextOverflowError,
|
|
ModelAdapter,
|
|
ModelConfig,
|
|
ModelResponse,
|
|
StreamChunk,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Señales de que el proveedor rechazó por ventana de contexto. Detectamos por
|
|
# tipo (litellm.ContextWindowExceededError) y por mensaje (openai.BadRequestError
|
|
# u otros 400), sin acoplar el adapter a litellm con un import duro.
|
|
_CONTEXT_OVERFLOW_MARKERS = (
|
|
"context_length_exceeded",
|
|
"maximum context length",
|
|
"context window",
|
|
"context length",
|
|
"too many tokens",
|
|
"reduce the length",
|
|
"prompt is too long",
|
|
)
|
|
|
|
|
|
def _is_context_overflow(exc: Exception) -> bool:
|
|
if type(exc).__name__ in ("ContextWindowExceededError",):
|
|
return True
|
|
msg = str(getattr(exc, "message", "") or exc).lower()
|
|
return any(marker in msg for marker in _CONTEXT_OVERFLOW_MARKERS)
|
|
|
|
|
|
def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]:
    """Estimate token usage when the provider does not report it.

    Used for providers (e.g. LiteLLM streaming) that deliver no usage data;
    the result is approximate but avoids billing zero tokens.

    Input tokens are summed over every message: string contents are measured
    directly; for list contents each dict block contributes its ``text``,
    else its ``thinking``, else the stringified ``content``. Non-dict blocks
    and other content types are ignored.

    Returns a dict with ``input_tokens`` and ``output_tokens``.
    """
    from ..context.compactor import estimate_tokens

    def block_text(block: dict[str, Any]) -> Any:
        # First truthy field wins: text, then thinking, then str(content).
        return block.get("text") or block.get("thinking") or str(block.get("content") or "")

    prompt_tokens = 0
    for message in messages:
        body = message.get("content")
        if isinstance(body, str):
            prompt_tokens += estimate_tokens(body)
            continue
        if not isinstance(body, list):
            continue
        for part in body:
            if isinstance(part, dict):
                prompt_tokens += estimate_tokens(block_text(part))

    completion_tokens = estimate_tokens(output_text or "")
    return {"input_tokens": prompt_tokens, "output_tokens": completion_tokens}
|
|
|
|
|
|
class OpenAIAdapter(ModelAdapter):
|
|
"""Adapter for the OpenAI API (GPT-4o, o1, etc.)."""
|
|
|
|
def __init__(self, api_key: str | None = None, base_url: str | None = None) -> None:
    """Create the underlying ``AsyncOpenAI`` client.

    Args:
        api_key: API key; falls back to ``settings.openai_api_key`` when falsy.
        base_url: Endpoint override; falls back to ``settings.openai_base_url``.
            It is only passed to the client when the resolved value is truthy.
    """
    client_kwargs: dict[str, Any] = {"api_key": api_key or settings.openai_api_key}
    endpoint = base_url or settings.openai_base_url
    if endpoint:
        client_kwargs["base_url"] = endpoint
    self._client = AsyncOpenAI(**client_kwargs)
    # The native path keeps the provider's real usage; subclasses that do not
    # get reliable usage while streaming (LiteLLM) set this to True so usage
    # is estimated instead.
    self._estimate_usage_fallback = False
|
|
|
|
async def _acreate(self, kwargs: dict[str, Any]):
    """Perform the model call; the single hook subclasses override.

    Subclasses (e.g. LiteLLMAdapter) replace this to route the request
    through another library without touching the rest of the flow (chunk
    processing, tools, message conversion).
    """
    completions = self._client.chat.completions
    return await completions.create(**kwargs)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Streaming
|
|
# ------------------------------------------------------------------
|
|
|
|
async def stream(
    self,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None = None,
    config: ModelConfig | None = None,
) -> AsyncIterator[StreamChunk]:
    """Stream chunks from ``_stream_impl``, translating context-window errors.

    Any exception raised while starting or iterating the provider stream is
    checked with ``_is_context_overflow``; a match is re-raised as the domain
    ``ContextOverflowError`` (chained to the original), everything else
    propagates untouched. Per the original note, the agent loop relies on
    this to retry with aggressive compaction if nothing was emitted yet.

    Raises:
        ContextOverflowError: the provider rejected the request because of
            its context window.
    """
    try:
        async for piece in self._stream_impl(messages, tools, config):
            yield piece
    except Exception as err:
        # Already a domain error, or unrelated to the context window: re-raise as is.
        if isinstance(err, ContextOverflowError) or not _is_context_overflow(err):
            raise
        detail = getattr(err, "message", "") or err
        raise ContextOverflowError(str(detail)) from err
|
|
|
|
async def _stream_impl(
    self,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None = None,
    config: ModelConfig | None = None,
) -> AsyncIterator[StreamChunk]:
    """Issue a streaming chat completion and yield normalized ``StreamChunk``s.

    Args:
        messages: Conversation in the internal (Anthropic-style) format; it is
            converted with ``_to_openai_messages`` before sending.
        tools: Internal tool definitions, formatted with ``_format_tools``.
        config: Model settings; defaults are built from ``settings`` when falsy.

    Yields:
        Text deltas, thinking deltas (block index 0), incremental tool-call
        fragments, one ``finish_reason="tool_use"`` chunk per accumulated tool
        call (native or parsed from text), an ``end_turn``/other finish chunk,
        and at most one usage payload.
    """
    config = config or ModelConfig(
        model_id=settings.default_model_id,
        max_tokens=settings.max_tokens,
        temperature=settings.temperature,
    )

    kwargs: dict[str, Any] = {
        "model": config.model_id or settings.default_model_id or "gpt-4o",
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "messages": self._to_openai_messages(messages),
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    if getattr(config, "reasoning_effort", ""):
        kwargs["reasoning_effort"] = config.reasoning_effort
    if tools:
        kwargs["tools"] = self._format_tools(tools)

    stream = await self._acreate(kwargs)

    # Tool-calls-as-text fallback: DeepSeek sometimes emits tool calls in its
    # internal DSML format as TEXT (inside content) instead of native
    # tool_calls. The OpenAI endpoint does not convert them, so without this
    # the agent "stops" showing inert DSML. The parser from claude_adapter is
    # reused.
    from .claude_adapter import _parse_xml_tool_calls, _TOOL_CALL_OPEN_RE

    tool_calls_acc: dict[int, dict[str, str]] = {}

    final_usage: dict[str, int] = {}
    usage_emitted = False  # avoids double counting if usage arrives after estimating
    full_content = ""  # accumulated content (for the DSML fallback)
    full_reasoning = ""  # accumulated reasoning (for usage estimation)
    emitted_chars = 0  # how much of full_content was already emitted as delta
    suppress_text = False  # once a text tool call is detected, emit no more text

    # DeepSeek thinking mode: reasoning arrives in `delta.reasoning_content`
    # (before the content). It is emitted as a `thinking` block (block_index 0)
    # so the orchestrator can persist it and `_to_openai_messages` can send it
    # back as `reasoning_content` on the next turn; DeepSeek requires that in
    # multi-turn conversations with tool calls.
    reasoning_seen = False
    reasoning_sig_emitted = False

    async for chunk in stream:
        # With include_usage, the last chunk has usage but no choices.
        # getattr: LiteLLM's chunk (ModelResponseStream) does not always carry
        # a `usage` attribute; the OpenAI SDK one does (None except the last).
        chunk_usage = getattr(chunk, "usage", None)
        if chunk_usage:
            final_usage = {
                "input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0,
                "output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0,
            }

        choice = chunk.choices[0] if chunk.choices else None
        if not choice:
            # Usage-only chunk (last one with include_usage): emit it once.
            if final_usage and not usage_emitted:
                yield StreamChunk(usage=final_usage)
                usage_emitted = True
            continue

        delta = choice.delta

        # Reasoning content (DeepSeek thinking mode). It arrives as an extra
        # field of the delta and is emitted as thinking_delta on block index 0.
        reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None
        if reasoning_txt:
            reasoning_seen = True
            full_reasoning += reasoning_txt
            yield StreamChunk(
                thinking_delta=reasoning_txt,
                block_type="thinking",
                block_index=0,
            )

        # Text content
        if delta and delta.content:
            full_content += delta.content
            if not suppress_text:
                # If a text tool call (DSML/XML) starts, emit what precedes it
                # and stop emitting the rest (the DSML must not be shown).
                m = _TOOL_CALL_OPEN_RE.search(full_content, emitted_chars)
                if m:
                    suppress_text = True
                    if m.start() > emitted_chars:
                        yield StreamChunk(delta=full_content[emitted_chars:m.start()])
                    emitted_chars = len(full_content)
                else:
                    yield StreamChunk(delta=full_content[emitted_chars:])
                    emitted_chars = len(full_content)

        # Tool calls: fragments are keyed by `index` and accumulated.
        if delta and delta.tool_calls:
            for tc in delta.tool_calls:
                idx = tc.index
                if idx not in tool_calls_acc:
                    tool_calls_acc[idx] = {
                        "id": tc.id or "",
                        "name": "",
                        "arguments": "",
                    }
                if tc.id:
                    tool_calls_acc[idx]["id"] = tc.id
                if tc.function and tc.function.name:
                    tool_calls_acc[idx]["name"] = tc.function.name
                    yield StreamChunk(
                        tool_call_id=tc.id or tool_calls_acc[idx]["id"],
                        tool_name=tc.function.name,
                    )
                if tc.function and tc.function.arguments:
                    tool_calls_acc[idx]["arguments"] += tc.function.arguments
                    yield StreamChunk(
                        tool_call_id=tool_calls_acc[idx]["id"],
                        tool_name=tool_calls_acc[idx]["name"],
                        tool_arguments=tc.function.arguments,
                    )

        # Finish
        if choice.finish_reason:
            # Close the reasoning block (if any) with a synthetic signature:
            # the orchestrator discards thinking blocks without a signature
            # (protection for MiniMax/Anthropic). DeepSeek does not use
            # signatures; this marker only prevents the discard and is never
            # sent back, because `_to_openai_messages` maps the block to
            # `reasoning_content`.
            if reasoning_seen and not reasoning_sig_emitted:
                reasoning_sig_emitted = True
                yield StreamChunk(
                    thinking_signature="deepseek-reasoning",
                    block_type="thinking",
                    block_index=0,
                )
            # Usage fallback: some providers via LiteLLM do not deliver the
            # usage chunk (or it arrives after the orchestrator's break), which
            # would bill 0. Estimate by tokens to avoid under-charging, but
            # only if the adapter asks for it (LiteLLM); the native path keeps
            # the provider's real usage.
            if self._estimate_usage_fallback and not final_usage and not usage_emitted:
                final_usage = _estimate_usage(messages, full_content + "\n" + full_reasoning)
            # IMPORTANT: DeepSeek (OpenAI endpoint) sometimes closes the stream
            # with finish_reason="stop" EVEN THOUGH it emitted tool_calls.
            # Trusting only =="tool_calls" would lose them: the agent announces
            # the action in text and "stops" without running it. So tool_use is
            # fired WHENEVER tool calls were accumulated, whatever the
            # finish_reason.
            if tool_calls_acc:
                for acc in tool_calls_acc.values():
                    yield StreamChunk(
                        tool_call_id=acc["id"],
                        tool_name=acc["name"],
                        tool_arguments=acc["arguments"],
                        finish_reason="tool_use",
                    )
                # Emit usage after tool_use chunks
                if final_usage and not usage_emitted:
                    yield StreamChunk(usage=final_usage)
                    usage_emitted = True
            else:
                # Fallback: DeepSeek may have emitted the tool calls as TEXT
                # (DSML/XML) instead of natively. Parse the content and, if
                # there are tool calls, run them anyway; otherwise close the
                # turn.
                text_calls = _parse_xml_tool_calls(full_content) if full_content else []
                if text_calls:
                    for c in text_calls:
                        yield StreamChunk(
                            tool_call_id=c["id"],
                            tool_name=c["name"],
                            tool_arguments=json.dumps(c.get("arguments", {}), ensure_ascii=False),
                            finish_reason="tool_use",
                        )
                    if final_usage and not usage_emitted:
                        yield StreamChunk(usage=final_usage)
                        usage_emitted = True
                else:
                    yield StreamChunk(
                        finish_reason="end_turn"
                        if choice.finish_reason in ("stop", "tool_calls")
                        else choice.finish_reason,
                        usage=final_usage if not usage_emitted else {},
                    )
                    # Only mark usage as emitted when a non-empty payload was
                    # actually sent. With include_usage the provider's usage
                    # usually arrives in a later choices-less chunk; flagging
                    # it here unconditionally made that chunk be dropped.
                    usage_emitted = usage_emitted or bool(final_usage)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Non-streaming
|
|
# ------------------------------------------------------------------
|
|
|
|
async def complete(
    self,
    messages: list[dict[str, Any]],
    tools: list[dict[str, Any]] | None = None,
    config: ModelConfig | None = None,
) -> ModelResponse:
    """Run one non-streaming chat completion and return a ``ModelResponse``.

    Args:
        messages: Conversation in the internal format (converted with
            ``_to_openai_messages``).
        tools: Internal tool definitions, formatted with ``_format_tools``.
        config: Model settings; defaults are built from ``settings`` when
            falsy. ``config.extra["force_tool"]`` forces a specific function
            through ``tool_choice``.

    Raises:
        ContextOverflowError: the provider rejected the request because of
            its context window (detected with ``_is_context_overflow``).
    """
    if not config:
        config = ModelConfig(
            model_id=settings.default_model_id,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )

    request: dict[str, Any] = {
        "model": config.model_id or settings.default_model_id or "gpt-4o",
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "messages": self._to_openai_messages(messages),
    }
    if getattr(config, "reasoning_effort", ""):
        request["reasoning_effort"] = config.reasoning_effort
    if tools:
        request["tools"] = self._format_tools(tools)
    # Forces the model to use a specific tool to guarantee schema-shaped JSON
    # (used by /completions with json_schema). See ClaudeAdapter for the variant.
    # NOTE(review): the flattened source lost the nesting of this block; it is
    # placed at function level (not under `if tools:`) — confirm.
    forced_name = (config.extra or {}).get("force_tool")
    if forced_name:
        request["tool_choice"] = {
            "type": "function",
            "function": {"name": forced_name},
        }

    try:
        response = await self._acreate(request)
    except Exception as err:
        # Already a domain error, or unrelated to the context window: re-raise as is.
        if isinstance(err, ContextOverflowError) or not _is_context_overflow(err):
            raise
        raise ContextOverflowError(str(getattr(err, "message", "") or err)) from err

    first_choice = response.choices[0]
    reply = first_choice.message

    text = reply.content or ""
    parsed_calls: list[dict[str, Any]] = [
        {
            "id": call.id,
            "name": call.function.name,
            # Arguments arrive as a JSON string; an empty string means no arguments.
            "arguments": json.loads(call.function.arguments) if call.function.arguments else {},
        }
        for call in (reply.tool_calls or [])
    ]

    reported = response.usage
    return ModelResponse(
        content=text,
        tool_calls=parsed_calls,
        finish_reason=first_choice.finish_reason or "",
        usage={
            "input_tokens": reported.prompt_tokens if reported else 0,
            "output_tokens": reported.completion_tokens if reported else 0,
        },
        raw=response,
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Token counting
|
|
# ------------------------------------------------------------------
|
|
|
|
async def count_tokens(self, text: str) -> int:
    """Return the token count for *text* as computed by the compactor's ``estimate_tokens``."""
    from ..context.compactor import estimate_tokens

    token_count = estimate_tokens(text)
    return token_count
|
|
|
|
# ------------------------------------------------------------------
|
|
# Helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert internal tool definitions to the OpenAI function-calling format.

    Each tool's schema is read from ``input_schema``, else ``parameters``,
    else ``{"type": "object"}``. When ``settings.deepseek_strict_tools`` is
    truthy, every function is marked ``strict: true`` and its schema is passed
    through ``_sanitize_strict_schema`` to drop the keywords DeepSeek strict
    mode does not support (minLength/maxLength/minItems/maxItems), which per
    the original note would otherwise produce a 400.
    """
    use_strict = settings.deepseek_strict_tools
    result: list[dict[str, Any]] = []
    for spec in tools:
        schema = spec.get("input_schema", spec.get("parameters", {"type": "object"}))
        function_def: dict[str, Any] = {
            "name": spec["name"],
            "description": spec.get("description", ""),
        }
        if use_strict:
            function_def["parameters"] = OpenAIAdapter._sanitize_strict_schema(schema)
            function_def["strict"] = True
        else:
            function_def["parameters"] = schema
        result.append({"type": "function", "function": function_def})
    return result
|
|
|
|
# Keywords no soportados por DeepSeek strict mode (segun docs oficiales).
|
|
_STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems")
|
|
|
|
@staticmethod
|
|
def _sanitize_strict_schema(schema: Any) -> Any:
|
|
"""Elimina recursivamente keywords no soportados por DeepSeek strict."""
|
|
if isinstance(schema, dict):
|
|
return {
|
|
k: OpenAIAdapter._sanitize_strict_schema(v)
|
|
for k, v in schema.items()
|
|
if k not in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS
|
|
}
|
|
if isinstance(schema, list):
|
|
return [OpenAIAdapter._sanitize_strict_schema(x) for x in schema]
|
|
return schema
|
|
|
|
@staticmethod
|
|
def _blocks_text(content: Any) -> str:
|
|
"""Extrae texto plano de un content que puede ser str o lista de bloques."""
|
|
if content is None:
|
|
return ""
|
|
if isinstance(content, str):
|
|
return content
|
|
if isinstance(content, list):
|
|
parts = []
|
|
for b in content:
|
|
if isinstance(b, dict):
|
|
parts.append(b.get("text") or b.get("content") or "")
|
|
else:
|
|
parts.append(str(b))
|
|
return "\n".join(p for p in parts if p)
|
|
return str(content)
|
|
|
|
def _to_openai_messages(self, messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
"""Convierte los mensajes del formato interno (Anthropic-style, con bloques
|
|
`tool_use` / `tool_result`) al formato de la API OpenAI (`tool_calls` en el
|
|
assistant, mensajes `role: tool` con `tool_call_id`). El contexto se construye
|
|
en formato Anthropic, así que sin esto la API OpenAI de DeepSeek rechaza el
|
|
body ('unknown variant tool_use')."""
|
|
out: list[dict[str, Any]] = []
|
|
for msg in messages:
|
|
role = msg.get("role")
|
|
content = msg.get("content")
|
|
if role == "system":
|
|
out.append({"role": "system", "content": content if isinstance(content, str) else self._blocks_text(content)})
|
|
continue
|
|
if not isinstance(content, list):
|
|
out.append({"role": role, "content": content if isinstance(content, str) else str(content or "")})
|
|
continue
|
|
if role == "assistant":
|
|
text_parts: list[str] = []
|
|
tool_calls: list[dict[str, Any]] = []
|
|
reasoning_parts: list[str] = []
|
|
for b in content:
|
|
if not isinstance(b, dict):
|
|
continue
|
|
t = b.get("type")
|
|
if t == "text":
|
|
text_parts.append(b.get("text", ""))
|
|
elif t == "thinking":
|
|
# DeepSeek thinking mode: el razonamiento del turno debe
|
|
# reenviarse como `reasoning_content` (no como signature).
|
|
rc = b.get("thinking", "")
|
|
if rc:
|
|
reasoning_parts.append(rc)
|
|
elif t == "tool_use":
|
|
tool_calls.append({
|
|
"id": b.get("id", ""),
|
|
"type": "function",
|
|
"function": {
|
|
"name": b.get("name", ""),
|
|
"arguments": json.dumps(b.get("input", {}), ensure_ascii=False),
|
|
},
|
|
})
|
|
text_joined = "\n".join(p for p in text_parts if p)
|
|
m: dict[str, Any] = {"role": "assistant", "content": (text_joined or None)}
|
|
if reasoning_parts:
|
|
if not text_joined and not tool_calls:
|
|
# Quirk DeepSeek thinking: a veces emite TODA la respuesta
|
|
# en reasoning_content y cierra sin content ni tool_calls.
|
|
# Reenviar content=None sin tool_calls rompe la API
|
|
# ("content or tool_calls must be set"), asi que promovemos
|
|
# el reasoning a content (sin duplicarlo como reasoning_content).
|
|
m["content"] = "\n".join(reasoning_parts)
|
|
else:
|
|
m["reasoning_content"] = "\n".join(reasoning_parts)
|
|
if tool_calls:
|
|
m["tool_calls"] = tool_calls
|
|
out.append(m)
|
|
else: # user (puede traer tool_result blocks, texto e imágenes)
|
|
text_parts = []
|
|
image_blocks: list[dict[str, Any]] = []
|
|
for b in content:
|
|
if not isinstance(b, dict):
|
|
continue
|
|
t = b.get("type")
|
|
if t == "tool_result":
|
|
out.append({
|
|
"role": "tool",
|
|
"tool_call_id": b.get("tool_use_id", ""),
|
|
"content": self._blocks_text(b.get("content")),
|
|
})
|
|
elif t == "text":
|
|
text_parts.append(b.get("text", ""))
|
|
elif t == "image_url":
|
|
# Visión nativa: preservar el bloque en formato multimodal OpenAI.
|
|
image_blocks.append({"type": "image_url", "image_url": b.get("image_url") or {}})
|
|
if image_blocks:
|
|
# Content como lista de bloques (texto + imágenes).
|
|
parts: list[dict[str, Any]] = []
|
|
joined = "\n".join(p for p in text_parts if p)
|
|
if joined:
|
|
parts.append({"type": "text", "text": joined})
|
|
parts.extend(image_blocks)
|
|
out.append({"role": "user", "content": parts})
|
|
elif text_parts:
|
|
out.append({"role": "user", "content": "\n".join(text_parts)})
|
|
# Guard defensivo: el compactor ya garantiza el invariante tool_use ↔
|
|
# tool_result (`_enforce_tool_pairing`), pero si algo se escapa el
|
|
# proveedor devuelve 400 y la sesion queda bloqueada. Cinturon y tirantes.
|
|
return self._repair_tool_sequence(out)
|
|
|
|
@staticmethod
|
|
def _repair_tool_sequence(out: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
"""Garantiza el contrato OpenAI sobre la secuencia ya convertida:
|
|
|
|
- Todo `role: tool` debe responder a un tool_call_id del assistant
|
|
inmediatamente anterior (o de su bloque contiguo de tool messages).
|
|
Si no → se convierte a user con placeholder.
|
|
- Todo assistant con `tool_calls` debe tener respuesta para CADA id.
|
|
Los tool_calls sin respuesta se eliminan; si la lista queda vacia se
|
|
elimina la key (y se asegura `content` no-None — "content or
|
|
tool_calls must be set").
|
|
|
|
No deberia activarse nunca (el compactor repara antes); si se activa,
|
|
loguea warning para detectar regresiones del compactor.
|
|
"""
|
|
repaired: list[dict[str, Any]] = []
|
|
i = 0
|
|
n = len(out)
|
|
while i < n:
|
|
msg = out[i]
|
|
role = msg.get("role")
|
|
|
|
if role == "assistant" and msg.get("tool_calls"):
|
|
# Bloque contiguo de tool messages que responden a este assistant.
|
|
j = i + 1
|
|
block: list[dict[str, Any]] = []
|
|
while j < n and out[j].get("role") == "tool":
|
|
block.append(out[j])
|
|
j += 1
|
|
answered = {t.get("tool_call_id", "") for t in block}
|
|
kept_calls = [
|
|
tc for tc in msg["tool_calls"] if tc.get("id", "") in answered
|
|
]
|
|
dropped = [
|
|
tc for tc in msg["tool_calls"] if tc.get("id", "") not in answered
|
|
]
|
|
new_msg = dict(msg)
|
|
if dropped:
|
|
for tc in dropped:
|
|
logger.warning(
|
|
"repaired unanswered tool_call at index %d (tool_call_id=%s)",
|
|
i,
|
|
tc.get("id", ""),
|
|
)
|
|
if kept_calls:
|
|
new_msg["tool_calls"] = kept_calls
|
|
else:
|
|
new_msg.pop("tool_calls", None)
|
|
if new_msg.get("content") is None:
|
|
# Promover reasoning a content si existe (mismo
|
|
# criterio que el quirk DeepSeek de arriba); si no,
|
|
# placeholder para no enviar content=None sin tools.
|
|
rc = new_msg.pop("reasoning_content", None)
|
|
new_msg["content"] = rc or "[ASSISTANT COMPACTADO]"
|
|
repaired.append(new_msg)
|
|
valid_ids = {tc.get("id", "") for tc in kept_calls}
|
|
converted: list[dict[str, Any]] = []
|
|
for t in block:
|
|
if t.get("tool_call_id", "") in valid_ids:
|
|
repaired.append(t)
|
|
else:
|
|
logger.warning(
|
|
"repaired orphan tool message (tool_call_id=%s)",
|
|
t.get("tool_call_id", ""),
|
|
)
|
|
converted.append(
|
|
{
|
|
"role": "user",
|
|
"content": "[Resultado de herramienta (contexto compactado)]: "
|
|
+ str(t.get("content", ""))[:500],
|
|
}
|
|
)
|
|
# Los huerfanos convertidos van DESPUES del bloque de tools
|
|
# validos para no romper la contiguidad assistant → tools.
|
|
repaired.extend(converted)
|
|
i = j
|
|
continue
|
|
|
|
if role == "tool":
|
|
# Tool message sin assistant con tool_calls delante → huerfano.
|
|
logger.warning(
|
|
"repaired orphan tool message at index %d (tool_call_id=%s)",
|
|
i,
|
|
msg.get("tool_call_id", ""),
|
|
)
|
|
repaired.append(
|
|
{
|
|
"role": "user",
|
|
"content": "[Resultado de herramienta (contexto compactado)]: "
|
|
+ str(msg.get("content", ""))[:500],
|
|
}
|
|
)
|
|
i += 1
|
|
continue
|
|
|
|
repaired.append(msg)
|
|
i += 1
|
|
return repaired
|