Files
agenticSystem/src/adapters/openai_adapter.py
Jordan Diaz 43337e8554 Hardening: lock de sesion atomico, monitor off por defecto, fix DeepSeek reasoning-only
- session_lock: token uuid + compare-and-delete (Lua), TTL > timeout de
  ejecucion; abort solo limpia el lock tras cancelacion confirmada.
  Evita doble ejecucion concurrente sobre la misma sesion.
- monitor HTTP (puerto 4545) deshabilitado salvo MCP_MONITOR_ENABLED=true
  y atado a 127.0.0.1; no se acumula historial en memoria si esta off.
- DeepSeek/LiteLLM: turnos que llegan solo con reasoning_content (sin
  content ni tool_calls) ya no rompen la sesion (400 'Invalid assistant
  message') ni se pintan como 'pensando': se promueven a texto en el
  historial y en el snapshot persistido.
- litellm pinneado a ==1.80.0 (builds reproducibles).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 15:17:52 +00:00

448 lines
20 KiB
Python

"""OpenAI model adapter with full streaming support."""
from __future__ import annotations
import json
import logging
from typing import Any, AsyncIterator
from openai import AsyncOpenAI
from ..config import settings
from .base import ModelAdapter, ModelConfig, ModelResponse, StreamChunk
logger = logging.getLogger(__name__)
def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]:
    """Approximate token usage when the provider does not report it.

    Used as a fallback (e.g. LiteLLM streaming) so billing is never recorded
    as zero. The numbers are an estimate, not the provider's real counts.

    Args:
        messages: Conversation messages; ``content`` may be a string or a list
            of block dicts.
        output_text: Text generated by the model (``None``/empty counts as "").

    Returns:
        A dict with ``input_tokens`` and ``output_tokens``.
    """
    from ..context.compactor import estimate_tokens

    def _block_tokens(block: dict[str, Any]) -> int:
        # First non-empty of text / thinking, else the stringified content.
        payload = block.get("text") or block.get("thinking") or str(block.get("content") or "")
        return estimate_tokens(payload)

    prompt_tokens = 0
    for message in messages:
        body = message.get("content")
        if isinstance(body, str):
            prompt_tokens += estimate_tokens(body)
            continue
        if not isinstance(body, list):
            # Any other content shape contributes nothing to the estimate.
            continue
        for block in body:
            if isinstance(block, dict):
                prompt_tokens += _block_tokens(block)

    completion_tokens = estimate_tokens(output_text or "")
    return {"input_tokens": prompt_tokens, "output_tokens": completion_tokens}
class OpenAIAdapter(ModelAdapter):
    """Adapter for the OpenAI Chat Completions API (GPT-4o, o1, etc.).

    The same code path is used for OpenAI-compatible endpoints; several
    branches below exist specifically to cope with DeepSeek behaviour
    (``reasoning_content``, tool calls emitted as text, strict tool schemas).
    """

    def __init__(self, api_key: str | None = None, base_url: str | None = None) -> None:
        """Create the async client.

        Args:
            api_key: API key; falls back to ``settings.openai_api_key``.
            base_url: Endpoint override; falls back to
                ``settings.openai_base_url``. Omitted from the client kwargs
                when neither is set.
        """
        kwargs: dict[str, Any] = {
            "api_key": api_key or settings.openai_api_key,
        }
        url = base_url or settings.openai_base_url
        if url:
            kwargs["base_url"] = url
        self._client = AsyncOpenAI(**kwargs)
        # The native path keeps the provider's real usage; subclasses that do
        # not receive reliable usage while streaming (LiteLLM) set this to
        # True so usage is estimated instead.
        self._estimate_usage_fallback = False

    async def _acreate(self, kwargs: dict[str, Any]):
        """Perform the model call.

        Hook point: subclasses (e.g. LiteLLMAdapter) override this to route
        the request through another library without touching the rest of the
        flow (chunk processing, tools, message conversion).
        """
        return await self._client.chat.completions.create(**kwargs)

    # ------------------------------------------------------------------
    # Streaming
    # ------------------------------------------------------------------
    async def stream(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        config: ModelConfig | None = None,
    ) -> AsyncIterator[StreamChunk]:
        """Stream a completion as a sequence of ``StreamChunk`` objects.

        Args:
            messages: Conversation in the internal (Anthropic-style) format;
                converted with ``_to_openai_messages`` before sending.
            tools: Optional internal tool definitions.
            config: Model settings; defaults are taken from ``settings``.

        Yields:
            Thinking deltas, text deltas, incremental tool-call fragments, a
            synthetic thinking signature, the final ``tool_use`` / end-of-turn
            chunk(s), and a usage chunk (emitted at most once).
        """
        config = config or ModelConfig(
            model_id=settings.default_model_id,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )
        kwargs: dict[str, Any] = {
            "model": config.model_id or settings.default_model_id or "gpt-4o",
            "max_tokens": config.max_tokens,
            "temperature": config.temperature,
            "messages": self._to_openai_messages(messages),
            "stream": True,
            "stream_options": {"include_usage": True},
        }
        if tools:
            kwargs["tools"] = self._format_tools(tools)
        response_stream = await self._acreate(kwargs)

        # Tool-calls-as-text fallback: DeepSeek sometimes emits tool calls in
        # its internal DSML format as TEXT (inside content) instead of native
        # tool_calls. The OpenAI endpoint does not convert them, so without
        # this the agent "stops" showing inert DSML. The parser from
        # claude_adapter is reused.
        from .claude_adapter import _parse_xml_tool_calls, _TOOL_CALL_OPEN_RE

        tool_calls_acc: dict[int, dict[str, str]] = {}
        final_usage: dict[str, int] = {}
        usage_emitted = False  # avoids double counting if usage arrives after an estimate
        full_content = ""  # accumulated content (for the DSML fallback)
        full_reasoning = ""  # accumulated reasoning (for usage estimation)
        emitted_chars = 0  # how much of full_content was already emitted as delta
        suppress_text = False  # once a text tool call is detected, emit no more text

        # DeepSeek thinking mode: reasoning arrives in `delta.reasoning_content`
        # (before the content). It is accumulated as a `thinking` block
        # (block_index 0) so the orchestrator persists it and
        # `_to_openai_messages` sends it back as `reasoning_content` on the
        # next turn -- DeepSeek requires that in multi-turn tool-call flows.
        reasoning_seen = False
        reasoning_sig_emitted = False

        async for chunk in response_stream:
            # With include_usage, the last chunk has usage but no choices.
            # getattr: the LiteLLM chunk (ModelResponseStream) does not always
            # carry a `usage` attribute; the OpenAI SDK one does (None except
            # on the last chunk).
            chunk_usage = getattr(chunk, "usage", None)
            if chunk_usage:
                final_usage = {
                    "input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0,
                    "output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0,
                }

            choice = chunk.choices[0] if chunk.choices else None
            if not choice:
                # Usage-only chunk (last one with include_usage) -- emit it.
                if final_usage and not usage_emitted:
                    yield StreamChunk(usage=final_usage)
                    usage_emitted = True
                continue

            delta = choice.delta

            # Reasoning content (DeepSeek thinking mode). It arrives as an
            # extra delta field and is emitted as thinking_delta on block 0.
            reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None
            if reasoning_txt:
                reasoning_seen = True
                full_reasoning += reasoning_txt
                yield StreamChunk(
                    thinking_delta=reasoning_txt,
                    block_type="thinking",
                    block_index=0,
                )

            # Text content
            if delta and delta.content:
                full_content += delta.content
                if not suppress_text:
                    # If a text tool call (DSML/XML) starts, emit what came
                    # before it and stop emitting (the DSML must not be shown).
                    # NOTE(review): the search starts at emitted_chars, so an
                    # opening tag split across two chunks may already have been
                    # partially emitted -- confirm against the regex.
                    open_match = _TOOL_CALL_OPEN_RE.search(full_content, emitted_chars)
                    if open_match:
                        suppress_text = True
                        if open_match.start() > emitted_chars:
                            yield StreamChunk(
                                delta=full_content[emitted_chars:open_match.start()]
                            )
                        emitted_chars = len(full_content)
                    else:
                        yield StreamChunk(delta=full_content[emitted_chars:])
                        emitted_chars = len(full_content)

            # Tool calls (native): fragments are accumulated per index.
            if delta and delta.tool_calls:
                for tc in delta.tool_calls:
                    idx = tc.index
                    if idx not in tool_calls_acc:
                        tool_calls_acc[idx] = {
                            "id": tc.id or "",
                            "name": "",
                            "arguments": "",
                        }
                    if tc.id:
                        tool_calls_acc[idx]["id"] = tc.id
                    if tc.function and tc.function.name:
                        tool_calls_acc[idx]["name"] = tc.function.name
                        yield StreamChunk(
                            tool_call_id=tc.id or tool_calls_acc[idx]["id"],
                            tool_name=tc.function.name,
                        )
                    if tc.function and tc.function.arguments:
                        tool_calls_acc[idx]["arguments"] += tc.function.arguments
                        yield StreamChunk(
                            tool_call_id=tool_calls_acc[idx]["id"],
                            tool_name=tool_calls_acc[idx]["name"],
                            tool_arguments=tc.function.arguments,
                        )

            # Finish
            if choice.finish_reason:
                # Close the reasoning block (if any) with a synthetic
                # signature: the orchestrator drops thinking blocks without a
                # signature (a protection for MiniMax/Anthropic). DeepSeek
                # does not use signatures; this marker only prevents the drop
                # and is NEVER sent back -- `_to_openai_messages` maps the
                # block to `reasoning_content`.
                if reasoning_seen and not reasoning_sig_emitted:
                    reasoning_sig_emitted = True
                    yield StreamChunk(
                        thinking_signature="deepseek-reasoning",
                        block_type="thinking",
                        block_index=0,
                    )

                # Usage fallback: some providers behind LiteLLM never deliver
                # the usage chunk (or it arrives after the orchestrator has
                # stopped reading), which would bill 0. Estimate by tokens to
                # avoid under-charging. Only when the adapter asks for it
                # (LiteLLM); the native path keeps the provider's real usage.
                if self._estimate_usage_fallback and not final_usage and not usage_emitted:
                    final_usage = _estimate_usage(
                        messages, full_content + "\n" + full_reasoning
                    )

                # IMPORTANT: DeepSeek (OpenAI endpoint) sometimes closes the
                # stream with finish_reason="stop" EVEN THOUGH it emitted
                # tool_calls. Relying only on =="tool_calls" would lose them:
                # the agent announces the action in text and "stops" without
                # running it. So tool_use is fired WHENEVER tool calls were
                # accumulated, whatever the finish_reason.
                if tool_calls_acc:
                    for acc in tool_calls_acc.values():
                        yield StreamChunk(
                            tool_call_id=acc["id"],
                            tool_name=acc["name"],
                            tool_arguments=acc["arguments"],
                            finish_reason="tool_use",
                        )
                    # Emit usage after the tool_use chunks.
                    if final_usage and not usage_emitted:
                        yield StreamChunk(usage=final_usage)
                        usage_emitted = True
                else:
                    # Fallback: DeepSeek may have emitted the tool calls as
                    # TEXT (DSML/XML) instead of natively. Parse the content
                    # and, if it holds tool calls, run them anyway; otherwise
                    # close the turn.
                    text_calls = _parse_xml_tool_calls(full_content) if full_content else []
                    if text_calls:
                        for call in text_calls:
                            yield StreamChunk(
                                tool_call_id=call["id"],
                                tool_name=call["name"],
                                tool_arguments=json.dumps(
                                    call.get("arguments", {}), ensure_ascii=False
                                ),
                                finish_reason="tool_use",
                            )
                        if final_usage and not usage_emitted:
                            yield StreamChunk(usage=final_usage)
                            usage_emitted = True
                    else:
                        yield StreamChunk(
                            finish_reason="end_turn"
                            if choice.finish_reason in ("stop", "tool_calls")
                            else choice.finish_reason,
                            usage=final_usage if not usage_emitted else {},
                        )
                        # Mark usage as emitted only if there was usage to
                        # emit. With include_usage the usage chunk arrives
                        # AFTER this finish chunk; flagging it unconditionally
                        # would make the usage-only branch above drop the
                        # provider's real usage.
                        if final_usage:
                            usage_emitted = True

    # ------------------------------------------------------------------
    # Non-streaming
    # ------------------------------------------------------------------
    async def complete(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        config: ModelConfig | None = None,
    ) -> ModelResponse:
        """Run a single non-streaming completion.

        Args:
            messages: Conversation in the internal (Anthropic-style) format.
            tools: Optional internal tool definitions.
            config: Model settings; defaults are taken from ``settings``.
                ``config.extra["force_tool"]`` forces a specific tool.

        Returns:
            A ``ModelResponse`` with the text content, parsed tool calls,
            finish reason, usage (zeros when the provider reports none) and
            the raw response.

        Raises:
            json.JSONDecodeError: if a tool call's arguments are not valid JSON.
        """
        config = config or ModelConfig(
            model_id=settings.default_model_id,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )
        kwargs: dict[str, Any] = {
            "model": config.model_id or settings.default_model_id or "gpt-4o",
            "max_tokens": config.max_tokens,
            "temperature": config.temperature,
            "messages": self._to_openai_messages(messages),
        }
        if tools:
            kwargs["tools"] = self._format_tools(tools)

        # Force the model to use a specific tool to guarantee schema-shaped
        # JSON (used by /completions with json_schema). See ClaudeAdapter for
        # the equivalent variant.
        force_tool = (config.extra or {}).get("force_tool")
        if force_tool:
            kwargs["tool_choice"] = {
                "type": "function",
                "function": {"name": force_tool},
            }

        response = await self._acreate(kwargs)
        choice = response.choices[0]
        content = choice.message.content or ""

        tool_calls: list[dict[str, Any]] = []
        if choice.message.tool_calls:
            for tc in choice.message.tool_calls:
                tool_calls.append(
                    {
                        "id": tc.id,
                        "name": tc.function.name,
                        "arguments": json.loads(tc.function.arguments)
                        if tc.function.arguments
                        else {},
                    }
                )

        return ModelResponse(
            content=content,
            tool_calls=tool_calls,
            finish_reason=choice.finish_reason or "",
            usage={
                "input_tokens": response.usage.prompt_tokens if response.usage else 0,
                "output_tokens": response.usage.completion_tokens if response.usage else 0,
            },
            raw=response,
        )

    # ------------------------------------------------------------------
    # Token counting
    # ------------------------------------------------------------------
    async def count_tokens(self, text: str) -> int:
        """Return the token estimate for ``text`` from the context compactor."""
        from ..context.compactor import estimate_tokens

        return estimate_tokens(text)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Convert internal tool definitions to OpenAI function-calling format.

        The parameter schema is read from ``input_schema``, then
        ``parameters``, defaulting to ``{"type": "object"}``. When
        ``settings.deepseek_strict_tools`` is set, every function is marked
        ``strict: true`` and its schema is stripped of the keywords DeepSeek
        strict mode does NOT support (minLength/maxLength/minItems/maxItems),
        which would otherwise produce a 400.
        """
        strict = settings.deepseek_strict_tools
        formatted: list[dict[str, Any]] = []
        for tool in tools:
            params = tool.get("input_schema", tool.get("parameters", {"type": "object"}))
            fn: dict[str, Any] = {
                "name": tool["name"],
                "description": tool.get("description", ""),
                "parameters": OpenAIAdapter._sanitize_strict_schema(params) if strict else params,
            }
            if strict:
                fn["strict"] = True
            formatted.append({"type": "function", "function": fn})
        return formatted

    # Keywords not supported by DeepSeek strict mode (per the official docs).
    _STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems")
    # Schema keywords whose value maps caller-chosen NAMES to sub-schemas. The
    # names are data, not keywords, so they must never be filtered.
    _SCHEMA_NAME_MAPS = ("properties", "patternProperties", "$defs", "definitions")

    @staticmethod
    def _sanitize_strict_schema(schema: Any) -> Any:
        """Recursively drop keywords not supported by DeepSeek strict mode.

        Keys inside name maps (``properties`` and similar) are property or
        definition names, so they are kept even when they collide with an
        unsupported keyword (e.g. a tool parameter called ``maxLength``); only
        their sub-schemas are sanitized. The input is not mutated.
        """
        if isinstance(schema, dict):
            cleaned: dict[Any, Any] = {}
            for key, value in schema.items():
                if key in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS:
                    continue
                if key in OpenAIAdapter._SCHEMA_NAME_MAPS and isinstance(value, dict):
                    cleaned[key] = {
                        name: OpenAIAdapter._sanitize_strict_schema(sub)
                        for name, sub in value.items()
                    }
                else:
                    cleaned[key] = OpenAIAdapter._sanitize_strict_schema(value)
            return cleaned
        if isinstance(schema, list):
            return [OpenAIAdapter._sanitize_strict_schema(item) for item in schema]
        return schema

    @staticmethod
    def _blocks_text(content: Any) -> str:
        """Extract plain text from content that is a str or a list of blocks.

        For dict blocks the first non-empty of ``text`` / ``content`` is used;
        a non-string value there (e.g. a nested block list) is flattened
        recursively instead of breaking the join. Non-dict items are
        stringified. Empty parts are skipped and the rest joined with
        newlines. ``None`` yields "".
        """
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts: list[str] = []
            for block in content:
                if isinstance(block, dict):
                    value = block.get("text") or block.get("content") or ""
                    if not isinstance(value, str):
                        value = OpenAIAdapter._blocks_text(value)
                    parts.append(value)
                else:
                    parts.append(str(block))
            return "\n".join(p for p in parts if p)
        return str(content)

    def _to_openai_messages(self, messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Convert internal messages to the OpenAI API message format.

        The internal format is Anthropic-style (``tool_use`` / ``tool_result``
        blocks); OpenAI expects ``tool_calls`` on the assistant message and
        ``role: tool`` messages carrying ``tool_call_id``. The context is
        built in Anthropic format, so without this conversion DeepSeek's
        OpenAI API rejects the body ('unknown variant tool_use').

        Args:
            messages: Messages whose ``content`` is a string or a block list.

        Returns:
            A new list of OpenAI-format messages. A user message holding
            ``tool_result`` blocks expands into one ``tool`` message per
            result, followed by a ``user`` message if it also had text.
        """
        out: list[dict[str, Any]] = []
        for msg in messages:
            role = msg.get("role")
            content = msg.get("content")

            if role == "system":
                out.append(
                    {
                        "role": "system",
                        "content": content
                        if isinstance(content, str)
                        else self._blocks_text(content),
                    }
                )
                continue

            if not isinstance(content, list):
                out.append(
                    {
                        "role": role,
                        "content": content if isinstance(content, str) else str(content or ""),
                    }
                )
                continue

            if role == "assistant":
                text_parts: list[str] = []
                tool_calls: list[dict[str, Any]] = []
                reasoning_parts: list[str] = []
                for block in content:
                    if not isinstance(block, dict):
                        continue
                    block_type = block.get("type")
                    if block_type == "text":
                        # `or ""`: the key may be present with a None value.
                        text_parts.append(block.get("text") or "")
                    elif block_type == "thinking":
                        # DeepSeek thinking mode: the turn's reasoning must be
                        # sent back as `reasoning_content` (not as signature).
                        reasoning = block.get("thinking", "")
                        if reasoning:
                            reasoning_parts.append(reasoning)
                    elif block_type == "tool_use":
                        tool_calls.append(
                            {
                                "id": block.get("id", ""),
                                "type": "function",
                                "function": {
                                    "name": block.get("name", ""),
                                    "arguments": json.dumps(
                                        block.get("input", {}), ensure_ascii=False
                                    ),
                                },
                            }
                        )

                text_joined = "\n".join(p for p in text_parts if p)
                message: dict[str, Any] = {
                    "role": "assistant",
                    "content": (text_joined or None),
                }
                if reasoning_parts:
                    if not text_joined and not tool_calls:
                        # DeepSeek thinking quirk: it sometimes puts the WHOLE
                        # answer in reasoning_content and closes with neither
                        # content nor tool_calls. Sending content=None without
                        # tool_calls breaks the API ("content or tool_calls
                        # must be set"), so the reasoning is promoted to
                        # content (and not duplicated as reasoning_content).
                        message["content"] = "\n".join(reasoning_parts)
                    else:
                        message["reasoning_content"] = "\n".join(reasoning_parts)
                if tool_calls:
                    message["tool_calls"] = tool_calls
                out.append(message)
            else:  # user (may carry tool_result blocks)
                text_parts = []
                for block in content:
                    if not isinstance(block, dict):
                        continue
                    block_type = block.get("type")
                    if block_type == "tool_result":
                        out.append(
                            {
                                "role": "tool",
                                "tool_call_id": block.get("tool_use_id", ""),
                                "content": self._blocks_text(block.get("content")),
                            }
                        )
                    elif block_type == "text":
                        # `or ""`: a None text would break the join below.
                        text_parts.append(block.get("text") or "")
                if text_parts:
                    out.append({"role": "user", "content": "\n".join(text_parts)})
        return out