"""OpenAI model adapter with full streaming support.""" from __future__ import annotations import json import logging from typing import Any, AsyncIterator from openai import AsyncOpenAI from ..config import settings from .base import ModelAdapter, ModelConfig, ModelResponse, StreamChunk logger = logging.getLogger(__name__) def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]: """Estimacion de tokens cuando el proveedor no entrega usage (p.ej. LiteLLM streaming). Aproximada pero evita billing 0.""" from ..context.compactor import estimate_tokens inp = 0 for m in messages: c = m.get("content") if isinstance(c, str): inp += estimate_tokens(c) elif isinstance(c, list): for b in c: if isinstance(b, dict): inp += estimate_tokens( b.get("text") or b.get("thinking") or str(b.get("content") or "") ) return {"input_tokens": inp, "output_tokens": estimate_tokens(output_text or "")} class OpenAIAdapter(ModelAdapter): """Adapter for the OpenAI API (GPT-4o, o1, etc.).""" def __init__(self, api_key: str | None = None, base_url: str | None = None) -> None: kwargs: dict[str, Any] = { "api_key": api_key or settings.openai_api_key, } url = base_url or settings.openai_base_url if url: kwargs["base_url"] = url self._client = AsyncOpenAI(**kwargs) # El path nativo conserva el usage real del proveedor; subclases que no # reciben usage fiable en streaming (LiteLLM) lo ponen a True para estimar. self._estimate_usage_fallback = False async def _acreate(self, kwargs: dict[str, Any]): """Hook de la llamada al modelo. Subclases (p.ej. LiteLLMAdapter) lo sobreescriben para enrutar por otra librería sin tocar el resto del flujo (procesado de chunks, tools, mensajes).""" return await self._client.chat.completions.create(**kwargs) # ------------------------------------------------------------------ # Streaming # ------------------------------------------------------------------ async def stream( self, messages: list[dict[str, Any]], tools: list[dict[str, Any]] | None = None, config: ModelConfig | None = None, ) -> AsyncIterator[StreamChunk]: config = config or ModelConfig( model_id=settings.default_model_id, max_tokens=settings.max_tokens, temperature=settings.temperature, ) kwargs: dict[str, Any] = { "model": config.model_id or settings.default_model_id or "gpt-4o", "max_tokens": config.max_tokens, "temperature": config.temperature, "messages": self._to_openai_messages(messages), "stream": True, "stream_options": {"include_usage": True}, } if tools: kwargs["tools"] = self._format_tools(tools) stream = await self._acreate(kwargs) # Fallback de tool-calls-en-texto: DeepSeek a veces emite las tool calls # en su formato interno DSML como TEXTO (en el content) en vez de como # tool_calls nativos. El endpoint OpenAI no lo convierte, asi que sin # esto el agente "se para" mostrando DSML inerte. Reutilizamos el parser # del claude_adapter. from .claude_adapter import _parse_xml_tool_calls, _TOOL_CALL_OPEN_RE tool_calls_acc: dict[int, dict[str, str]] = {} final_usage: dict[str, int] = {} usage_emitted = False # evita doble conteo si llega usage tras estimar full_content = "" # content acumulado (para el fallback DSML) full_reasoning = "" # razonamiento acumulado (para estimar usage) emitted_chars = 0 # cuanto de full_content ya se emitio como delta suppress_text = False # tras detectar un tool-call-en-texto, no emitir mas # DeepSeek thinking mode: el razonamiento llega en `delta.reasoning_content` # (antes del content). Lo acumulamos como un bloque `thinking` (block_index 0) # para que el orquestador lo persista y `_to_openai_messages` lo reenvie como # `reasoning_content` en el siguiente turno — DeepSeek lo exige en multi-turno # con tool calls ("reasoning_content ... must be passed back to the API"). reasoning_seen = False reasoning_sig_emitted = False async for chunk in stream: # With include_usage, the last chunk has usage but no choices. # getattr: el chunk de LiteLLM (ModelResponseStream) no siempre trae # el atributo `usage`; el del SDK OpenAI sí (None salvo el ultimo). chunk_usage = getattr(chunk, "usage", None) if chunk_usage: final_usage = { "input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0, "output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0, } choice = chunk.choices[0] if chunk.choices else None if not choice: # Usage-only chunk (last one with include_usage) — emit it if final_usage and not usage_emitted: yield StreamChunk(usage=final_usage) usage_emitted = True continue delta = choice.delta # Reasoning content (DeepSeek thinking mode). Llega como campo extra # del delta; lo emitimos como thinking_delta en el bloque index 0. reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None if reasoning_txt: reasoning_seen = True full_reasoning += reasoning_txt yield StreamChunk( thinking_delta=reasoning_txt, block_type="thinking", block_index=0, ) # Text content if delta and delta.content: full_content += delta.content if not suppress_text: # Si arranca un tool call en texto (DSML/XML), emitimos lo # previo y dejamos de emitir el resto (el DSML no debe verse). m = _TOOL_CALL_OPEN_RE.search(full_content, emitted_chars) if m: suppress_text = True if m.start() > emitted_chars: yield StreamChunk(delta=full_content[emitted_chars:m.start()]) emitted_chars = len(full_content) else: yield StreamChunk(delta=full_content[emitted_chars:]) emitted_chars = len(full_content) # Tool calls if delta and delta.tool_calls: for tc in delta.tool_calls: idx = tc.index if idx not in tool_calls_acc: tool_calls_acc[idx] = { "id": tc.id or "", "name": "", "arguments": "", } if tc.id: tool_calls_acc[idx]["id"] = tc.id if tc.function and tc.function.name: tool_calls_acc[idx]["name"] = tc.function.name yield StreamChunk( tool_call_id=tc.id or tool_calls_acc[idx]["id"], tool_name=tc.function.name, ) if tc.function and tc.function.arguments: tool_calls_acc[idx]["arguments"] += tc.function.arguments yield StreamChunk( tool_call_id=tool_calls_acc[idx]["id"], tool_name=tool_calls_acc[idx]["name"], tool_arguments=tc.function.arguments, ) # Finish if choice.finish_reason: # Cerrar el bloque de razonamiento (si lo hubo) con un signature # sintetico: el orquestador descarta thinking blocks sin signature # (proteccion para MiniMax/Anthropic). DeepSeek no usa signatures; # este marcador solo evita el descarte y NUNCA se reenvia — en # `_to_openai_messages` el bloque se mapea a `reasoning_content`. if reasoning_seen and not reasoning_sig_emitted: reasoning_sig_emitted = True yield StreamChunk( thinking_signature="deepseek-reasoning", block_type="thinking", block_index=0, ) # Fallback de usage: algunos proveedores via LiteLLM no entregan el # chunk de usage (o llega tras el break del orquestador) → billing 0. # Estimamos por tokens para no infra-cobrar. Solo si el adapter lo # pide (LiteLLM); el path nativo conserva el usage real del proveedor. if self._estimate_usage_fallback and not final_usage and not usage_emitted: final_usage = _estimate_usage(messages, full_content + "\n" + full_reasoning) # IMPORTANTE: DeepSeek (endpoint OpenAI) a veces cierra el stream # con finish_reason="stop" AUNQUE haya emitido tool_calls. Si nos # fiamos solo de =="tool_calls" perdemos esos tool calls: el agente # anuncia la accion en texto y "se para" sin ejecutarla. Por eso # disparamos los tool_use SIEMPRE que haya tool calls acumulados, # sea cual sea el finish_reason. if tool_calls_acc: for acc in tool_calls_acc.values(): yield StreamChunk( tool_call_id=acc["id"], tool_name=acc["name"], tool_arguments=acc["arguments"], finish_reason="tool_use", ) # Emit usage after tool_use chunks if final_usage and not usage_emitted: yield StreamChunk(usage=final_usage) usage_emitted = True else: # Fallback: DeepSeek pudo emitir las tool calls como TEXTO # (DSML/XML) en vez de nativas. Parseamos el content y, si hay # tool calls, las ejecutamos igual; si no, cerramos el turno. text_calls = _parse_xml_tool_calls(full_content) if full_content else [] if text_calls: for c in text_calls: yield StreamChunk( tool_call_id=c["id"], tool_name=c["name"], tool_arguments=json.dumps(c.get("arguments", {}), ensure_ascii=False), finish_reason="tool_use", ) if final_usage and not usage_emitted: yield StreamChunk(usage=final_usage) usage_emitted = True else: yield StreamChunk( finish_reason="end_turn" if choice.finish_reason in ("stop", "tool_calls") else choice.finish_reason, usage=final_usage if not usage_emitted else {}, ) usage_emitted = True # ------------------------------------------------------------------ # Non-streaming # ------------------------------------------------------------------ async def complete( self, messages: list[dict[str, Any]], tools: list[dict[str, Any]] | None = None, config: ModelConfig | None = None, ) -> ModelResponse: config = config or ModelConfig( model_id=settings.default_model_id, max_tokens=settings.max_tokens, temperature=settings.temperature, ) kwargs: dict[str, Any] = { "model": config.model_id or settings.default_model_id or "gpt-4o", "max_tokens": config.max_tokens, "temperature": config.temperature, "messages": self._to_openai_messages(messages), } if tools: kwargs["tools"] = self._format_tools(tools) # Fuerza al modelo a usar un tool concreto para garantizar JSON por schema # (usado por /completions con json_schema). Ver ClaudeAdapter para la variante. force_tool = (config.extra or {}).get("force_tool") if force_tool: kwargs["tool_choice"] = { "type": "function", "function": {"name": force_tool}, } response = await self._acreate(kwargs) choice = response.choices[0] content = choice.message.content or "" tool_calls: list[dict[str, Any]] = [] if choice.message.tool_calls: for tc in choice.message.tool_calls: tool_calls.append( { "id": tc.id, "name": tc.function.name, "arguments": json.loads(tc.function.arguments) if tc.function.arguments else {}, } ) return ModelResponse( content=content, tool_calls=tool_calls, finish_reason=choice.finish_reason or "", usage={ "input_tokens": response.usage.prompt_tokens if response.usage else 0, "output_tokens": response.usage.completion_tokens if response.usage else 0, }, raw=response, ) # ------------------------------------------------------------------ # Token counting # ------------------------------------------------------------------ async def count_tokens(self, text: str) -> int: from ..context.compactor import estimate_tokens return estimate_tokens(text) # ------------------------------------------------------------------ # Helpers # ------------------------------------------------------------------ @staticmethod def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]: """Convert internal tool definitions to OpenAI function calling format. Si `deepseek_strict_tools`, marca cada funcion con `strict: true` y limpia del schema los keywords que DeepSeek strict NO soporta (minLength/maxLength/ minItems/maxItems), que de otro modo darian 400.""" strict = settings.deepseek_strict_tools formatted: list[dict[str, Any]] = [] for tool in tools: params = tool.get("input_schema", tool.get("parameters", {"type": "object"})) fn: dict[str, Any] = { "name": tool["name"], "description": tool.get("description", ""), "parameters": OpenAIAdapter._sanitize_strict_schema(params) if strict else params, } if strict: fn["strict"] = True formatted.append({"type": "function", "function": fn}) return formatted # Keywords no soportados por DeepSeek strict mode (segun docs oficiales). _STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems") @staticmethod def _sanitize_strict_schema(schema: Any) -> Any: """Elimina recursivamente keywords no soportados por DeepSeek strict.""" if isinstance(schema, dict): return { k: OpenAIAdapter._sanitize_strict_schema(v) for k, v in schema.items() if k not in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS } if isinstance(schema, list): return [OpenAIAdapter._sanitize_strict_schema(x) for x in schema] return schema @staticmethod def _blocks_text(content: Any) -> str: """Extrae texto plano de un content que puede ser str o lista de bloques.""" if content is None: return "" if isinstance(content, str): return content if isinstance(content, list): parts = [] for b in content: if isinstance(b, dict): parts.append(b.get("text") or b.get("content") or "") else: parts.append(str(b)) return "\n".join(p for p in parts if p) return str(content) def _to_openai_messages(self, messages: list[dict[str, Any]]) -> list[dict[str, Any]]: """Convierte los mensajes del formato interno (Anthropic-style, con bloques `tool_use` / `tool_result`) al formato de la API OpenAI (`tool_calls` en el assistant, mensajes `role: tool` con `tool_call_id`). El contexto se construye en formato Anthropic, así que sin esto la API OpenAI de DeepSeek rechaza el body ('unknown variant tool_use').""" out: list[dict[str, Any]] = [] for msg in messages: role = msg.get("role") content = msg.get("content") if role == "system": out.append({"role": "system", "content": content if isinstance(content, str) else self._blocks_text(content)}) continue if not isinstance(content, list): out.append({"role": role, "content": content if isinstance(content, str) else str(content or "")}) continue if role == "assistant": text_parts: list[str] = [] tool_calls: list[dict[str, Any]] = [] reasoning_parts: list[str] = [] for b in content: if not isinstance(b, dict): continue t = b.get("type") if t == "text": text_parts.append(b.get("text", "")) elif t == "thinking": # DeepSeek thinking mode: el razonamiento del turno debe # reenviarse como `reasoning_content` (no como signature). rc = b.get("thinking", "") if rc: reasoning_parts.append(rc) elif t == "tool_use": tool_calls.append({ "id": b.get("id", ""), "type": "function", "function": { "name": b.get("name", ""), "arguments": json.dumps(b.get("input", {}), ensure_ascii=False), }, }) text_joined = "\n".join(p for p in text_parts if p) m: dict[str, Any] = {"role": "assistant", "content": (text_joined or None)} if reasoning_parts: if not text_joined and not tool_calls: # Quirk DeepSeek thinking: a veces emite TODA la respuesta # en reasoning_content y cierra sin content ni tool_calls. # Reenviar content=None sin tool_calls rompe la API # ("content or tool_calls must be set"), asi que promovemos # el reasoning a content (sin duplicarlo como reasoning_content). m["content"] = "\n".join(reasoning_parts) else: m["reasoning_content"] = "\n".join(reasoning_parts) if tool_calls: m["tool_calls"] = tool_calls out.append(m) else: # user (puede traer tool_result blocks) text_parts = [] for b in content: if not isinstance(b, dict): continue t = b.get("type") if t == "tool_result": out.append({ "role": "tool", "tool_call_id": b.get("tool_use_id", ""), "content": self._blocks_text(b.get("content")), }) elif t == "text": text_parts.append(b.get("text", "")) if text_parts: out.append({"role": "user", "content": "\n".join(text_parts)}) return out