# agenticSystem/src/adapters/openai_adapter.py

"""OpenAI model adapter with full streaming support."""

from __future__ import annotations

import json
import logging
from typing import Any, AsyncIterator

from openai import AsyncOpenAI

from ..config import settings
from .base import (
    ContextOverflowError,
    ModelAdapter,
    ModelConfig,
    ModelResponse,
    StreamChunk,
)

logger = logging.getLogger(__name__)

# Signals that the provider rejected the request because of the context window.
# Detection is done by exception type name (litellm.ContextWindowExceededError)
# and by message text (openai.BadRequestError or other 400s), so the adapter is
# not coupled to litellm through a hard import. Markers are lowercase because
# they are matched against a lowercased error message.
_CONTEXT_OVERFLOW_MARKERS = (
    "context_length_exceeded",
    "maximum context length",
    "context window",
    "context length",
    "too many tokens",
    "reduce the length",
    "prompt is too long",
)


def _is_context_overflow(exc: Exception) -> bool:
    """Tell whether `exc` looks like a provider context-window rejection.

    The exception matches when its class is named `ContextWindowExceededError`
    or when its message (the `message` attribute if truthy, otherwise
    `str(exc)`) contains one of `_CONTEXT_OVERFLOW_MARKERS`, case-insensitively.
    """
    if type(exc).__name__ == "ContextWindowExceededError":
        return True
    text = str(getattr(exc, "message", "") or exc).lower()
    for marker in _CONTEXT_OVERFLOW_MARKERS:
        if marker in text:
            return True
    return False


def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]:
    """Estimate token usage when the provider does not report it.

    Used for cases such as LiteLLM streaming; the figure is approximate but
    avoids recording zero usage.

    Args:
        messages: Conversation messages. `content` may be a string or a list of
            blocks; for dict blocks the first truthy of `text`, `thinking` or
            the stringified `content` is counted. Other content shapes are
            ignored.
        output_text: Generated text to count as output (``None``/empty counts
            as an empty string).

    Returns:
        A dict with `input_tokens` and `output_tokens`.
    """
    from ..context.compactor import estimate_tokens

    def _text_of(block: dict[str, Any]) -> str:
        return block.get("text") or block.get("thinking") or str(block.get("content") or "")

    prompt_tokens = 0
    for message in messages:
        body = message.get("content")
        if isinstance(body, str):
            prompt_tokens += estimate_tokens(body)
            continue
        if not isinstance(body, list):
            continue
        for block in body:
            if isinstance(block, dict):
                prompt_tokens += estimate_tokens(_text_of(block))

    completion_tokens = estimate_tokens(output_text or "")
    return {"input_tokens": prompt_tokens, "output_tokens": completion_tokens}


class OpenAIAdapter(ModelAdapter):
    """Adapter for the OpenAI API (GPT-4o, o1, etc.)."""

    def __init__(self, api_key: str | None = None, base_url: str | None = None) -> None:
        kwargs: dict[str, Any] = {
            "api_key": api_key or settings.openai_api_key,
        }
        url = base_url or settings.openai_base_url
        if url:
            kwargs["base_url"] = url
        self._client = AsyncOpenAI(**kwargs)
        # El path nativo conserva el usage real del proveedor; subclases que no
        # reciben usage fiable en streaming (LiteLLM) lo ponen a True para estimar.
        self._estimate_usage_fallback = False

    async def _acreate(self, kwargs: dict[str, Any]):
        """Hook de la llamada al modelo. Subclases (p.ej. LiteLLMAdapter) lo
        sobreescriben para enrutar por otra librería sin tocar el resto del
        flujo (procesado de chunks, tools, mensajes)."""
        return await self._client.chat.completions.create(**kwargs)

    # ------------------------------------------------------------------
    # Streaming
    # ------------------------------------------------------------------

    async def stream(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        config: ModelConfig | None = None,
    ) -> AsyncIterator[StreamChunk]:
        """Envoltorio que traduce errores de ventana de contexto del proveedor a
        `ContextOverflowError` (dominio), tanto si saltan al iniciar el stream
        como durante la primera iteración. El loop del agente lo usa para
        reintentar con compactación agresiva si aún no emitió nada."""
        try:
            async for chunk in self._stream_impl(messages, tools, config):
                yield chunk
        except ContextOverflowError:
            raise
        except Exception as e:
            if _is_context_overflow(e):
                raise ContextOverflowError(str(getattr(e, "message", "") or e)) from e
            raise

    async def _stream_impl(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        config: ModelConfig | None = None,
    ) -> AsyncIterator[StreamChunk]:
        config = config or ModelConfig(
            model_id=settings.default_model_id,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )

        kwargs: dict[str, Any] = {
            "model": config.model_id or settings.default_model_id or "gpt-4o",
            "max_tokens": config.max_tokens,
            "temperature": config.temperature,
            "messages": self._to_openai_messages(messages),
            "stream": True,
            "stream_options": {"include_usage": True},
        }
        if getattr(config, "reasoning_effort", ""):
            kwargs["reasoning_effort"] = config.reasoning_effort
        if tools:
            kwargs["tools"] = self._format_tools(tools)

        stream = await self._acreate(kwargs)

        # Fallback de tool-calls-en-texto: DeepSeek a veces emite las tool calls
        # en su formato interno DSML como TEXTO (en el content) en vez de como
        # tool_calls nativos. El endpoint OpenAI no lo convierte, asi que sin
        # esto el agente "se para" mostrando DSML inerte. Reutilizamos el parser
        # del claude_adapter.
        from .claude_adapter import _parse_xml_tool_calls, _TOOL_CALL_OPEN_RE

        tool_calls_acc: dict[int, dict[str, str]] = {}

        final_usage: dict[str, int] = {}
        usage_emitted = False   # evita doble conteo si llega usage tras estimar
        full_content = ""       # content acumulado (para el fallback DSML)
        full_reasoning = ""     # razonamiento acumulado (para estimar usage)
        emitted_chars = 0       # cuanto de full_content ya se emitio como delta
        suppress_text = False   # tras detectar un tool-call-en-texto, no emitir mas

        # DeepSeek thinking mode: el razonamiento llega en `delta.reasoning_content`
        # (antes del content). Lo acumulamos como un bloque `thinking` (block_index 0)
        # para que el orquestador lo persista y `_to_openai_messages` lo reenvie como
        # `reasoning_content` en el siguiente turno — DeepSeek lo exige en multi-turno
        # con tool calls ("reasoning_content ... must be passed back to the API").
        reasoning_seen = False
        reasoning_sig_emitted = False

        async for chunk in stream:
            # With include_usage, the last chunk has usage but no choices.
            # getattr: el chunk de LiteLLM (ModelResponseStream) no siempre trae
            # el atributo `usage`; el del SDK OpenAI sí (None salvo el ultimo).
            chunk_usage = getattr(chunk, "usage", None)
            if chunk_usage:
                final_usage = {
                    "input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0,
                    "output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0,
                }

            choice = chunk.choices[0] if chunk.choices else None
            if not choice:
                # Usage-only chunk (last one with include_usage) — emit it
                if final_usage and not usage_emitted:
                    yield StreamChunk(usage=final_usage)
                    usage_emitted = True
                continue

            delta = choice.delta

            # Reasoning content (DeepSeek thinking mode). Llega como campo extra
            # del delta; lo emitimos como thinking_delta en el bloque index 0.
            reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None
            if reasoning_txt:
                reasoning_seen = True
                full_reasoning += reasoning_txt
                yield StreamChunk(
                    thinking_delta=reasoning_txt,
                    block_type="thinking",
                    block_index=0,
                )

            # Text content
            if delta and delta.content:
                full_content += delta.content
                if not suppress_text:
                    # Si arranca un tool call en texto (DSML/XML), emitimos lo
                    # previo y dejamos de emitir el resto (el DSML no debe verse).
                    m = _TOOL_CALL_OPEN_RE.search(full_content, emitted_chars)
                    if m:
                        suppress_text = True
                        if m.start() > emitted_chars:
                            yield StreamChunk(delta=full_content[emitted_chars:m.start()])
                        emitted_chars = len(full_content)
                    else:
                        yield StreamChunk(delta=full_content[emitted_chars:])
                        emitted_chars = len(full_content)

            # Tool calls
            if delta and delta.tool_calls:
                for tc in delta.tool_calls:
                    idx = tc.index
                    if idx not in tool_calls_acc:
                        tool_calls_acc[idx] = {
                            "id": tc.id or "",
                            "name": "",
                            "arguments": "",
                        }
                    if tc.id:
                        tool_calls_acc[idx]["id"] = tc.id
                    if tc.function and tc.function.name:
                        tool_calls_acc[idx]["name"] = tc.function.name
                        yield StreamChunk(
                            tool_call_id=tc.id or tool_calls_acc[idx]["id"],
                            tool_name=tc.function.name,
                        )
                    if tc.function and tc.function.arguments:
                        tool_calls_acc[idx]["arguments"] += tc.function.arguments
                        yield StreamChunk(
                            tool_call_id=tool_calls_acc[idx]["id"],
                            tool_name=tool_calls_acc[idx]["name"],
                            tool_arguments=tc.function.arguments,
                        )

            # Finish
            if choice.finish_reason:
                # Cerrar el bloque de razonamiento (si lo hubo) con un signature
                # sintetico: el orquestador descarta thinking blocks sin signature
                # (proteccion para MiniMax/Anthropic). DeepSeek no usa signatures;
                # este marcador solo evita el descarte y NUNCA se reenvia — en
                # `_to_openai_messages` el bloque se mapea a `reasoning_content`.
                if reasoning_seen and not reasoning_sig_emitted:
                    reasoning_sig_emitted = True
                    yield StreamChunk(
                        thinking_signature="deepseek-reasoning",
                        block_type="thinking",
                        block_index=0,
                    )
                # Fallback de usage: algunos proveedores via LiteLLM no entregan el
                # chunk de usage (o llega tras el break del orquestador) → billing 0.
                # Estimamos por tokens para no infra-cobrar. Solo si el adapter lo
                # pide (LiteLLM); el path nativo conserva el usage real del proveedor.
                if self._estimate_usage_fallback and not final_usage and not usage_emitted:
                    final_usage = _estimate_usage(messages, full_content + "\n" + full_reasoning)
                # IMPORTANTE: DeepSeek (endpoint OpenAI) a veces cierra el stream
                # con finish_reason="stop" AUNQUE haya emitido tool_calls. Si nos
                # fiamos solo de =="tool_calls" perdemos esos tool calls: el agente
                # anuncia la accion en texto y "se para" sin ejecutarla. Por eso
                # disparamos los tool_use SIEMPRE que haya tool calls acumulados,
                # sea cual sea el finish_reason.
                if tool_calls_acc:
                    for acc in tool_calls_acc.values():
                        yield StreamChunk(
                            tool_call_id=acc["id"],
                            tool_name=acc["name"],
                            tool_arguments=acc["arguments"],
                            finish_reason="tool_use",
                        )
                    # Emit usage after tool_use chunks
                    if final_usage and not usage_emitted:
                        yield StreamChunk(usage=final_usage)
                        usage_emitted = True
                else:
                    # Fallback: DeepSeek pudo emitir las tool calls como TEXTO
                    # (DSML/XML) en vez de nativas. Parseamos el content y, si hay
                    # tool calls, las ejecutamos igual; si no, cerramos el turno.
                    text_calls = _parse_xml_tool_calls(full_content) if full_content else []
                    if text_calls:
                        for c in text_calls:
                            yield StreamChunk(
                                tool_call_id=c["id"],
                                tool_name=c["name"],
                                tool_arguments=json.dumps(c.get("arguments", {}), ensure_ascii=False),
                                finish_reason="tool_use",
                            )
                        if final_usage and not usage_emitted:
                            yield StreamChunk(usage=final_usage)
                            usage_emitted = True
                    else:
                        yield StreamChunk(
                            finish_reason="end_turn"
                            if choice.finish_reason in ("stop", "tool_calls")
                            else choice.finish_reason,
                            usage=final_usage if not usage_emitted else {},
                        )
                        usage_emitted = True

    # ------------------------------------------------------------------
    # Non-streaming
    # ------------------------------------------------------------------

    async def complete(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        config: ModelConfig | None = None,
    ) -> ModelResponse:
        config = config or ModelConfig(
            model_id=settings.default_model_id,
            max_tokens=settings.max_tokens,
            temperature=settings.temperature,
        )

        kwargs: dict[str, Any] = {
            "model": config.model_id or settings.default_model_id or "gpt-4o",
            "max_tokens": config.max_tokens,
            "temperature": config.temperature,
            "messages": self._to_openai_messages(messages),
        }
        if getattr(config, "reasoning_effort", ""):
            kwargs["reasoning_effort"] = config.reasoning_effort
        if tools:
            kwargs["tools"] = self._format_tools(tools)
        # Fuerza al modelo a usar un tool concreto para garantizar JSON por schema
        # (usado por /completions con json_schema). Ver ClaudeAdapter para la variante.
        force_tool = (config.extra or {}).get("force_tool")
        if force_tool:
            kwargs["tool_choice"] = {
                "type": "function",
                "function": {"name": force_tool},
            }

        try:
            response = await self._acreate(kwargs)
        except ContextOverflowError:
            raise
        except Exception as e:
            if _is_context_overflow(e):
                raise ContextOverflowError(str(getattr(e, "message", "") or e)) from e
            raise
        choice = response.choices[0]

        content = choice.message.content or ""
        tool_calls: list[dict[str, Any]] = []

        if choice.message.tool_calls:
            for tc in choice.message.tool_calls:
                tool_calls.append(
                    {
                        "id": tc.id,
                        "name": tc.function.name,
                        "arguments": json.loads(tc.function.arguments)
                        if tc.function.arguments
                        else {},
                    }
                )

        return ModelResponse(
            content=content,
            tool_calls=tool_calls,
            finish_reason=choice.finish_reason or "",
            usage={
                "input_tokens": response.usage.prompt_tokens if response.usage else 0,
                "output_tokens": response.usage.completion_tokens if response.usage else 0,
            },
            raw=response,
        )

    # ------------------------------------------------------------------
    # Token counting
    # ------------------------------------------------------------------

    async def count_tokens(self, text: str) -> int:
        from ..context.compactor import estimate_tokens
        return estimate_tokens(text)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Convert internal tool definitions to the OpenAI function-calling format.

        The parameter schema is taken from `input_schema`, else `parameters`,
        else a bare ``{"type": "object"}``. When `settings.deepseek_strict_tools`
        is truthy, each function is marked ``strict: True`` and its schema is
        stripped of the keywords DeepSeek strict mode does NOT support
        (minLength/maxLength/minItems/maxItems), which would otherwise cause
        a 400.
        """
        use_strict = settings.deepseek_strict_tools
        result: list[dict[str, Any]] = []
        for spec in tools:
            fallback_schema = spec.get("parameters", {"type": "object"})
            schema = spec.get("input_schema", fallback_schema)
            if use_strict:
                function_def: dict[str, Any] = {
                    "name": spec["name"],
                    "description": spec.get("description", ""),
                    "parameters": OpenAIAdapter._sanitize_strict_schema(schema),
                    "strict": True,
                }
            else:
                function_def = {
                    "name": spec["name"],
                    "description": spec.get("description", ""),
                    "parameters": schema,
                }
            result.append({"type": "function", "function": function_def})
        return result

    # Keywords not supported by DeepSeek strict mode (per the official docs).
    _STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems")

    @staticmethod
    def _sanitize_strict_schema(schema: Any) -> Any:
        """Elimina recursivamente keywords no soportados por DeepSeek strict."""
        if isinstance(schema, dict):
            return {
                k: OpenAIAdapter._sanitize_strict_schema(v)
                for k, v in schema.items()
                if k not in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS
            }
        if isinstance(schema, list):
            return [OpenAIAdapter._sanitize_strict_schema(x) for x in schema]
        return schema

    @staticmethod
    def _blocks_text(content: Any) -> str:
        """Extrae texto plano de un content que puede ser str o lista de bloques."""
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            parts = []
            for b in content:
                if isinstance(b, dict):
                    parts.append(b.get("text") or b.get("content") or "")
                else:
                    parts.append(str(b))
            return "\n".join(p for p in parts if p)
        return str(content)

    def _to_openai_messages(self, messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Convierte los mensajes del formato interno (Anthropic-style, con bloques
        `tool_use` / `tool_result`) al formato de la API OpenAI (`tool_calls` en el
        assistant, mensajes `role: tool` con `tool_call_id`). El contexto se construye
        en formato Anthropic, así que sin esto la API OpenAI de DeepSeek rechaza el
        body ('unknown variant tool_use')."""
        out: list[dict[str, Any]] = []
        for msg in messages:
            role = msg.get("role")
            content = msg.get("content")
            if role == "system":
                out.append({"role": "system", "content": content if isinstance(content, str) else self._blocks_text(content)})
                continue
            if not isinstance(content, list):
                out.append({"role": role, "content": content if isinstance(content, str) else str(content or "")})
                continue
            if role == "assistant":
                text_parts: list[str] = []
                tool_calls: list[dict[str, Any]] = []
                reasoning_parts: list[str] = []
                for b in content:
                    if not isinstance(b, dict):
                        continue
                    t = b.get("type")
                    if t == "text":
                        text_parts.append(b.get("text", ""))
                    elif t == "thinking":
                        # DeepSeek thinking mode: el razonamiento del turno debe
                        # reenviarse como `reasoning_content` (no como signature).
                        rc = b.get("thinking", "")
                        if rc:
                            reasoning_parts.append(rc)
                    elif t == "tool_use":
                        tool_calls.append({
                            "id": b.get("id", ""),
                            "type": "function",
                            "function": {
                                "name": b.get("name", ""),
                                "arguments": json.dumps(b.get("input", {}), ensure_ascii=False),
                            },
                        })
                text_joined = "\n".join(p for p in text_parts if p)
                m: dict[str, Any] = {"role": "assistant", "content": (text_joined or None)}
                if reasoning_parts:
                    if not text_joined and not tool_calls:
                        # Quirk DeepSeek thinking: a veces emite TODA la respuesta
                        # en reasoning_content y cierra sin content ni tool_calls.
                        # Reenviar content=None sin tool_calls rompe la API
                        # ("content or tool_calls must be set"), asi que promovemos
                        # el reasoning a content (sin duplicarlo como reasoning_content).
                        m["content"] = "\n".join(reasoning_parts)
                    else:
                        m["reasoning_content"] = "\n".join(reasoning_parts)
                if tool_calls:
                    m["tool_calls"] = tool_calls
                out.append(m)
            else:  # user (puede traer tool_result blocks, texto e imágenes)
                text_parts = []
                image_blocks: list[dict[str, Any]] = []
                for b in content:
                    if not isinstance(b, dict):
                        continue
                    t = b.get("type")
                    if t == "tool_result":
                        out.append({
                            "role": "tool",
                            "tool_call_id": b.get("tool_use_id", ""),
                            "content": self._blocks_text(b.get("content")),
                        })
                    elif t == "text":
                        text_parts.append(b.get("text", ""))
                    elif t == "image_url":
                        # Visión nativa: preservar el bloque en formato multimodal OpenAI.
                        image_blocks.append({"type": "image_url", "image_url": b.get("image_url") or {}})
                if image_blocks:
                    # Content como lista de bloques (texto + imágenes).
                    parts: list[dict[str, Any]] = []
                    joined = "\n".join(p for p in text_parts if p)
                    if joined:
                        parts.append({"type": "text", "text": joined})
                    parts.extend(image_blocks)
                    out.append({"role": "user", "content": parts})
                elif text_parts:
                    out.append({"role": "user", "content": "\n".join(text_parts)})
        # Guard defensivo: el compactor ya garantiza el invariante tool_use ↔
        # tool_result (`_enforce_tool_pairing`), pero si algo se escapa el
        # proveedor devuelve 400 y la sesion queda bloqueada. Cinturon y tirantes.
        return self._repair_tool_sequence(out)

    @staticmethod
    def _repair_tool_sequence(out: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Garantiza el contrato OpenAI sobre la secuencia ya convertida:

        - Todo `role: tool` debe responder a un tool_call_id del assistant
          inmediatamente anterior (o de su bloque contiguo de tool messages).
          Si no → se convierte a user con placeholder.
        - Todo assistant con `tool_calls` debe tener respuesta para CADA id.
          Los tool_calls sin respuesta se eliminan; si la lista queda vacia se
          elimina la key (y se asegura `content` no-None — "content or
          tool_calls must be set").

        No deberia activarse nunca (el compactor repara antes); si se activa,
        loguea warning para detectar regresiones del compactor.
        """
        repaired: list[dict[str, Any]] = []
        i = 0
        n = len(out)
        while i < n:
            msg = out[i]
            role = msg.get("role")

            if role == "assistant" and msg.get("tool_calls"):
                # Bloque contiguo de tool messages que responden a este assistant.
                j = i + 1
                block: list[dict[str, Any]] = []
                while j < n and out[j].get("role") == "tool":
                    block.append(out[j])
                    j += 1
                answered = {t.get("tool_call_id", "") for t in block}
                kept_calls = [
                    tc for tc in msg["tool_calls"] if tc.get("id", "") in answered
                ]
                dropped = [
                    tc for tc in msg["tool_calls"] if tc.get("id", "") not in answered
                ]
                new_msg = dict(msg)
                if dropped:
                    for tc in dropped:
                        logger.warning(
                            "repaired unanswered tool_call at index %d (tool_call_id=%s)",
                            i,
                            tc.get("id", ""),
                        )
                    if kept_calls:
                        new_msg["tool_calls"] = kept_calls
                    else:
                        new_msg.pop("tool_calls", None)
                        if new_msg.get("content") is None:
                            # Promover reasoning a content si existe (mismo
                            # criterio que el quirk DeepSeek de arriba); si no,
                            # placeholder para no enviar content=None sin tools.
                            rc = new_msg.pop("reasoning_content", None)
                            new_msg["content"] = rc or "[ASSISTANT COMPACTADO]"
                repaired.append(new_msg)
                valid_ids = {tc.get("id", "") for tc in kept_calls}
                converted: list[dict[str, Any]] = []
                for t in block:
                    if t.get("tool_call_id", "") in valid_ids:
                        repaired.append(t)
                    else:
                        logger.warning(
                            "repaired orphan tool message (tool_call_id=%s)",
                            t.get("tool_call_id", ""),
                        )
                        converted.append(
                            {
                                "role": "user",
                                "content": "[Resultado de herramienta (contexto compactado)]: "
                                + str(t.get("content", ""))[:500],
                            }
                        )
                # Los huerfanos convertidos van DESPUES del bloque de tools
                # validos para no romper la contiguidad assistant → tools.
                repaired.extend(converted)
                i = j
                continue

            if role == "tool":
                # Tool message sin assistant con tool_calls delante → huerfano.
                logger.warning(
                    "repaired orphan tool message at index %d (tool_call_id=%s)",
                    i,
                    msg.get("tool_call_id", ""),
                )
                repaired.append(
                    {
                        "role": "user",
                        "content": "[Resultado de herramienta (contexto compactado)]: "
                        + str(msg.get("content", ""))[:500],
                    }
                )
                i += 1
                continue

            repaired.append(msg)
            i += 1
        return repaired