Harden DeepSeek agent: LiteLLM adapter, DSML/reasoning/embeddings/error fixes

- LiteLLMAdapter (subclasses OpenAIAdapter via _acreate hook): routes DeepSeek through LiteLLM. Opt-in AGENTIC_DEFAULT_MODEL_PROVIDER=litellm. A/B beat the hand-rolled adapter (0 DSML, 0 parse-fails). Defensive chunk.usage getattr, token-estimate usage fallback for billing, quiet litellm logs.
- DSML parser: tolerate single/multi fullwidth pipes, honor string="true/false" typed args (openai_adapter fallback when DeepSeek leaks tool calls as text).
- Thinking mode: capture and round-trip reasoning_content across turns.
- Embeddings: dedicated AGENTIC_EMBEDDINGS_API_KEY (DeepSeek has no embeddings); disable cleanly when unset to avoid per-turn 401.
- claude_format: friendly generic error messages to the chat, raw only in logs.
- acai agent max_tokens 4096->16384 (whole-file writes no longer truncate); system.md size-based edit policy; strict tools opt-in (off).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 14:49:48 +00:00
parent e34a39e3bf
commit 6a03fdf284
12 changed files with 396 additions and 58 deletions
--- a/src/adapters/openai_adapter.py
+++ b/src/adapters/openai_adapter.py
@@ -14,6 +14,24 @@ from .base import ModelAdapter, ModelConfig, ModelResponse, StreamChunk
 logger = logging.getLogger(__name__)


+def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]:
+    """Estimate token usage when the provider reports none (e.g. LiteLLM
+    streaming). Approximate, but keeps billing from recording zero tokens."""
+    from ..context.compactor import estimate_tokens
+    inp = 0  # running input-token estimate across all messages
+    for m in messages:
+        c = m.get("content")
+        if isinstance(c, str):
+            inp += estimate_tokens(c)
+        elif isinstance(c, list):  # block-list content: count each dict block
+            for b in c:
+                if isinstance(b, dict):
+                    inp += estimate_tokens(  # first non-empty of text / thinking / content
+                        b.get("text") or b.get("thinking") or str(b.get("content") or "")
+                    )
+    return {"input_tokens": inp, "output_tokens": estimate_tokens(output_text or "")}
+
+
 class OpenAIAdapter(ModelAdapter):
    """Adapter for the OpenAI API (GPT-4o, o1, etc.)."""

@@ -25,6 +43,15 @@ class OpenAIAdapter(ModelAdapter):
        if url:
            kwargs["base_url"] = url
        self._client = AsyncOpenAI(**kwargs)
+        # El path nativo conserva el usage real del proveedor; subclases que no
+        # reciben usage fiable en streaming (LiteLLM) lo ponen a True para estimar.
+        self._estimate_usage_fallback = False
+
+    async def _acreate(self, kwargs: dict[str, Any]):
+        """Model-call hook. Subclasses (e.g. LiteLLMAdapter) override it to
+        route the request through another library without touching the rest
+        of the flow (chunk processing, tools, messages)."""
+        return await self._client.chat.completions.create(**kwargs)

    # ------------------------------------------------------------------
    # Streaming
@@ -53,7 +80,7 @@ class OpenAIAdapter(ModelAdapter):
        if tools:
            kwargs["tools"] = self._format_tools(tools)

-        stream = await self._client.chat.completions.create(**kwargs)
+        stream = await self._acreate(kwargs)

        # Fallback de tool-calls-en-texto: DeepSeek a veces emite las tool calls
        # en su formato interno DSML como TEXTO (en el content) en vez de como
@@ -65,28 +92,53 @@ class OpenAIAdapter(ModelAdapter):
        tool_calls_acc: dict[int, dict[str, str]] = {}

        final_usage: dict[str, int] = {}
+        usage_emitted = False   # evita doble conteo si llega usage tras estimar
        full_content = ""       # content acumulado (para el fallback DSML)
+        full_reasoning = ""     # razonamiento acumulado (para estimar usage)
        emitted_chars = 0       # cuanto de full_content ya se emitio como delta
        suppress_text = False   # tras detectar un tool-call-en-texto, no emitir mas

+        # DeepSeek thinking mode: el razonamiento llega en `delta.reasoning_content`
+        # (antes del content). Lo acumulamos como un bloque `thinking` (block_index 0)
+        # para que el orquestador lo persista y `_to_openai_messages` lo reenvie como
+        # `reasoning_content` en el siguiente turno — DeepSeek lo exige en multi-turno
+        # con tool calls ("reasoning_content ... must be passed back to the API").
+        reasoning_seen = False
+        reasoning_sig_emitted = False
+
        async for chunk in stream:
-            # With include_usage, the last chunk has usage but no choices
-            if chunk.usage:
+            # With include_usage, the last chunk has usage but no choices.
+            # getattr: el chunk de LiteLLM (ModelResponseStream) no siempre trae
+            # el atributo `usage`; el del SDK OpenAI sí (None salvo el ultimo).
+            chunk_usage = getattr(chunk, "usage", None)
+            if chunk_usage:
                final_usage = {
-                    "input_tokens": chunk.usage.prompt_tokens or 0,
-                    "output_tokens": chunk.usage.completion_tokens or 0,
+                    "input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0,
+                    "output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0,
                }

            choice = chunk.choices[0] if chunk.choices else None
            if not choice:
                # Usage-only chunk (last one with include_usage) — emit it
-                if final_usage:
+                if final_usage and not usage_emitted:
                    yield StreamChunk(usage=final_usage)
-                    final_usage = {}  # Only emit once
+                    usage_emitted = True
                continue

            delta = choice.delta

+            # Reasoning content (DeepSeek thinking mode). Llega como campo extra
+            # del delta; lo emitimos como thinking_delta en el bloque index 0.
+            reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None
+            if reasoning_txt:
+                reasoning_seen = True
+                full_reasoning += reasoning_txt
+                yield StreamChunk(
+                    thinking_delta=reasoning_txt,
+                    block_type="thinking",
+                    block_index=0,
+                )
+
            # Text content
            if delta and delta.content:
                full_content += delta.content
@@ -131,6 +183,24 @@ class OpenAIAdapter(ModelAdapter):

            # Finish
            if choice.finish_reason:
+                # Cerrar el bloque de razonamiento (si lo hubo) con un signature
+                # sintetico: el orquestador descarta thinking blocks sin signature
+                # (proteccion para MiniMax/Anthropic). DeepSeek no usa signatures;
+                # este marcador solo evita el descarte y NUNCA se reenvia — en
+                # `_to_openai_messages` el bloque se mapea a `reasoning_content`.
+                if reasoning_seen and not reasoning_sig_emitted:
+                    reasoning_sig_emitted = True
+                    yield StreamChunk(
+                        thinking_signature="deepseek-reasoning",
+                        block_type="thinking",
+                        block_index=0,
+                    )
+                # Fallback de usage: algunos proveedores via LiteLLM no entregan el
+                # chunk de usage (o llega tras el break del orquestador) → billing 0.
+                # Estimamos por tokens para no infra-cobrar. Solo si el adapter lo
+                # pide (LiteLLM); el path nativo conserva el usage real del proveedor.
+                if self._estimate_usage_fallback and not final_usage and not usage_emitted:
+                    final_usage = _estimate_usage(messages, full_content + "\n" + full_reasoning)
                # IMPORTANTE: DeepSeek (endpoint OpenAI) a veces cierra el stream
                # con finish_reason="stop" AUNQUE haya emitido tool_calls. Si nos
                # fiamos solo de =="tool_calls" perdemos esos tool calls: el agente
@@ -146,8 +216,9 @@ class OpenAIAdapter(ModelAdapter):
                            finish_reason="tool_use",
                        )
                    # Emit usage after tool_use chunks
-                    if final_usage:
+                    if final_usage and not usage_emitted:
                        yield StreamChunk(usage=final_usage)
+                        usage_emitted = True
                else:
                    # Fallback: DeepSeek pudo emitir las tool calls como TEXTO
                    # (DSML/XML) en vez de nativas. Parseamos el content y, si hay
@@ -161,15 +232,17 @@ class OpenAIAdapter(ModelAdapter):
                                tool_arguments=json.dumps(c.get("arguments", {}), ensure_ascii=False),
                                finish_reason="tool_use",
                            )
-                        if final_usage:
+                        if final_usage and not usage_emitted:
                            yield StreamChunk(usage=final_usage)
+                            usage_emitted = True
                    else:
                        yield StreamChunk(
                            finish_reason="end_turn"
                            if choice.finish_reason in ("stop", "tool_calls")
                            else choice.finish_reason,
-                            usage=final_usage,
+                            usage=final_usage if not usage_emitted else {},
                        )
+                        usage_emitted = True

    # ------------------------------------------------------------------
    # Non-streaming
@@ -204,7 +277,7 @@ class OpenAIAdapter(ModelAdapter):
                "function": {"name": force_tool},
            }

-        response = await self._client.chat.completions.create(**kwargs)
+        response = await self._acreate(kwargs)
        choice = response.choices[0]

        content = choice.message.content or ""
@@ -247,23 +320,41 @@ class OpenAIAdapter(ModelAdapter):

    @staticmethod
    def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
-        """Convert internal tool definitions to OpenAI function calling format."""
+        """Convert internal tool definitions to OpenAI function calling format.
+
+        When `deepseek_strict_tools` is set, mark every function with `strict: true`
+        and strip from the schema the keywords that DeepSeek strict mode does NOT
+        support (minLength/maxLength/minItems/maxItems), which would otherwise 400."""
+        strict = settings.deepseek_strict_tools
        formatted: list[dict[str, Any]] = []
        for tool in tools:
-            formatted.append(
-                {
-                    "type": "function",
-                    "function": {
-                        "name": tool["name"],
-                        "description": tool.get("description", ""),
-                        "parameters": tool.get(
-                            "input_schema", tool.get("parameters", {"type": "object"})
-                        ),
-                    },
-                }
-            )
+            params = tool.get("input_schema", tool.get("parameters", {"type": "object"}))
+            fn: dict[str, Any] = {
+                "name": tool["name"],
+                "description": tool.get("description", ""),
+                "parameters": OpenAIAdapter._sanitize_strict_schema(params) if strict else params,
+            }
+            if strict:
+                fn["strict"] = True
+            formatted.append({"type": "function", "function": fn})
        return formatted

+    # Keywords not supported by DeepSeek strict mode (per the official docs).
+    _STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems")
+
+    @staticmethod
+    def _sanitize_strict_schema(schema: Any) -> Any:
+        """Recursively strip keywords that DeepSeek strict mode does not support."""
+        if isinstance(schema, dict):
+            return {  # rebuilt dict; the input schema is not mutated
+                k: OpenAIAdapter._sanitize_strict_schema(v)
+                for k, v in schema.items()
+                # NOTE(review): applies at every level, so a property literally named e.g. "maxItems" is dropped too
+                if k not in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS
+            }
+        if isinstance(schema, list):
+            return [OpenAIAdapter._sanitize_strict_schema(x) for x in schema]
+        return schema
+
    @staticmethod
    def _blocks_text(content: Any) -> str:
        """Extrae texto plano de un content que puede ser str o lista de bloques."""
@@ -300,12 +391,19 @@ class OpenAIAdapter(ModelAdapter):
            if role == "assistant":
                text_parts: list[str] = []
                tool_calls: list[dict[str, Any]] = []
+                reasoning_parts: list[str] = []
                for b in content:
                    if not isinstance(b, dict):
                        continue
                    t = b.get("type")
                    if t == "text":
                        text_parts.append(b.get("text", ""))
+                    elif t == "thinking":
+                        # DeepSeek thinking mode: el razonamiento del turno debe
+                        # reenviarse como `reasoning_content` (no como signature).
+                        rc = b.get("thinking", "")
+                        if rc:
+                            reasoning_parts.append(rc)
                    elif t == "tool_use":
                        tool_calls.append({
                            "id": b.get("id", ""),
@@ -315,8 +413,9 @@ class OpenAIAdapter(ModelAdapter):
                                "arguments": json.dumps(b.get("input", {}), ensure_ascii=False),
                            },
                        })
-                    # thinking / otros bloques: se ignoran (OpenAI no los soporta)
                m: dict[str, Any] = {"role": "assistant", "content": ("\n".join(p for p in text_parts if p) or None)}
+                if reasoning_parts:
+                    m["reasoning_content"] = "\n".join(reasoning_parts)
                if tool_calls:
                    m["tool_calls"] = tool_calls
                out.append(m)