Harden DeepSeek agent: LiteLLM adapter, DSML/reasoning/embeddings/error fixes
- LiteLLMAdapter (subclasses OpenAIAdapter via _acreate hook): routes DeepSeek through LiteLLM. Opt-in AGENTIC_DEFAULT_MODEL_PROVIDER=litellm. A/B beat the hand-rolled adapter (0 DSML, 0 parse-fails). Defensive chunk.usage getattr, token-estimate usage fallback for billing, quiet litellm logs. - DSML parser: tolerate single/multi fullwidth pipes, honor string="true/false" typed args (openai_adapter fallback when DeepSeek leaks tool calls as text). - Thinking mode: capture and round-trip reasoning_content across turns. - Embeddings: dedicated AGENTIC_EMBEDDINGS_API_KEY (DeepSeek has no embeddings); disable cleanly when unset to avoid per-turn 401. - claude_format: friendly generic error messages to the chat, raw only in logs. - acai agent max_tokens 4096->16384 (whole-file writes no longer truncate); system.md size-based edit policy; strict tools opt-in (off). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,24 @@ from .base import ModelAdapter, ModelConfig, ModelResponse, StreamChunk
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _estimate_usage(messages: list[dict[str, Any]], output_text: str) -> dict[str, int]:
    """Estimate token usage when the provider does not report it.

    Fallback for providers that deliver no usage data (e.g. LiteLLM
    streaming), so that billing never records zero tokens. The figures are
    approximate.

    Args:
        messages: Conversation messages. ``content`` may be a plain string or
            a list of block dicts; any other shape contributes nothing.
        output_text: Text produced by the model; a falsy value counts as "".

    Returns:
        A dict with the keys ``input_tokens`` and ``output_tokens``.
    """
    import json

    # Kept as a function-level import, as in the original
    # (presumably to avoid an import cycle — TODO confirm).
    from ..context.compactor import estimate_tokens

    def _block_text(block: dict[str, Any]) -> str:
        """Return the text of one content block that should be counted."""
        text = block.get("text") or block.get("thinking")
        if text:
            return text
        if block.get("type") == "tool_use":
            # tool_use blocks carry their payload in `input`, not in
            # text/content. Ignoring it under-counts turns with large tool
            # arguments (e.g. whole-file writes).
            payload = block.get("input")
            if payload:
                return json.dumps(payload, ensure_ascii=False, default=str)
            return ""
        return str(block.get("content") or "")

    input_tokens = 0
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            input_tokens += estimate_tokens(content)
        elif isinstance(content, list):
            input_tokens += sum(
                estimate_tokens(_block_text(block))
                for block in content
                if isinstance(block, dict)
            )

    return {
        "input_tokens": input_tokens,
        "output_tokens": estimate_tokens(output_text or ""),
    }
|
||||
|
||||
|
||||
class OpenAIAdapter(ModelAdapter):
|
||||
"""Adapter for the OpenAI API (GPT-4o, o1, etc.)."""
|
||||
|
||||
@@ -25,6 +43,15 @@ class OpenAIAdapter(ModelAdapter):
|
||||
if url:
|
||||
kwargs["base_url"] = url
|
||||
self._client = AsyncOpenAI(**kwargs)
|
||||
# El path nativo conserva el usage real del proveedor; subclases que no
|
||||
# reciben usage fiable en streaming (LiteLLM) lo ponen a True para estimar.
|
||||
self._estimate_usage_fallback = False
|
||||
|
||||
async def _acreate(self, kwargs: dict[str, Any]):
    """Issue the chat-completions request to the model.

    Overridable hook: subclasses (e.g. LiteLLMAdapter) replace it to route
    the call through another library without touching the rest of the flow
    (chunk processing, tools, messages).

    Args:
        kwargs: Keyword arguments forwarded unchanged to
            ``self._client.chat.completions.create``.

    Returns:
        The awaited result of the client call.
    """
    create = self._client.chat.completions.create
    return await create(**kwargs)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Streaming
|
||||
@@ -53,7 +80,7 @@ class OpenAIAdapter(ModelAdapter):
|
||||
if tools:
|
||||
kwargs["tools"] = self._format_tools(tools)
|
||||
|
||||
stream = await self._client.chat.completions.create(**kwargs)
|
||||
stream = await self._acreate(kwargs)
|
||||
|
||||
# Fallback de tool-calls-en-texto: DeepSeek a veces emite las tool calls
|
||||
# en su formato interno DSML como TEXTO (en el content) en vez de como
|
||||
@@ -65,28 +92,53 @@ class OpenAIAdapter(ModelAdapter):
|
||||
tool_calls_acc: dict[int, dict[str, str]] = {}
|
||||
|
||||
final_usage: dict[str, int] = {}
|
||||
usage_emitted = False # evita doble conteo si llega usage tras estimar
|
||||
full_content = "" # content acumulado (para el fallback DSML)
|
||||
full_reasoning = "" # razonamiento acumulado (para estimar usage)
|
||||
emitted_chars = 0 # cuanto de full_content ya se emitio como delta
|
||||
suppress_text = False # tras detectar un tool-call-en-texto, no emitir mas
|
||||
|
||||
# DeepSeek thinking mode: el razonamiento llega en `delta.reasoning_content`
|
||||
# (antes del content). Lo acumulamos como un bloque `thinking` (block_index 0)
|
||||
# para que el orquestador lo persista y `_to_openai_messages` lo reenvie como
|
||||
# `reasoning_content` en el siguiente turno — DeepSeek lo exige en multi-turno
|
||||
# con tool calls ("reasoning_content ... must be passed back to the API").
|
||||
reasoning_seen = False
|
||||
reasoning_sig_emitted = False
|
||||
|
||||
async for chunk in stream:
|
||||
# With include_usage, the last chunk has usage but no choices
|
||||
if chunk.usage:
|
||||
# With include_usage, the last chunk has usage but no choices.
|
||||
# getattr: el chunk de LiteLLM (ModelResponseStream) no siempre trae
|
||||
# el atributo `usage`; el del SDK OpenAI sí (None salvo el ultimo).
|
||||
chunk_usage = getattr(chunk, "usage", None)
|
||||
if chunk_usage:
|
||||
final_usage = {
|
||||
"input_tokens": chunk.usage.prompt_tokens or 0,
|
||||
"output_tokens": chunk.usage.completion_tokens or 0,
|
||||
"input_tokens": getattr(chunk_usage, "prompt_tokens", 0) or 0,
|
||||
"output_tokens": getattr(chunk_usage, "completion_tokens", 0) or 0,
|
||||
}
|
||||
|
||||
choice = chunk.choices[0] if chunk.choices else None
|
||||
if not choice:
|
||||
# Usage-only chunk (last one with include_usage) — emit it
|
||||
if final_usage:
|
||||
if final_usage and not usage_emitted:
|
||||
yield StreamChunk(usage=final_usage)
|
||||
final_usage = {} # Only emit once
|
||||
usage_emitted = True
|
||||
continue
|
||||
|
||||
delta = choice.delta
|
||||
|
||||
# Reasoning content (DeepSeek thinking mode). Llega como campo extra
|
||||
# del delta; lo emitimos como thinking_delta en el bloque index 0.
|
||||
reasoning_txt = getattr(delta, "reasoning_content", None) if delta else None
|
||||
if reasoning_txt:
|
||||
reasoning_seen = True
|
||||
full_reasoning += reasoning_txt
|
||||
yield StreamChunk(
|
||||
thinking_delta=reasoning_txt,
|
||||
block_type="thinking",
|
||||
block_index=0,
|
||||
)
|
||||
|
||||
# Text content
|
||||
if delta and delta.content:
|
||||
full_content += delta.content
|
||||
@@ -131,6 +183,24 @@ class OpenAIAdapter(ModelAdapter):
|
||||
|
||||
# Finish
|
||||
if choice.finish_reason:
|
||||
# Cerrar el bloque de razonamiento (si lo hubo) con un signature
|
||||
# sintetico: el orquestador descarta thinking blocks sin signature
|
||||
# (proteccion para MiniMax/Anthropic). DeepSeek no usa signatures;
|
||||
# este marcador solo evita el descarte y NUNCA se reenvia — en
|
||||
# `_to_openai_messages` el bloque se mapea a `reasoning_content`.
|
||||
if reasoning_seen and not reasoning_sig_emitted:
|
||||
reasoning_sig_emitted = True
|
||||
yield StreamChunk(
|
||||
thinking_signature="deepseek-reasoning",
|
||||
block_type="thinking",
|
||||
block_index=0,
|
||||
)
|
||||
# Fallback de usage: algunos proveedores via LiteLLM no entregan el
|
||||
# chunk de usage (o llega tras el break del orquestador) → billing 0.
|
||||
# Estimamos por tokens para no infra-cobrar. Solo si el adapter lo
|
||||
# pide (LiteLLM); el path nativo conserva el usage real del proveedor.
|
||||
if self._estimate_usage_fallback and not final_usage and not usage_emitted:
|
||||
final_usage = _estimate_usage(messages, full_content + "\n" + full_reasoning)
|
||||
# IMPORTANTE: DeepSeek (endpoint OpenAI) a veces cierra el stream
|
||||
# con finish_reason="stop" AUNQUE haya emitido tool_calls. Si nos
|
||||
# fiamos solo de =="tool_calls" perdemos esos tool calls: el agente
|
||||
@@ -146,8 +216,9 @@ class OpenAIAdapter(ModelAdapter):
|
||||
finish_reason="tool_use",
|
||||
)
|
||||
# Emit usage after tool_use chunks
|
||||
if final_usage:
|
||||
if final_usage and not usage_emitted:
|
||||
yield StreamChunk(usage=final_usage)
|
||||
usage_emitted = True
|
||||
else:
|
||||
# Fallback: DeepSeek pudo emitir las tool calls como TEXTO
|
||||
# (DSML/XML) en vez de nativas. Parseamos el content y, si hay
|
||||
@@ -161,15 +232,17 @@ class OpenAIAdapter(ModelAdapter):
|
||||
tool_arguments=json.dumps(c.get("arguments", {}), ensure_ascii=False),
|
||||
finish_reason="tool_use",
|
||||
)
|
||||
if final_usage:
|
||||
if final_usage and not usage_emitted:
|
||||
yield StreamChunk(usage=final_usage)
|
||||
usage_emitted = True
|
||||
else:
|
||||
yield StreamChunk(
|
||||
finish_reason="end_turn"
|
||||
if choice.finish_reason in ("stop", "tool_calls")
|
||||
else choice.finish_reason,
|
||||
usage=final_usage,
|
||||
usage=final_usage if not usage_emitted else {},
|
||||
)
|
||||
usage_emitted = True
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Non-streaming
|
||||
@@ -204,7 +277,7 @@ class OpenAIAdapter(ModelAdapter):
|
||||
"function": {"name": force_tool},
|
||||
}
|
||||
|
||||
response = await self._client.chat.completions.create(**kwargs)
|
||||
response = await self._acreate(kwargs)
|
||||
choice = response.choices[0]
|
||||
|
||||
content = choice.message.content or ""
|
||||
@@ -247,23 +320,41 @@ class OpenAIAdapter(ModelAdapter):
|
||||
|
||||
@staticmethod
def _format_tools(tools: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert internal tool definitions to OpenAI function calling format.

    When ``settings.deepseek_strict_tools`` is truthy, every function is
    marked ``strict: true`` and its schema is passed through
    ``_sanitize_strict_schema``, which strips the keywords listed in
    ``_STRICT_UNSUPPORTED_KEYS`` (per the original author's note, DeepSeek
    strict mode would otherwise answer with HTTP 400).

    Args:
        tools: Tool definitions. Each needs a ``name``; ``description`` is
            optional; the schema is read from ``input_schema``, then
            ``parameters``, defaulting to ``{"type": "object"}``.

    Returns:
        One ``{"type": "function", "function": {...}}`` entry per tool, in
        input order.
    """
    strict = settings.deepseek_strict_tools
    formatted: list[dict[str, Any]] = []
    for tool in tools:
        params = tool.get("input_schema", tool.get("parameters", {"type": "object"}))
        fn: dict[str, Any] = {
            "name": tool["name"],
            "description": tool.get("description", ""),
            "parameters": OpenAIAdapter._sanitize_strict_schema(params) if strict else params,
        }
        if strict:
            fn["strict"] = True
        # Exactly one entry per tool.
        formatted.append({"type": "function", "function": fn})
    return formatted
|
||||
|
||||
# Keywords no soportados por DeepSeek strict mode (segun docs oficiales).
|
||||
_STRICT_UNSUPPORTED_KEYS = ("minLength", "maxLength", "minItems", "maxItems")
|
||||
|
||||
@staticmethod
|
||||
def _sanitize_strict_schema(schema: Any) -> Any:
|
||||
"""Elimina recursivamente keywords no soportados por DeepSeek strict."""
|
||||
if isinstance(schema, dict):
|
||||
return {
|
||||
k: OpenAIAdapter._sanitize_strict_schema(v)
|
||||
for k, v in schema.items()
|
||||
if k not in OpenAIAdapter._STRICT_UNSUPPORTED_KEYS
|
||||
}
|
||||
if isinstance(schema, list):
|
||||
return [OpenAIAdapter._sanitize_strict_schema(x) for x in schema]
|
||||
return schema
|
||||
|
||||
@staticmethod
|
||||
def _blocks_text(content: Any) -> str:
|
||||
"""Extrae texto plano de un content que puede ser str o lista de bloques."""
|
||||
@@ -300,12 +391,19 @@ class OpenAIAdapter(ModelAdapter):
|
||||
if role == "assistant":
|
||||
text_parts: list[str] = []
|
||||
tool_calls: list[dict[str, Any]] = []
|
||||
reasoning_parts: list[str] = []
|
||||
for b in content:
|
||||
if not isinstance(b, dict):
|
||||
continue
|
||||
t = b.get("type")
|
||||
if t == "text":
|
||||
text_parts.append(b.get("text", ""))
|
||||
elif t == "thinking":
|
||||
# DeepSeek thinking mode: el razonamiento del turno debe
|
||||
# reenviarse como `reasoning_content` (no como signature).
|
||||
rc = b.get("thinking", "")
|
||||
if rc:
|
||||
reasoning_parts.append(rc)
|
||||
elif t == "tool_use":
|
||||
tool_calls.append({
|
||||
"id": b.get("id", ""),
|
||||
@@ -315,8 +413,9 @@ class OpenAIAdapter(ModelAdapter):
|
||||
"arguments": json.dumps(b.get("input", {}), ensure_ascii=False),
|
||||
},
|
||||
})
|
||||
# thinking / otros bloques: se ignoran (OpenAI no los soporta)
|
||||
m: dict[str, Any] = {"role": "assistant", "content": ("\n".join(p for p in text_parts if p) or None)}
|
||||
if reasoning_parts:
|
||||
m["reasoning_content"] = "\n".join(reasoning_parts)
|
||||
if tool_calls:
|
||||
m["tool_calls"] = tool_calls
|
||||
out.append(m)
|
||||
|
||||
Reference in New Issue
Block a user