Añadir completion + ajustes del chat
This commit is contained in:
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, AsyncIterator
|
from typing import Any, AsyncIterator
|
||||||
@@ -14,6 +15,27 @@ from .base import ModelAdapter, ModelConfig, ModelResponse, StreamChunk
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Errores transitorios del proxy del modelo (MiniMax/Anthropic). Reintentamos
|
||||||
|
# con backoff exponencial: 1s, 3s, 9s. 529 es overloaded_error de Anthropic;
|
||||||
|
# 429 rate-limit; 503 service unavailable.
|
||||||
|
_TRANSIENT_STATUSES = {429, 503, 529}
|
||||||
|
_RETRY_DELAYS = (1.0, 3.0, 9.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_transient(exc: Exception) -> bool:
    """Return True when *exc* is safe to retry (overload / transient network error).

    Connection and timeout errors always qualify. An API status error
    qualifies when its status code is one of ``_TRANSIENT_STATUSES`` or when
    its message mentions an overload condition.
    """
    network_errors = (anthropic.APIConnectionError, anthropic.APITimeoutError)
    if isinstance(exc, network_errors):
        return True

    # Anything that is not an HTTP status error is treated as permanent.
    if not isinstance(exc, anthropic.APIStatusError):
        return False

    if getattr(exc, "status_code", None) in _TRANSIENT_STATUSES:
        return True

    # Fall back to inspecting the error text for overload wording.
    text = str(exc).lower()
    return any(marker in text for marker in ("overloaded", "high load"))
|
||||||
|
|
||||||
|
|
||||||
class ClaudeAdapter(ModelAdapter):
|
class ClaudeAdapter(ModelAdapter):
|
||||||
"""Adapter for the Anthropic Claude API."""
|
"""Adapter for the Anthropic Claude API."""
|
||||||
|
|
||||||
@@ -63,65 +85,87 @@ class ClaudeAdapter(ModelAdapter):
|
|||||||
if tools:
|
if tools:
|
||||||
kwargs["tools"] = self._format_tools(tools)
|
kwargs["tools"] = self._format_tools(tools)
|
||||||
|
|
||||||
async with self._client.messages.stream(**kwargs) as stream:
|
# Retry con backoff sobre errores transitorios al ABRIR el stream.
|
||||||
current_tool_id = ""
|
# Si ya hemos empezado a emitir chunks al consumidor, NO podemos
|
||||||
current_tool_name = ""
|
# reintentar (el orquestador ya recibió contenido parcial).
|
||||||
accumulated_args = ""
|
attempt = 0
|
||||||
input_tokens = 0
|
max_attempts = len(_RETRY_DELAYS) + 1
|
||||||
|
while True:
|
||||||
async for event in stream:
|
yielded_any = False
|
||||||
if event.type == "message_start" and hasattr(event, "message"):
|
try:
|
||||||
usage = getattr(event.message, "usage", None)
|
async with self._client.messages.stream(**kwargs) as stream:
|
||||||
if usage:
|
|
||||||
input_tokens = getattr(usage, "input_tokens", 0)
|
|
||||||
|
|
||||||
if event.type == "content_block_start":
|
|
||||||
block = event.content_block
|
|
||||||
if block.type == "tool_use":
|
|
||||||
current_tool_id = block.id
|
|
||||||
current_tool_name = block.name
|
|
||||||
accumulated_args = ""
|
|
||||||
yield StreamChunk(
|
|
||||||
tool_call_id=current_tool_id,
|
|
||||||
tool_name=current_tool_name,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if event.type == "content_block_delta":
|
|
||||||
delta = event.delta
|
|
||||||
if delta.type == "text_delta":
|
|
||||||
yield StreamChunk(delta=delta.text)
|
|
||||||
elif delta.type == "input_json_delta":
|
|
||||||
accumulated_args += delta.partial_json
|
|
||||||
yield StreamChunk(
|
|
||||||
tool_call_id=current_tool_id,
|
|
||||||
tool_name=current_tool_name,
|
|
||||||
tool_arguments=delta.partial_json,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if event.type == "content_block_stop":
|
|
||||||
if current_tool_id and accumulated_args:
|
|
||||||
yield StreamChunk(
|
|
||||||
tool_call_id=current_tool_id,
|
|
||||||
tool_name=current_tool_name,
|
|
||||||
tool_arguments=accumulated_args,
|
|
||||||
finish_reason="tool_use",
|
|
||||||
)
|
|
||||||
current_tool_id = ""
|
current_tool_id = ""
|
||||||
current_tool_name = ""
|
current_tool_name = ""
|
||||||
accumulated_args = ""
|
accumulated_args = ""
|
||||||
continue
|
input_tokens = 0
|
||||||
|
|
||||||
if event.type == "message_delta":
|
async for event in stream:
|
||||||
output_tokens = getattr(event.usage, "output_tokens", 0) if event.usage else 0
|
yielded_any = True
|
||||||
yield StreamChunk(
|
if event.type == "message_start" and hasattr(event, "message"):
|
||||||
finish_reason=event.delta.stop_reason or "",
|
usage = getattr(event.message, "usage", None)
|
||||||
usage={
|
if usage:
|
||||||
"input_tokens": input_tokens,
|
input_tokens = getattr(usage, "input_tokens", 0)
|
||||||
"output_tokens": output_tokens,
|
|
||||||
},
|
if event.type == "content_block_start":
|
||||||
)
|
block = event.content_block
|
||||||
|
if block.type == "tool_use":
|
||||||
|
current_tool_id = block.id
|
||||||
|
current_tool_name = block.name
|
||||||
|
accumulated_args = ""
|
||||||
|
yield StreamChunk(
|
||||||
|
tool_call_id=current_tool_id,
|
||||||
|
tool_name=current_tool_name,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if event.type == "content_block_delta":
|
||||||
|
delta = event.delta
|
||||||
|
if delta.type == "text_delta":
|
||||||
|
yield StreamChunk(delta=delta.text)
|
||||||
|
elif delta.type == "input_json_delta":
|
||||||
|
accumulated_args += delta.partial_json
|
||||||
|
yield StreamChunk(
|
||||||
|
tool_call_id=current_tool_id,
|
||||||
|
tool_name=current_tool_name,
|
||||||
|
tool_arguments=delta.partial_json,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if event.type == "content_block_stop":
|
||||||
|
if current_tool_id and accumulated_args:
|
||||||
|
yield StreamChunk(
|
||||||
|
tool_call_id=current_tool_id,
|
||||||
|
tool_name=current_tool_name,
|
||||||
|
tool_arguments=accumulated_args,
|
||||||
|
finish_reason="tool_use",
|
||||||
|
)
|
||||||
|
current_tool_id = ""
|
||||||
|
current_tool_name = ""
|
||||||
|
accumulated_args = ""
|
||||||
|
continue
|
||||||
|
|
||||||
|
if event.type == "message_delta":
|
||||||
|
output_tokens = getattr(event.usage, "output_tokens", 0) if event.usage else 0
|
||||||
|
yield StreamChunk(
|
||||||
|
finish_reason=event.delta.stop_reason or "",
|
||||||
|
usage={
|
||||||
|
"input_tokens": input_tokens,
|
||||||
|
"output_tokens": output_tokens,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return # consumo OK, salimos del retry loop
|
||||||
|
except Exception as e:
|
||||||
|
# Si ya emitimos algo al consumidor, no podemos reintentar
|
||||||
|
# de forma segura: el contenido parcial ya viajó.
|
||||||
|
if yielded_any or not _is_transient(e) or attempt >= max_attempts - 1:
|
||||||
|
raise
|
||||||
|
wait = _RETRY_DELAYS[attempt]
|
||||||
|
logger.warning(
|
||||||
|
"Claude stream() transient error (attempt %d/%d), retrying in %.1fs: %s",
|
||||||
|
attempt + 1, max_attempts, wait, str(e)[:200],
|
||||||
|
)
|
||||||
|
await asyncio.sleep(wait)
|
||||||
|
attempt += 1
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Non-streaming
|
# Non-streaming
|
||||||
@@ -158,8 +202,31 @@ class ClaudeAdapter(ModelAdapter):
|
|||||||
kwargs["system"] = system_content
|
kwargs["system"] = system_content
|
||||||
if tools:
|
if tools:
|
||||||
kwargs["tools"] = self._format_tools(tools)
|
kwargs["tools"] = self._format_tools(tools)
|
||||||
|
# Fuerza al modelo a usar un tool concreto para garantizar JSON por schema
|
||||||
|
# (usado por /completions con json_schema). Ver OpenAIAdapter para la variante.
|
||||||
|
force_tool = (config.extra or {}).get("force_tool")
|
||||||
|
if force_tool:
|
||||||
|
kwargs["tool_choice"] = {"type": "tool", "name": force_tool}
|
||||||
|
|
||||||
response = await self._client.messages.create(**kwargs)
|
# Retry con backoff sobre errores transitorios (429/503/529). El proxy
|
||||||
|
# MiniMax devuelve 529 overloaded_error con cierta frecuencia bajo carga.
|
||||||
|
last_exc: Exception | None = None
|
||||||
|
for attempt in range(len(_RETRY_DELAYS) + 1):
|
||||||
|
try:
|
||||||
|
response = await self._client.messages.create(**kwargs)
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
if not _is_transient(e) or attempt == len(_RETRY_DELAYS):
|
||||||
|
raise
|
||||||
|
wait = _RETRY_DELAYS[attempt]
|
||||||
|
logger.warning(
|
||||||
|
"Claude complete() transient error (attempt %d/%d), retrying in %.1fs: %s",
|
||||||
|
attempt + 1, len(_RETRY_DELAYS) + 1, wait, str(e)[:200],
|
||||||
|
)
|
||||||
|
last_exc = e
|
||||||
|
await asyncio.sleep(wait)
|
||||||
|
else:
|
||||||
|
raise last_exc or RuntimeError("Claude complete() retry exhausted")
|
||||||
|
|
||||||
content = ""
|
content = ""
|
||||||
tool_calls: list[dict[str, Any]] = []
|
tool_calls: list[dict[str, Any]] = []
|
||||||
|
|||||||
@@ -152,6 +152,14 @@ class OpenAIAdapter(ModelAdapter):
|
|||||||
}
|
}
|
||||||
if tools:
|
if tools:
|
||||||
kwargs["tools"] = self._format_tools(tools)
|
kwargs["tools"] = self._format_tools(tools)
|
||||||
|
# Fuerza al modelo a usar un tool concreto para garantizar JSON por schema
|
||||||
|
# (usado por /completions con json_schema). Ver ClaudeAdapter para la variante.
|
||||||
|
force_tool = (config.extra or {}).get("force_tool")
|
||||||
|
if force_tool:
|
||||||
|
kwargs["tool_choice"] = {
|
||||||
|
"type": "function",
|
||||||
|
"function": {"name": force_tool},
|
||||||
|
}
|
||||||
|
|
||||||
response = await self._client.chat.completions.create(**kwargs)
|
response = await self._client.chat.completions.create(**kwargs)
|
||||||
choice = response.choices[0]
|
choice = response.choices[0]
|
||||||
|
|||||||
@@ -48,6 +48,28 @@ class SendMessageRequest(BaseModel):
|
|||||||
agent_id: str | None = None
|
agent_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionRequest(BaseModel):
    """Request body for a one-shot structured completion: no session, no MCP, no agent.

    Meant for callers that need a direct answer from the LLM (optionally
    conforming to a JSON schema). Example: `content_export.py` when it
    extracts brand info or generates text chunks for local websites.
    """

    # Text sent to the model as the user message.
    message: str
    # Optional system prompt.
    system: str | None = None
    # Optional model override.
    model_id: str | None = None
    # Generation limits / sampling settings forwarded to the model config.
    max_tokens: int = 4096
    temperature: float = 0.3
    # When provided, the answer is requested in the shape of this JSON schema.
    json_schema: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionResponse(BaseModel):
    """Response body returned by the one-shot completion endpoint."""

    # Reply text (or the JSON-serialized structured result).
    content: str = ""
    # Structured result as a dict, when one was produced.
    parsed: dict[str, Any] | None = None
    # Token usage counters reported by the model call.
    usage: dict[str, int] = Field(default_factory=dict)
    # Identifier of the model that served the request.
    model_id: str = ""
|
||||||
|
|
||||||
|
|
||||||
class SessionResponse(BaseModel):
|
class SessionResponse(BaseModel):
|
||||||
session_id: str
|
session_id: str
|
||||||
status: str
|
status: str
|
||||||
@@ -157,6 +179,70 @@ async def create_session(body: CreateSessionRequest) -> CreateSessionResponse:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# POST /completions — one-shot structured completion
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
@router.post("/completions", response_model=CompletionResponse)
async def completion(body: CompletionRequest) -> CompletionResponse:
    """Call the LLM directly: no session, no MCP, no agent.

    When `json_schema` is provided, the model is forced to fill in a single
    tool whose input schema is that schema (forced tool_use for
    Claude/Anthropic-compatible models, forced function calling for OpenAI).
    The structured result is returned in `parsed`, and its JSON serialization
    in `content`.

    Args:
        body: Prompt, optional system prompt, model settings and optional
            JSON schema for structured output.

    Returns:
        The reply text, the parsed structured result (if any), token usage
        and the model id that was used.

    Raises:
        HTTPException: 503 when no model adapter is registered; 502 when the
            model call itself fails.
    """
    import json as _json

    from ..adapters.base import ModelConfig

    adapter = _deps.get("model_adapter")
    if adapter is None:
        raise HTTPException(status_code=503, detail="Model adapter not initialized")

    messages: list[dict[str, Any]] = []
    if body.system:
        messages.append({"role": "system", "content": body.system})
    messages.append({"role": "user", "content": body.message})

    config = ModelConfig(
        model_id=body.model_id or settings.default_model_id,
        max_tokens=body.max_tokens,
        temperature=body.temperature,
    )

    tools_param: list[dict[str, Any]] | None = None
    if body.json_schema:
        # Expose the schema as the only tool and force the model to call it,
        # so the reply is guaranteed to arrive as tool arguments.
        tools_param = [
            {
                "name": "emit_response",
                "description": "Emite la respuesta estructurada conforme al schema.",
                "input_schema": body.json_schema,
            }
        ]
        config.extra = {"force_tool": "emit_response"}

    try:
        response = await adapter.complete(messages, tools=tools_param, config=config)
    except Exception as e:
        logger.exception("completion failed: %s", e)
        # Chain the original error so the traceback keeps the real cause.
        raise HTTPException(
            status_code=502, detail="Model call failed: {}".format(e)
        ) from e

    parsed: dict[str, Any] | None = None
    content_out = response.content or ""
    if body.json_schema and response.tool_calls:
        tool_args = response.tool_calls[0].get("arguments")
        # Some providers deliver tool arguments as a JSON string rather than
        # a dict; decode it so the structured result is not silently dropped.
        if isinstance(tool_args, str):
            try:
                tool_args = _json.loads(tool_args)
            except ValueError:
                tool_args = None
        if isinstance(tool_args, dict):
            parsed = tool_args
            content_out = _json.dumps(parsed, ensure_ascii=False)

    return CompletionResponse(
        content=content_out,
        parsed=parsed,
        usage=response.usage or {},
        model_id=config.model_id,
    )
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# POST /sessions/{id}/messages
|
# POST /sessions/{id}/messages
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -170,6 +256,18 @@ async def send_message(
|
|||||||
if not session:
|
if not session:
|
||||||
raise HTTPException(status_code=404, detail="Session not found")
|
raise HTTPException(status_code=404, detail="Session not found")
|
||||||
|
|
||||||
|
# Resetear sesión en estado ERROR al recibir un mensaje nuevo. Sin esto,
|
||||||
|
# un fallo transitorio (p.ej. 529 overloaded del proxy del modelo) deja
|
||||||
|
# la sesión bloqueada para siempre y los siguientes mensajes del usuario
|
||||||
|
# no se procesan. Limpiamos el current_task fallido también.
|
||||||
|
if session.status == SessionStatus.ERROR:
|
||||||
|
logger.info(
|
||||||
|
"Session %s was in ERROR state, resetting to ACTIVE for new message",
|
||||||
|
session_id,
|
||||||
|
)
|
||||||
|
session.status = SessionStatus.ACTIVE
|
||||||
|
session.current_task = None
|
||||||
|
|
||||||
# Get or create session's MCP manager
|
# Get or create session's MCP manager
|
||||||
registry = _get_mcp_registry()
|
registry = _get_mcp_registry()
|
||||||
mcp_manager = registry.get_for_session(session_id)
|
mcp_manager = registry.get_for_session(session_id)
|
||||||
|
|||||||
Reference in New Issue
Block a user