Files
agenticSystem/src/mcp/client.py
Jordan Diaz 96b4542918 fix(mcp): el read loop ya no muere con respuestas grandes (screenshots)
Sintoma: "el agente se para cuando hace acciones". MCPClient._read_loop lee las
respuestas JSON-RPC con stdout.readline(), cuyo StreamReader tenia buffer de 1MB.
Una respuesta llega en UNA linea; playwright__browser_take_screenshot({fullPage:
true}) devuelve la imagen en base64 en esa linea y supera el limite →
asyncio.LimitOverrunError → el except Exception mataba el read loop y dejaba la
sesion MCP inservible (los turnos siguientes ejecutaban 0 tools).

Fix en dos capas:
- MCP_STREAM_LIMIT=64MB en create_subprocess_exec(limit=...) — cubre cualquier
  screenshot real.
- Read loop tolerante: captura (ValueError, LimitOverrunError), descarta solo esa
  respuesta re-sincronizando el stream hasta el \n (_drain_until_newline) y sigue
  vivo, en vez de matar toda la sesion MCP.

Validado: navegar + screenshot fullPage + glob ejecuta las 4 tools sin "read loop
error" y sin colapsar el contexto.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-05 17:38:19 +00:00

335 lines
12 KiB
Python

"""MCP (Model Context Protocol) client — stdio transport.
Manages subprocess lifecycle, JSON-RPC request/response, timeouts,
and a tool registry populated from the server's capabilities.
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import uuid
from typing import Any
from ..config import settings
from ..models.tools import ToolDefinition
logger = logging.getLogger(__name__)
# Maximum StreamReader buffer size (bytes) used when reading the MCP server's
# JSON-RPC responses from stdout. Each response arrives on ONE single line;
# tools such as Playwright's fullPage screenshot return the image as base64 on
# that line and far exceed asyncio's 64KB default (and the 1MB we used before),
# raising LimitOverrunError, which killed the read loop and left the MCP
# session unusable (the agent "stopped" when performing actions). 64MB covers
# any real screenshot; above that, the read loop discards that response and
# stays alive.
MCP_STREAM_LIMIT = 64 * 1024 * 1024
class MCPClientError(Exception):
    """Error raised by the MCP client.

    Used for startup and tool-call timeouts, calls made while the client is
    not running, unknown tool names, an unavailable stdin pipe, and JSON-RPC
    error responses returned by the server.
    """
class MCPClient:
    """Stdio-based MCP client with full lifecycle management.

    Spawns the MCP server as a subprocess, exchanges newline-delimited
    JSON-RPC 2.0 messages over its stdin/stdout, matches responses to
    requests through per-request futures, and keeps a registry of the tools
    the server advertises through ``tools/list``.
    """

    def __init__(
        self,
        command: str | None = None,
        args: list[str] | None = None,
        timeout: float | None = None,
        startup_timeout: float | None = None,
        env: dict[str, str] | None = None,
        name: str = "mcp",
    ) -> None:
        """Store the configuration; no subprocess is started here.

        Args:
            command: Executable to launch. Falls back to
                ``settings.mcp_server_command`` when falsy.
            args: Command-line arguments. Falls back to a copy of
                ``settings.mcp_server_args`` when ``None``.
            timeout: Per tool-call timeout in seconds. Falls back to
                ``settings.mcp_timeout_seconds`` when falsy (including 0).
            startup_timeout: Timeout in seconds for each startup request.
                Falls back to ``settings.mcp_startup_timeout_seconds`` when
                falsy (including 0).
            env: Extra environment variables layered over ``os.environ``.
            name: Label used in log messages and as the ``server_name`` of
                registered tools.
        """
        self.name = name
        self._command = command or settings.mcp_server_command
        self._args = args if args is not None else list(settings.mcp_server_args)
        self._timeout = timeout or settings.mcp_timeout_seconds
        self._startup_timeout = startup_timeout or settings.mcp_startup_timeout_seconds
        # Inherit current env + any overrides (passes ACAI_* vars to MCP server)
        self._env = {**os.environ, **(env or {})}
        self._process: asyncio.subprocess.Process | None = None
        self._tools: dict[str, ToolDefinition] = {}
        # Request id -> future resolved by the read loop when the response arrives.
        self._pending: dict[str, asyncio.Future[dict[str, Any]]] = {}
        self._reader_task: asyncio.Task[None] | None = None
        self._stderr_task: asyncio.Task[None] | None = None
        self._running = False

    @property
    def tools(self) -> dict[str, ToolDefinition]:
        """Return a shallow copy of the tool registry, keyed by tool name."""
        return dict(self._tools)

    @property
    def is_running(self) -> bool:
        """Return True while the read loop is alive and a process is attached."""
        return self._running and self._process is not None

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    async def start(self) -> None:
        """Start the MCP server subprocess and discover tools.

        Sends ``initialize``, then the ``notifications/initialized``
        notification, then ``tools/list``. Does nothing (beyond a warning)
        when no command is configured.

        Raises:
            MCPClientError: If a startup request times out, the server
                answers with a JSON-RPC error, or the server's stdout closes
                during startup.
        """
        if not self._command:
            logger.warning("No MCP server command configured — skipping start")
            return
        logger.info("Starting MCP server [%s]: %s %s", self.name, self._command, self._args)
        self._process = await asyncio.create_subprocess_exec(
            self._command,
            *self._args,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=self._env,
            limit=MCP_STREAM_LIMIT,  # large buffer for MCP responses (base64 screenshots)
        )
        self._running = True
        self._reader_task = asyncio.create_task(self._read_loop())
        # stderr is a pipe too: it must be consumed or it only ever fills up.
        self._stderr_task = asyncio.create_task(self._drain_stderr())
        try:
            # Initialize
            init_result = await asyncio.wait_for(
                self._send_request("initialize", {
                    "protocolVersion": "2024-11-05",
                    "capabilities": {},
                    "clientInfo": {"name": "agentic-microservice", "version": "1.0.0"},
                }),
                timeout=self._startup_timeout,
            )
            logger.info("MCP initialized: %s", init_result)
            # Send initialized notification
            await self._send_notification("notifications/initialized", {})
            # Discover tools
            tools_result = await asyncio.wait_for(
                self._send_request("tools/list", {}),
                timeout=self._startup_timeout,
            )
            self._register_tools(tools_result)
            logger.info("Discovered %d MCP tools", len(self._tools))
        except asyncio.TimeoutError as exc:
            logger.error("MCP server startup timed out")
            await self.stop()
            raise MCPClientError("MCP server startup timed out") from exc
        except Exception:
            # Any other startup failure (server error response, closed pipe,
            # malformed tools list) must not leave the subprocess and the
            # background tasks running.
            await self.stop()
            raise

    async def stop(self) -> None:
        """Gracefully stop the MCP server.

        Cancels the background tasks, closes stdin, terminates the process
        (killing it if it has not exited after 5 seconds), cancels every
        pending request and clears the tool registry. Safe to call when the
        process has already exited or was never started.
        """
        self._running = False
        tasks = [t for t in (self._reader_task, self._stderr_task) if t is not None]
        self._reader_task = None
        self._stderr_task = None
        for task in tasks:
            task.cancel()
        for task in tasks:
            try:
                await task
            except asyncio.CancelledError:
                pass

        process, self._process = self._process, None
        if process is not None:
            if process.stdin:
                process.stdin.close()
            # Only signal a process that is still alive: terminate()/kill()
            # raise ProcessLookupError once the transport is gone.
            if process.returncode is None:
                try:
                    process.terminate()
                    await asyncio.wait_for(process.wait(), timeout=5.0)
                except asyncio.TimeoutError:
                    try:
                        process.kill()
                        # Reap the child so it does not linger as a zombie.
                        await asyncio.wait_for(process.wait(), timeout=5.0)
                    except (asyncio.TimeoutError, ProcessLookupError):
                        pass
                except ProcessLookupError:
                    pass

        # Cancel any pending requests
        for fut in self._pending.values():
            if not fut.done():
                fut.cancel()
        self._pending.clear()
        self._tools.clear()

    # ------------------------------------------------------------------
    # Tool execution
    # ------------------------------------------------------------------
    async def call_tool(
        self, tool_name: str, arguments: dict[str, Any]
    ) -> dict[str, Any]:
        """Call a tool on the MCP server with timeout.

        Args:
            tool_name: Name of a tool present in the registry.
            arguments: Arguments forwarded verbatim in the ``tools/call`` request.

        Returns:
            The ``result`` member of the server's JSON-RPC response.

        Raises:
            MCPClientError: If the client is not running, the tool is
                unknown, the call times out, the server returns a JSON-RPC
                error, or the connection to the server is lost.
        """
        if not self.is_running:
            raise MCPClientError("MCP client is not running")
        if tool_name not in self._tools:
            raise MCPClientError(f"Unknown tool: {tool_name}")
        try:
            return await asyncio.wait_for(
                self._send_request("tools/call", {
                    "name": tool_name,
                    "arguments": arguments,
                }),
                timeout=self._timeout,
            )
        except asyncio.TimeoutError as exc:
            raise MCPClientError(
                f"Tool '{tool_name}' timed out after {self._timeout}s"
            ) from exc

    def get_tool_definitions(self) -> list[dict[str, Any]]:
        """Return tool definitions in a format suitable for model adapters.

        Each entry is a dict with ``name``, ``description`` and ``input_schema``.
        """
        return [
            {
                "name": tool.name,
                "description": tool.description,
                "input_schema": tool.input_schema,
            }
            for tool in self._tools.values()
        ]

    # ------------------------------------------------------------------
    # JSON-RPC transport
    # ------------------------------------------------------------------
    async def _send_request(
        self, method: str, params: dict[str, Any]
    ) -> dict[str, Any]:
        """Send a JSON-RPC request and await the response.

        The request is registered in ``_pending`` under a fresh 12-hex-char id
        before it is written, and is always unregistered on exit — including
        when the write itself fails.
        """
        request_id = uuid.uuid4().hex[:12]
        message = {
            "jsonrpc": "2.0",
            "id": request_id,
            "method": method,
            "params": params,
        }
        future: asyncio.Future[dict[str, Any]] = (
            asyncio.get_running_loop().create_future()
        )
        self._pending[request_id] = future
        try:
            await self._write_message(message)
            return await future
        finally:
            self._pending.pop(request_id, None)

    async def _send_notification(
        self, method: str, params: dict[str, Any]
    ) -> None:
        """Send a JSON-RPC notification (no response expected)."""
        message = {
            "jsonrpc": "2.0",
            "method": method,
            "params": params,
        }
        await self._write_message(message)

    async def _write_message(self, message: dict[str, Any]) -> None:
        """Write a JSON-RPC message to the server's stdin as one line.

        Raises:
            MCPClientError: If there is no process or its stdin is unavailable.
        """
        if not self._process or not self._process.stdin:
            raise MCPClientError("MCP process stdin not available")
        data = json.dumps(message) + "\n"
        self._process.stdin.write(data.encode())
        await self._process.stdin.drain()

    async def _read_loop(self) -> None:
        """Continuously read JSON-RPC responses from stdout.

        One line is one message. Blank lines, non-JSON lines and JSON values
        that are not objects are skipped. A line larger than the stream limit
        is discarded and reading continues with the next line. When the loop
        ends for any reason other than cancellation, every pending request is
        failed so callers do not wait for their full timeout.
        """
        if not self._process or not self._process.stdout:
            return
        stdout = self._process.stdout
        failure: str | None = None
        try:
            while self._running:
                try:
                    line = await stdout.readuntil(b"\n")
                except asyncio.IncompleteReadError as exc:
                    # EOF: whatever was buffered without a trailing newline
                    # (empty bytes once the stream is fully consumed).
                    line = exc.partial
                except asyncio.LimitOverrunError:
                    # A JSON-RPC response exceeded the buffer (e.g. a
                    # Playwright fullPage screenshot in base64 above 64MB).
                    # readuntil() leaves the data in the buffer, so exactly
                    # that one response is discarded and later ones are kept.
                    # The request it answered cannot be identified, so it is
                    # left to time out.
                    logger.warning(
                        "MCP [%s]: respuesta supera el buffer (%d MB), se descarta y se continua",
                        self.name, MCP_STREAM_LIMIT // (1024 * 1024),
                    )
                    await self._drain_until_newline(stdout)
                    continue
                if not line:
                    logger.warning("MCP server stdout closed")
                    failure = "MCP server stdout closed"
                    break
                line_str = line.decode(errors="replace").strip()
                if not line_str:
                    continue
                try:
                    message = json.loads(line_str)
                except json.JSONDecodeError:
                    logger.debug("Non-JSON MCP output: %s", line_str[:200])
                    continue
                if not isinstance(message, dict):
                    # Valid JSON but not a JSON-RPC object (e.g. a bare number
                    # printed by the server); it must not kill the loop.
                    logger.debug("Non-object MCP output: %s", line_str[:200])
                    continue
                self._handle_message(message)
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("MCP read loop error")
            failure = "MCP read loop error"
        finally:
            self._running = False
            if failure is not None:
                self._fail_pending(failure)

    async def _drain_until_newline(self, stdout: asyncio.StreamReader) -> None:
        """Discard bytes up to and including the next newline, and no more.

        Used to re-synchronise the stream after a LimitOverrunError. Bytes
        after the newline belong to the following messages and stay in the
        buffer. Returns early if EOF is reached before a newline.
        """
        while True:
            try:
                await stdout.readuntil(b"\n")
            except asyncio.LimitOverrunError as exc:
                # `consumed` is the number of already-buffered bytes that
                # precede the separator (or the whole buffer when no separator
                # has arrived yet); drop them and look again.
                await stdout.readexactly(exc.consumed)
            except asyncio.IncompleteReadError:
                return
            else:
                return

    async def _drain_stderr(self) -> None:
        """Consume the server's stderr and log it at debug level.

        stderr is opened as a pipe, so it has to be read by someone:
        otherwise the output accumulates in memory and, once the buffers are
        full, the server may block while writing diagnostics.
        """
        if not self._process or not self._process.stderr:
            return
        stderr = self._process.stderr
        try:
            while True:
                chunk = await stderr.read(65536)
                if not chunk:
                    return
                logger.debug(
                    "MCP [%s] stderr: %s",
                    self.name, chunk.decode(errors="replace").rstrip()[:2000],
                )
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("MCP stderr drain error")

    def _fail_pending(self, reason: str) -> None:
        """Fail every unresolved pending request with an MCPClientError."""
        for future in self._pending.values():
            if not future.done():
                future.set_exception(MCPClientError(reason))

    def _handle_message(self, message: dict[str, Any]) -> None:
        """Route an incoming JSON-RPC message.

        A message whose ``id`` matches a pending request resolves that
        request's future with ``result`` or fails it with an MCPClientError
        built from ``error``. Messages carrying a ``method`` are treated as
        server notifications and only logged. Anything else is ignored.
        """
        if not isinstance(message, dict):
            return
        msg_id = message.get("id")
        # Request ids are strings generated by this client; checking the type
        # also avoids a TypeError on unhashable ids (lists / objects).
        if isinstance(msg_id, str) and msg_id in self._pending:
            future = self._pending[msg_id]
            if future.done():
                return
            error = message.get("error")
            if error is not None:
                if isinstance(error, dict):
                    code, text = error.get("code"), error.get("message")
                else:
                    code, text = None, error
                future.set_exception(
                    MCPClientError(f"MCP error {code}: {text}")
                )
            else:
                future.set_result(message.get("result", {}))
        elif "method" in message:
            # Server-initiated notification — log it
            logger.debug(
                "MCP notification: %s", message.get("method")
            )

    # ------------------------------------------------------------------
    # Tool registry
    # ------------------------------------------------------------------
    def _register_tools(self, tools_result: dict[str, Any]) -> None:
        """Parse tools/list response and populate the registry.

        Entries that are not objects or have no ``name`` are skipped.
        """
        raw_tools = tools_result.get("tools", [])
        for t in raw_tools:
            if not isinstance(t, dict):
                continue
            name = t.get("name", "")
            if not name:
                continue
            self._tools[name] = ToolDefinition(
                name=name,
                description=t.get("description", ""),
                input_schema=t.get("inputSchema", {}),
                server_name=self.name,
            )