From 9277862e56a7acbcc501f221e7d85ef8cd917e2d Mon Sep 17 00:00:00 2001 From: Jordan Diaz Date: Thu, 11 Jun 2026 17:23:53 +0000 Subject: [PATCH] read_doc: resolver docs por ACAI_PROJECT_DIR + knowledge load idempotente MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mcp-server _docsReader.js: resolveDocsDir → ACAI_DOCS_DIR / $ACAI_PROJECT_DIR/docs / /app/docs. Arregla DOC_NOT_FOUND en VSCode (HTTP MCP) y local; el .mcp.json ya inyecta ACAI_PROJECT_DIR - routes.py: /knowledge/load idempotente — salta embeddings si el hash de contenido no cambió (clave Redis kbhash), para dispararlo libremente desde el botón de scaffold sin re-embeber Co-Authored-By: Claude Opus 4.8 (1M context) --- mcp-server/tools/docs/_docsReader.js | 35 ++++++++++--- src/api/routes.py | 77 ++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 24 deletions(-) diff --git a/mcp-server/tools/docs/_docsReader.js b/mcp-server/tools/docs/_docsReader.js index 3433ca2..9a93b51 100644 --- a/mcp-server/tools/docs/_docsReader.js +++ b/mcp-server/tools/docs/_docsReader.js @@ -1,20 +1,41 @@ import fs from "node:fs/promises"; +import { existsSync } from "node:fs"; import path from "node:path"; /** * Lectura directa de los markdown del knowledge base desde el filesystem. * - * El MCP server corre dentro del container `agentic` junto al FastAPI, asi - * que los .md viven en `/app/docs/` (la imagen los copia ahi). - * - * En caso de override por entorno, respeta `ACAI_DOCS_DIR`. En desarrollo - * fuera del container, fallback a paths relativos al cwd. + * Orden de resolucion del directorio de docs: + * 1. `ACAI_DOCS_DIR` — override explicito por entorno (si esta definido y no vacio). + * 2. `$ACAI_PROJECT_DIR/docs` — caso principal: cada proyecto/web tiene su + * propio `docs/`. El `.mcp.json` inyecta `ACAI_PROJECT_DIR` (p.ej. + * `/opt/acai/webs/<web>/`), funciona tanto en local (VSCode) como + * en cloud (agentic). + * 3. 
`/app/docs` — fallback final: container `agentic` donde esta horneada la + * copia canonica de los .md. */ +function dirExists(p) { + try { + return existsSync(p); + } catch { + return false; + } +} + function resolveDocsDir() { + // 1. Override explicito const override = process.env.ACAI_DOCS_DIR; - if (override) return override; - // Container path + if (override && override.trim() !== "") return override; + + // 2. Docs del proyecto/web + const projectDir = process.env.ACAI_PROJECT_DIR; + if (projectDir && projectDir.trim() !== "") { + const projectDocs = path.join(projectDir, "docs"); + if (dirExists(projectDocs)) return projectDocs; + } + + // 3. Fallback al container agentic return "/app/docs"; } diff --git a/src/api/routes.py b/src/api/routes.py index de6f29d..6e759b0 100644 --- a/src/api/routes.py +++ b/src/api/routes.py @@ -786,25 +786,62 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]: docs_data.append((doc_id, title, content, summary, tags, priority, load_when)) - # Generate embeddings in batch (solo si hay credencial de embeddings; sin - # ella la llamada daria 401 — se omite limpiamente). + # Hash de contenido por doc — base del skip idempotente de embeddings. + import hashlib + + def _embed_text(title, summary, content): + return f"{title}\n{summary}\n{content[:2000]}" + + def _doc_hash(title, summary, content): + return hashlib.md5(_embed_text(title, summary, content).encode("utf-8")).hexdigest() + + new_hashes = [_doc_hash(t, s, c) for _, t, c, s, _, _, _ in docs_data] + + # Generate embeddings SOLO para docs nuevos o cuyo contenido cambió (skip + # idempotente): si el hash coincide con el guardado y ya existe el embedding + # en Redis, se reutiliza y NO se vuelve a llamar a la API. Esto permite que + # /knowledge/load se dispare libremente (botón de scaffold, etc.) sin re-embeber. 
embeddings: list[Any] = [None] * len(docs_data) + already_embedded = [False] * len(docs_data) has_embeddings = False if settings.embeddings_enabled: - from ..memory.embeddings import EmbeddingService - embed_service = EmbeddingService() - embed_texts = [ - f"{title}\n{summary}\n{content[:2000]}" - for _, title, content, summary, _, _, _ in docs_data - ] - try: - embeddings = await embed_service.embed_batch(embed_texts) + to_embed = [] # indices que hay que (re)embeber + for i, (doc_id, title, content, summary, _, _, _) in enumerate(docs_data): + try: + prev = await memory._r.get(memory._key("kbhash", "knowledge", doc_id)) + if isinstance(prev, bytes): + prev = prev.decode("utf-8") + has_embed = await memory._r.exists(memory._key("embeddings", "knowledge", doc_id)) + except Exception: + prev, has_embed = None, 0 + if prev == new_hashes[i] and has_embed: + already_embedded[i] = True # sin cambios → reutiliza el embedding existente + else: + to_embed.append(i) + + if to_embed: + from ..memory.embeddings import EmbeddingService + embed_service = EmbeddingService() + embed_texts = [ + _embed_text(docs_data[i][1], docs_data[i][3], docs_data[i][2]) + for i in to_embed + ] + try: + fresh = await embed_service.embed_batch(embed_texts) + for j, i in enumerate(to_embed): + embeddings[i] = fresh[j] + has_embeddings = True + logger.info( + "Generated %d embeddings (%d sin cambios, omitidos)", + len(to_embed), len(docs_data) - len(to_embed), + ) + except Exception as e: + logger.warning("Failed to generate embeddings: %s — loading without semantic search", e) + embeddings = [None] * len(docs_data) + has_embeddings = False + else: has_embeddings = True - logger.info("Generated %d embeddings for knowledge base", len(embeddings)) - except Exception as e: - logger.warning("Failed to generate embeddings: %s — loading without semantic search", e) - embeddings = [None] * len(docs_data) - has_embeddings = False + logger.info("Knowledge sin cambios — no se regeneraron embeddings (%d 
docs)", len(docs_data)) else: logger.info("Embeddings disabled (no AGENTIC_EMBEDDINGS_API_KEY) — KB loaded without semantic search") @@ -817,9 +854,10 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]: for existing in existing_docs: if existing.memory_id not in current_ids: await memory.delete_document(existing.memory_id, namespace="knowledge") - # Borra también el embedding asociado + # Borra también el embedding asociado y el hash de contenido embed_key = memory._key("embeddings", "knowledge", existing.memory_id) await memory._r.delete(embed_key) + await memory._r.delete(memory._key("kbhash", "knowledge", existing.memory_id)) removed.append(existing.memory_id) if removed: logger.info("Removed %d stale knowledge docs: %s", len(removed), removed) @@ -842,6 +880,11 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]: if embeddings[i] is not None: await memory.store_embedding(doc_id, embeddings[i], namespace="knowledge") + # Guarda el hash de contenido para el skip idempotente del próximo load + try: + await memory._r.set(memory._key("kbhash", "knowledge", doc_id), new_hashes[i]) + except Exception: + pass loaded.append({ "id": doc_id, @@ -850,7 +893,7 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]: "tags": tags[:5], "priority": priority, "load_when": load_when, - "embedded": embeddings[i] is not None, + "embedded": embeddings[i] is not None or already_embedded[i], }) logger.info("Loaded %d knowledge documents from %s (embeddings: %s)", len(loaded), docs_dir, has_embeddings)