read_doc: resolver docs por ACAI_PROJECT_DIR + knowledge load idempotente
- mcp-server _docsReader.js: resolveDocsDir → ACAI_DOCS_DIR / $ACAI_PROJECT_DIR/docs / /app/docs. Arregla DOC_NOT_FOUND en VSCode (HTTP MCP) y local; el .mcp.json ya inyecta ACAI_PROJECT_DIR.
- routes.py: /knowledge/load idempotente — salta embeddings si el hash de contenido no cambió (clave Redis kbhash), para dispararlo libremente desde el botón de scaffold sin re-embeber.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,20 +1,41 @@
|
||||
import fs from "node:fs/promises";
import { existsSync, statSync } from "node:fs";
import path from "node:path";
|
||||
|
||||
/**
|
||||
* Lectura directa de los markdown del knowledge base desde el filesystem.
|
||||
*
|
||||
* El MCP server corre dentro del container `agentic` junto al FastAPI, asi
|
||||
* que los .md viven en `/app/docs/` (la imagen los copia ahi).
|
||||
*
|
||||
* En caso de override por entorno, respeta `ACAI_DOCS_DIR` (ver el orden de
* resolucion mas abajo); no hay fallback a paths relativos al cwd.
|
||||
* Orden de resolucion del directorio de docs:
|
||||
* 1. `ACAI_DOCS_DIR` — override explicito por entorno (si esta definido y no vacio).
|
||||
* 2. `<ACAI_PROJECT_DIR>/docs` — caso principal: cada proyecto/web tiene su
|
||||
* propio `docs/`. El `.mcp.json` inyecta `ACAI_PROJECT_DIR` (p.ej.
|
||||
* `/opt/acai/webs/<user>/<site>`), funciona tanto en local (VSCode) como
|
||||
* en cloud (agentic).
|
||||
* 3. `/app/docs` — fallback final: container `agentic` donde esta horneada la
|
||||
* copia canonica de los .md.
|
||||
*/
|
||||
|
||||
/**
 * Reports whether `p` exists and is a directory.
 *
 * `existsSync` alone also returns true for regular files, which would let a
 * plain file named `docs` be picked as the docs directory; `statSync` lets us
 * check the entry type as well.
 *
 * @param {string} p - Filesystem path to probe.
 * @returns {boolean} `true` only when `p` resolves to a directory; `false`
 *   when it is missing, is not a directory, or cannot be stat'ed.
 */
function dirExists(p) {
  try {
    return statSync(p).isDirectory();
  } catch {
    // Missing path, permission error or invalid argument: treat as "not there"
    // so the caller can fall through to its next candidate.
    return false;
  }
}
|
||||
|
||||
/**
 * Picks the directory the knowledge-base markdown files are read from.
 *
 * Resolution order:
 *   1. `ACAI_DOCS_DIR` when it is set and not blank (returned as-is, without
 *      checking that it exists).
 *   2. `<ACAI_PROJECT_DIR>/docs` when `ACAI_PROJECT_DIR` is set, not blank, and
 *      that `docs` path passes `dirExists`.
 *   3. `/app/docs` as the final fallback.
 *
 * @returns {string} The resolved docs directory path.
 */
function resolveDocsDir() {
  // 1. Explicit override. A whitespace-only value is treated as unset so it
  //    cannot shadow the project/container fallbacks.
  const override = process.env.ACAI_DOCS_DIR;
  if (override && override.trim() !== "") return override;

  // 2. Docs folder of the current project/site.
  const projectDir = process.env.ACAI_PROJECT_DIR;
  if (projectDir && projectDir.trim() !== "") {
    const projectDocs = path.join(projectDir, "docs");
    if (dirExists(projectDocs)) return projectDocs;
  }

  // 3. Final fallback: the container path.
  return "/app/docs";
}
|
||||
|
||||
|
||||
@@ -786,25 +786,62 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]:
|
||||
|
||||
docs_data.append((doc_id, title, content, summary, tags, priority, load_when))
|
||||
|
||||
# Generate embeddings in batch (solo si hay credencial de embeddings; sin
|
||||
# ella la llamada daria 401 — se omite limpiamente).
|
||||
# Hash de contenido por doc — base del skip idempotente de embeddings.
|
||||
import hashlib
|
||||
|
||||
def _embed_text(title, summary, content):
|
||||
return f"{title}\n{summary}\n{content[:2000]}"
|
||||
|
||||
def _doc_hash(title, summary, content):
    """Return the MD5 hex digest of the doc's embedding text (UTF-8 encoded).

    The digest is a change-detection fingerprint: it is compared with the
    previously stored hash to decide whether the doc must be re-embedded.
    """
    return hashlib.md5(_embed_text(title, summary, content).encode("utf-8")).hexdigest()
|
||||
|
||||
new_hashes = [_doc_hash(t, s, c) for _, t, c, s, _, _, _ in docs_data]
|
||||
|
||||
# Generate embeddings SOLO para docs nuevos o cuyo contenido cambió (skip
|
||||
# idempotente): si el hash coincide con el guardado y ya existe el embedding
|
||||
# en Redis, se reutiliza y NO se vuelve a llamar a la API. Esto permite que
|
||||
# /knowledge/load se dispare libremente (botón de scaffold, etc.) sin re-embeber.
|
||||
embeddings: list[Any] = [None] * len(docs_data)
|
||||
already_embedded = [False] * len(docs_data)
|
||||
has_embeddings = False
|
||||
if settings.embeddings_enabled:
|
||||
to_embed = [] # indices que hay que (re)embeber
|
||||
for i, (doc_id, title, content, summary, _, _, _) in enumerate(docs_data):
|
||||
try:
|
||||
prev = await memory._r.get(memory._key("kbhash", "knowledge", doc_id))
|
||||
if isinstance(prev, bytes):
|
||||
prev = prev.decode("utf-8")
|
||||
has_embed = await memory._r.exists(memory._key("embeddings", "knowledge", doc_id))
|
||||
except Exception:
|
||||
prev, has_embed = None, 0
|
||||
if prev == new_hashes[i] and has_embed:
|
||||
already_embedded[i] = True # sin cambios → reutiliza el embedding existente
|
||||
else:
|
||||
to_embed.append(i)
|
||||
|
||||
if to_embed:
|
||||
from ..memory.embeddings import EmbeddingService
|
||||
embed_service = EmbeddingService()
|
||||
embed_texts = [
|
||||
f"{title}\n{summary}\n{content[:2000]}"
|
||||
for _, title, content, summary, _, _, _ in docs_data
|
||||
_embed_text(docs_data[i][1], docs_data[i][3], docs_data[i][2])
|
||||
for i in to_embed
|
||||
]
|
||||
try:
|
||||
embeddings = await embed_service.embed_batch(embed_texts)
|
||||
fresh = await embed_service.embed_batch(embed_texts)
|
||||
for j, i in enumerate(to_embed):
|
||||
embeddings[i] = fresh[j]
|
||||
has_embeddings = True
|
||||
logger.info("Generated %d embeddings for knowledge base", len(embeddings))
|
||||
logger.info(
|
||||
"Generated %d embeddings (%d sin cambios, omitidos)",
|
||||
len(to_embed), len(docs_data) - len(to_embed),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to generate embeddings: %s — loading without semantic search", e)
|
||||
embeddings = [None] * len(docs_data)
|
||||
has_embeddings = False
|
||||
else:
|
||||
has_embeddings = True
|
||||
logger.info("Knowledge sin cambios — no se regeneraron embeddings (%d docs)", len(docs_data))
|
||||
else:
|
||||
logger.info("Embeddings disabled (no AGENTIC_EMBEDDINGS_API_KEY) — KB loaded without semantic search")
|
||||
|
||||
@@ -817,9 +854,10 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]:
|
||||
for existing in existing_docs:
|
||||
if existing.memory_id not in current_ids:
|
||||
await memory.delete_document(existing.memory_id, namespace="knowledge")
|
||||
# Borra también el embedding asociado
|
||||
# Borra también el embedding asociado y el hash de contenido
|
||||
embed_key = memory._key("embeddings", "knowledge", existing.memory_id)
|
||||
await memory._r.delete(embed_key)
|
||||
await memory._r.delete(memory._key("kbhash", "knowledge", existing.memory_id))
|
||||
removed.append(existing.memory_id)
|
||||
if removed:
|
||||
logger.info("Removed %d stale knowledge docs: %s", len(removed), removed)
|
||||
@@ -842,6 +880,11 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]:
|
||||
|
||||
if embeddings[i] is not None:
|
||||
await memory.store_embedding(doc_id, embeddings[i], namespace="knowledge")
|
||||
# Guarda el hash de contenido para el skip idempotente del próximo load
|
||||
try:
|
||||
await memory._r.set(memory._key("kbhash", "knowledge", doc_id), new_hashes[i])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
loaded.append({
|
||||
"id": doc_id,
|
||||
@@ -850,7 +893,7 @@ async def _load_knowledge_from_dir(docs_path: str = "docs") -> dict[str, Any]:
|
||||
"tags": tags[:5],
|
||||
"priority": priority,
|
||||
"load_when": load_when,
|
||||
"embedded": embeddings[i] is not None,
|
||||
"embedded": embeddings[i] is not None or already_embedded[i],
|
||||
})
|
||||
|
||||
logger.info("Loaded %d knowledge documents from %s (embeddings: %s)", len(loaded), docs_dir, has_embeddings)
|
||||
|
||||
Reference in New Issue
Block a user