últimos ajustes

2026-04-02 00:28:57 +01:00
parent 0795b28b54
commit bfccb02373
3 changed files with 155 additions and 117 deletions
--- a/src/api/routes.py
+++ b/src/api/routes.py
@@ -315,27 +315,29 @@ class LoadKnowledgeRequest(BaseModel):

@router.post("/knowledge/load")
 async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
-    """Load markdown docs from a directory into the knowledge base."""
+    """Load markdown docs from a directory into the knowledge base.
+
+    Generates embeddings for semantic search via OpenAI text-embedding-3-small.
+    """
    memory = _deps.get("memory_store")
    if not memory:
        raise HTTPException(status_code=501, detail="Memory store not available")

    docs_dir = pathlib.Path(body.docs_path)
    if not docs_dir.is_absolute():
-        # Resolve relative to project root
        docs_dir = pathlib.Path(__file__).resolve().parent.parent.parent / body.docs_path

    if not docs_dir.is_dir():
        raise HTTPException(status_code=400, detail=f"Directory not found: {docs_dir}")

-    loaded = []
+    # Read all docs
+    docs_data: list[tuple[str, str, str, str, list[str]]] = []  # (id, title, content, summary, tags)
    for md_file in sorted(docs_dir.glob("*.md")):
        content = md_file.read_text(encoding="utf-8")
        doc_id = md_file.stem
-
-        # Build a summary from the first ~500 chars
        lines = content.strip().splitlines()
        title = lines[0].lstrip("#").strip() if lines else doc_id
+
        summary_lines = []
        for line in lines[:30]:
            line = line.strip()
@@ -345,12 +347,30 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
                break
        summary = " ".join(summary_lines)[:500]

-        # Extract tags from headings
        tags = []
        for line in lines:
            if line.startswith("## "):
                tags.append(line.lstrip("#").strip().lower()[:30])

+        docs_data.append((doc_id, title, content, summary, tags[:10]))
+
+    # Generate embeddings in batch
+    from ..memory.embeddings import EmbeddingService
+    embed_service = EmbeddingService()
+    embed_texts = [f"{title}\n{summary}\n{content[:2000]}" for _, title, content, summary, _ in docs_data]
+
+    try:
+        embeddings = await embed_service.embed_batch(embed_texts)
+        has_embeddings = True
+        logger.info("Generated %d embeddings for knowledge base", len(embeddings))
+    except Exception as e:
+        logger.warning("Failed to generate embeddings: %s — loading without semantic search", e)
+        embeddings = [None] * len(docs_data)
+        has_embeddings = False
+
+    # Store docs + embeddings
+    loaded = []
+    for i, (doc_id, title, content, summary, tags) in enumerate(docs_data):
        doc = MemoryDocument(
            memory_id=doc_id,
            memory_type=MemoryType.DOCUMENT,
@@ -358,20 +378,26 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
            title=title,
            content=content,
            summary=summary,
-            tags=tags[:10],
+            tags=tags,
        )
        await memory.store_document(doc)
+
+        if embeddings[i] is not None:
+            await memory.store_embedding(doc_id, embeddings[i], namespace="knowledge")
+
        loaded.append({
            "id": doc_id,
            "title": title,
            "chars": len(content),
            "tags": tags[:5],
+            "embedded": embeddings[i] is not None,
        })

-    logger.info("Loaded %d knowledge documents from %s", len(loaded), docs_dir)
+    logger.info("Loaded %d knowledge documents from %s (embeddings: %s)", len(loaded), docs_dir, has_embeddings)
    return {
        "status": "loaded",
        "count": len(loaded),
+        "embeddings": has_embeddings,
        "documents": loaded,
    }