últimos ajustes

This commit is contained in:
Jordan
2026-04-02 00:28:57 +01:00
parent 0795b28b54
commit bfccb02373
3 changed files with 155 additions and 117 deletions

View File

@@ -315,27 +315,29 @@ class LoadKnowledgeRequest(BaseModel):
@router.post("/knowledge/load")
async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
"""Load markdown docs from a directory into the knowledge base."""
"""Load markdown docs from a directory into the knowledge base.
Generates embeddings for semantic search via OpenAI text-embedding-3-small.
"""
memory = _deps.get("memory_store")
if not memory:
raise HTTPException(status_code=501, detail="Memory store not available")
docs_dir = pathlib.Path(body.docs_path)
if not docs_dir.is_absolute():
# Resolve relative to project root
docs_dir = pathlib.Path(__file__).resolve().parent.parent.parent / body.docs_path
if not docs_dir.is_dir():
raise HTTPException(status_code=400, detail=f"Directory not found: {docs_dir}")
loaded = []
# Read all docs
docs_data: list[tuple[str, str, str, str, list[str]]] = [] # (id, title, content, summary, tags)
for md_file in sorted(docs_dir.glob("*.md")):
content = md_file.read_text(encoding="utf-8")
doc_id = md_file.stem
# Build a summary from the first ~500 chars
lines = content.strip().splitlines()
title = lines[0].lstrip("#").strip() if lines else doc_id
summary_lines = []
for line in lines[:30]:
line = line.strip()
@@ -345,12 +347,30 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
break
summary = " ".join(summary_lines)[:500]
# Extract tags from headings
tags = []
for line in lines:
if line.startswith("## "):
tags.append(line.lstrip("#").strip().lower()[:30])
docs_data.append((doc_id, title, content, summary, tags[:10]))
# Generate embeddings in batch
from ..memory.embeddings import EmbeddingService
embed_service = EmbeddingService()
embed_texts = [f"{title}\n{summary}\n{content[:2000]}" for _, title, content, summary, _ in docs_data]
try:
embeddings = await embed_service.embed_batch(embed_texts)
has_embeddings = True
logger.info("Generated %d embeddings for knowledge base", len(embeddings))
except Exception as e:
logger.warning("Failed to generate embeddings: %s — loading without semantic search", e)
embeddings = [None] * len(docs_data)
has_embeddings = False
# Store docs + embeddings
loaded = []
for i, (doc_id, title, content, summary, tags) in enumerate(docs_data):
doc = MemoryDocument(
memory_id=doc_id,
memory_type=MemoryType.DOCUMENT,
@@ -358,20 +378,26 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
title=title,
content=content,
summary=summary,
tags=tags[:10],
tags=tags,
)
await memory.store_document(doc)
if embeddings[i] is not None:
await memory.store_embedding(doc_id, embeddings[i], namespace="knowledge")
loaded.append({
"id": doc_id,
"title": title,
"chars": len(content),
"tags": tags[:5],
"embedded": embeddings[i] is not None,
})
logger.info("Loaded %d knowledge documents from %s", len(loaded), docs_dir)
logger.info("Loaded %d knowledge documents from %s (embeddings: %s)", len(loaded), docs_dir, has_embeddings)
return {
"status": "loaded",
"count": len(loaded),
"embeddings": has_embeddings,
"documents": loaded,
}