últimos ajustes
This commit is contained in:
@@ -315,27 +315,29 @@ class LoadKnowledgeRequest(BaseModel):
|
||||
|
||||
@router.post("/knowledge/load")
|
||||
async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
|
||||
"""Load markdown docs from a directory into the knowledge base."""
|
||||
"""Load markdown docs from a directory into the knowledge base.
|
||||
|
||||
Generates embeddings for semantic search via OpenAI text-embedding-3-small.
|
||||
"""
|
||||
memory = _deps.get("memory_store")
|
||||
if not memory:
|
||||
raise HTTPException(status_code=501, detail="Memory store not available")
|
||||
|
||||
docs_dir = pathlib.Path(body.docs_path)
|
||||
if not docs_dir.is_absolute():
|
||||
# Resolve relative to project root
|
||||
docs_dir = pathlib.Path(__file__).resolve().parent.parent.parent / body.docs_path
|
||||
|
||||
if not docs_dir.is_dir():
|
||||
raise HTTPException(status_code=400, detail=f"Directory not found: {docs_dir}")
|
||||
|
||||
loaded = []
|
||||
# Read all docs
|
||||
docs_data: list[tuple[str, str, str, str, list[str]]] = [] # (id, title, content, summary, tags)
|
||||
for md_file in sorted(docs_dir.glob("*.md")):
|
||||
content = md_file.read_text(encoding="utf-8")
|
||||
doc_id = md_file.stem
|
||||
|
||||
# Build a summary from the first ~500 chars
|
||||
lines = content.strip().splitlines()
|
||||
title = lines[0].lstrip("#").strip() if lines else doc_id
|
||||
|
||||
summary_lines = []
|
||||
for line in lines[:30]:
|
||||
line = line.strip()
|
||||
@@ -345,12 +347,30 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
|
||||
break
|
||||
summary = " ".join(summary_lines)[:500]
|
||||
|
||||
# Extract tags from headings
|
||||
tags = []
|
||||
for line in lines:
|
||||
if line.startswith("## "):
|
||||
tags.append(line.lstrip("#").strip().lower()[:30])
|
||||
|
||||
docs_data.append((doc_id, title, content, summary, tags[:10]))
|
||||
|
||||
# Generate embeddings in batch
|
||||
from ..memory.embeddings import EmbeddingService
|
||||
embed_service = EmbeddingService()
|
||||
embed_texts = [f"{title}\n{summary}\n{content[:2000]}" for _, title, content, summary, _ in docs_data]
|
||||
|
||||
try:
|
||||
embeddings = await embed_service.embed_batch(embed_texts)
|
||||
has_embeddings = True
|
||||
logger.info("Generated %d embeddings for knowledge base", len(embeddings))
|
||||
except Exception as e:
|
||||
logger.warning("Failed to generate embeddings: %s — loading without semantic search", e)
|
||||
embeddings = [None] * len(docs_data)
|
||||
has_embeddings = False
|
||||
|
||||
# Store docs + embeddings
|
||||
loaded = []
|
||||
for i, (doc_id, title, content, summary, tags) in enumerate(docs_data):
|
||||
doc = MemoryDocument(
|
||||
memory_id=doc_id,
|
||||
memory_type=MemoryType.DOCUMENT,
|
||||
@@ -358,20 +378,26 @@ async def load_knowledge(body: LoadKnowledgeRequest) -> dict[str, Any]:
|
||||
title=title,
|
||||
content=content,
|
||||
summary=summary,
|
||||
tags=tags[:10],
|
||||
tags=tags,
|
||||
)
|
||||
await memory.store_document(doc)
|
||||
|
||||
if embeddings[i] is not None:
|
||||
await memory.store_embedding(doc_id, embeddings[i], namespace="knowledge")
|
||||
|
||||
loaded.append({
|
||||
"id": doc_id,
|
||||
"title": title,
|
||||
"chars": len(content),
|
||||
"tags": tags[:5],
|
||||
"embedded": embeddings[i] is not None,
|
||||
})
|
||||
|
||||
logger.info("Loaded %d knowledge documents from %s", len(loaded), docs_dir)
|
||||
logger.info("Loaded %d knowledge documents from %s (embeddings: %s)", len(loaded), docs_dir, has_embeddings)
|
||||
return {
|
||||
"status": "loaded",
|
||||
"count": len(loaded),
|
||||
"embeddings": has_embeddings,
|
||||
"documents": loaded,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user