últimos ajustes

This commit is contained in:
Jordan
2026-04-02 00:28:57 +01:00
parent 0795b28b54
commit bfccb02373
3 changed files with 155 additions and 117 deletions

View File

@@ -17,6 +17,7 @@ from ..config import settings
logger = logging.getLogger(__name__)
from ..models.agent import AgentProfile
from ..models.artifacts import ArtifactSummary
from ..memory.embeddings import EmbeddingService
from ..memory.store import MemoryStore
from ..models.context import (
ContextPackage,
@@ -47,6 +48,7 @@ class ContextEngine:
max_tokens=settings.context_max_tokens
)
self.memory = memory_store
self._embed_service: EmbeddingService | None = None
# Debug history: last N context builds per session
self._history: dict[str, list[dict[str, Any]]] = defaultdict(list)
self._max_history = 20
@@ -263,21 +265,15 @@ class ContextEngine:
async def _build_knowledge_base(
self, session: SessionState
) -> ContextSection | None:
"""Load relevant knowledge documents from the memory store.
"""Load relevant knowledge documents via semantic search.
Uses keyword matching against the task objective and step
description to select only the most relevant docs.
Max budget: ~15k tokens for knowledge.
Uses embeddings to find the most relevant docs for the current
task. Always includes a title index of ALL docs so the agent
knows what exists and can request more.
"""
if not self.memory:
return None
# Build search terms from current context
search_terms = self._extract_search_terms(session)
if not search_terms:
# No task → load summaries of all docs (lightweight)
return await self._build_knowledge_summaries_only()
all_docs: list[MemoryDocument] = []
all_docs.extend(await self.memory.list_documents(
namespace="knowledge",
@@ -291,52 +287,57 @@ class ContextEngine:
if not all_docs:
return None
# Score each doc by relevance
scored = self._score_docs(all_docs, search_terms)
doc_map = {d.memory_id: d for d in all_docs}
# Select top docs within token budget
# Rank docs by semantic similarity
query = self._build_search_query(session)
ranked_ids: list[str] = []
if query:
ranked_ids = await self._semantic_rank(query)
if not ranked_ids:
# No embeddings or no task — sort by size (smallest first)
ranked_ids = [
d.memory_id
for d in sorted(all_docs, key=lambda d: len(d.content))
]
# Fill token budget with top-ranked docs
max_kb_tokens = 15_000
selected: list[tuple[MemoryDocument, int]] = []
token_budget = max_kb_tokens
full_docs: list[MemoryDocument] = []
for doc, score in scored:
if score == 0:
for doc_id in ranked_ids:
doc = doc_map.get(doc_id)
if not doc:
continue
doc_tokens = estimate_tokens(doc.content)
if doc_tokens > token_budget:
# Include summary instead of full content
summary_tokens = estimate_tokens(doc.summary or doc.title)
if summary_tokens < token_budget:
selected.append((doc, -1)) # -1 = summary only
token_budget -= summary_tokens
continue
selected.append((doc, score))
token_budget -= doc_tokens
if doc_tokens <= token_budget:
full_docs.append(doc)
token_budget -= doc_tokens
if not selected:
return await self._build_knowledge_summaries_only()
# Build section
full_docs = [(d, s) for d, s in selected if s > 0]
summary_docs = [(d, s) for d, s in selected if s == -1]
# Build section — ALWAYS include title index of ALL docs
included_ids = {d.memory_id for d in full_docs}
not_included = [d for d in all_docs if d.memory_id not in included_ids]
lines = [
"# Knowledge Base",
f"_{len(full_docs)} relevant doc(s) loaded, "
f"{len(summary_docs)} summarized, "
f"{len(all_docs) - len(selected)} filtered out_",
f"_{len(full_docs)} doc(s) loaded in full, "
f"{len(not_included)} available on request_",
"",
]
for doc, _ in full_docs:
for doc in full_docs:
lines.append(f"## {doc.title}")
lines.append(doc.content)
lines.append("")
if summary_docs:
lines.append("## Other Available Docs (summaries)")
for doc, _ in summary_docs:
lines.append(f"- **{doc.title}**: {doc.summary[:200]}")
if not_included:
lines.append("## Other Available Docs")
lines.append("_Ask for any of these if you need the full content:_")
for doc in not_included:
lines.append(f"- **{doc.title}** ({doc.memory_id}): {doc.summary[:150]}")
lines.append("")
content = "\n".join(lines)
@@ -347,80 +348,36 @@ class ContextEngine:
token_estimate=estimate_tokens(content),
)
async def _build_knowledge_summaries_only(self) -> ContextSection | None:
    """Build a lightweight knowledge section: titles + truncated summaries.

    Fallback used when there is no task context to rank documents by;
    keeps the section cheap by never including full document content.

    Returns:
        A KNOWLEDGE_BASE ContextSection, or None when there is no
        memory store or no knowledge documents exist.
    """
    if not self.memory:
        return None
    docs = await self.memory.list_documents(
        namespace="knowledge", memory_type=MemoryType.DOCUMENT
    )
    if not docs:
        return None
    # One bullet per doc; summaries capped at 150 chars to bound tokens.
    bullet_lines = ["# Knowledge Base (summaries)", ""]
    bullet_lines.extend(
        f"- **{doc.title}**: {doc.summary[:150]}" for doc in docs
    )
    body = "\n".join(bullet_lines)
    return ContextSection(
        section_type=ContextSectionType.KNOWLEDGE_BASE,
        content=body,
        priority=60,
        token_estimate=estimate_tokens(body),
    )
async def _semantic_rank(self, query: str) -> list[str]:
"""Rank knowledge docs by cosine similarity to the query."""
try:
if not self._embed_service:
self._embed_service = EmbeddingService()
def _extract_search_terms(self, session: SessionState) -> set[str]:
"""Extract keywords from the current task for doc matching."""
terms: set[str] = set()
if not session.current_task:
return terms
query_embedding = await self._embed_service.embed(query)
results = await self.memory.search_by_similarity(
query_embedding=query_embedding,
namespace="knowledge",
top_k=50,
)
return [doc_id for doc_id, _score in results]
text = session.current_task.objective.lower()
step = session.current_task.current_step()
if step:
text += " " + step.description.lower()
# Split into words, filter short/common ones
stopwords = {
"de", "la", "el", "en", "un", "una", "los", "las", "del", "al",
"por", "para", "con", "que", "como", "cómo", "qué", "es", "son",
"se", "su", "más", "ya", "si", "no", "este", "esta", "esto",
"the", "a", "an", "is", "are", "and", "or", "to", "in", "of",
"for", "on", "with", "how", "what", "do", "does", "can",
}
for word in text.split():
word = word.strip(".,;:!?¿¡()[]{}\"'`")
if len(word) >= 3 and word not in stopwords:
terms.add(word)
return terms
except Exception as e:
logger.warning("Semantic search failed: %s — loading all docs", e)
return []
@staticmethod
def _score_docs(
docs: list[MemoryDocument], terms: set[str]
) -> list[tuple[MemoryDocument, int]]:
"""Score docs by keyword match against title, tags, and content."""
scored: list[tuple[MemoryDocument, int]] = []
for doc in docs:
score = 0
title_lower = doc.title.lower()
tags_lower = " ".join(doc.tags).lower()
content_lower = doc.content[:2000].lower()
for term in terms:
# Title match = high weight
if term in title_lower:
score += 10
# Tag match = medium weight
if term in tags_lower:
score += 5
# Content match = low weight
if term in content_lower:
score += 1
scored.append((doc, score))
scored.sort(key=lambda x: x[1], reverse=True)
return scored
def _build_search_query(session: SessionState) -> str:
"""Build a natural language query from the current task."""
if not session.current_task:
return ""
parts = [session.current_task.objective]
step = session.current_task.current_step()
if step:
parts.append(step.description)
if session.current_task.facts_extracted:
parts.extend(session.current_task.facts_extracted[-5:])
return " ".join(parts)
def _build_task_state(self, task: TaskState) -> ContextSection:
lines = [