# agenticSystem/src/config.py

"""Application configuration via environment variables."""

from __future__ import annotations

from urllib.parse import quote

from pydantic import Field
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings loaded from the environment.

    Values are read from environment variables prefixed with ``AGENTIC_`` and
    from a ``.env`` file; unknown keys are ignored (see ``model_config``).
    Several fields use ``0`` or an empty string to mean "not set, derive it";
    each of those says so in its own comment.
    """

    # --- Service ---
    service_name: str = "agentic-microservice"
    service_version: str = "1.0.0"
    host: str = "0.0.0.0"
    port: int = 8000
    debug: bool = False

    # --- Redis ---
    redis_host: str = "localhost"
    redis_port: int = 6379
    redis_db: int = 0
    redis_password: str = ""  # Empty = connect without credentials
    redis_key_prefix: str = "agentic"
    session_ttl_seconds: int = 86400  # 24h

    @property
    def redis_url(self) -> str:
        """Return the Redis connection URL built from the ``redis_*`` fields.

        The credentials part is omitted when no password is configured. The
        password is percent-encoded so reserved characters (``@``, ``:``,
        ``/``, ``#``, ``?`` ...) cannot corrupt the URL structure.
        """
        # NOTE(review): assumes the URL consumer percent-decodes the userinfo
        # part (redis-py's ``from_url`` does) and that the configured password
        # is the raw secret, not an already-encoded one — confirm.
        auth = f":{quote(self.redis_password, safe='')}@" if self.redis_password else ""
        return f"redis://{auth}{self.redis_host}:{self.redis_port}/{self.redis_db}"

    # --- Model providers ---
    anthropic_api_key: str = ""
    anthropic_base_url: str = ""  # Custom base URL (for MiniMax Anthropic-compatible, etc.)
    openai_api_key: str = ""
    openai_base_url: str = ""  # Custom base URL (for MiniMax, DeepInfra, etc.)
    # --- Embeddings (semantic search) ---
    # DEDICATED credentials for embeddings. Needed because chat uses
    # `openai_api_key` pointing at a compatible endpoint (e.g. DeepSeek, which
    # has NO embeddings API). If empty, falls back to `openai_api_key` for
    # compatibility. An empty base_url means real OpenAI (api.openai.com); it
    # does NOT inherit `openai_base_url`.
    embeddings_api_key: str = ""
    embeddings_base_url: str = ""
    embeddings_model: str = "text-embedding-3-small"
    # LiteLLM spike: if default_model_provider=litellm, the model to use
    # (litellm format, e.g. "deepseek/deepseek-v4-pro"). Empty -> derived from
    # default_model_id.
    litellm_model: str = ""

    @property
    def effective_embeddings_key(self) -> str:
        """Return the API key to use for embeddings, or ``""`` if none applies.

        The dedicated ``embeddings_api_key`` wins. The chat key
        (``openai_api_key``) is reused ONLY when chat targets real OpenAI (no
        custom ``openai_base_url``); if chat points at DeepSeek or another
        provider, that key is not valid for embeddings.
        """
        if self.embeddings_api_key:
            return self.embeddings_api_key
        if not self.openai_base_url:
            return self.openai_api_key
        return ""

    @property
    def embeddings_enabled(self) -> bool:
        """Return True when an embeddings key or an embeddings base URL is set."""
        return bool(self.effective_embeddings_key or self.embeddings_base_url)

    default_model_provider: str = "claude"
    default_model_id: str = "claude-sonnet-4-20250514"
    # Model override ONLY for the planner sub-loop (acai_plan). If empty,
    # default_model_id is used. Intended for planning with a stronger model
    # (e.g. deepseek-v4-pro) and executing with a faster one
    # (e.g. deepseek-v4-flash).
    planner_model_id: str = ""
    # Max tokens for the planner. Higher than the main agent because Pro with
    # thinking can spend 2-4k tokens reasoning before emitting the plan JSON.
    planner_max_tokens: int = 16000
    max_tokens: int = 4096
    temperature: float = 0.3
    # DeepSeek strict function calling (beta). OPT-IN (default False): requires
    # OpenAI-style schemas (additionalProperties:false, everything required,
    # etc.) that the current MCP tools do NOT satisfy -> returns 400. To enable:
    # compatible schemas + base_url https://api.deepseek.com/beta +
    # AGENTIC_DEEPSEEK_STRICT_TOOLS=true.
    deepseek_strict_tools: bool = False

    # --- Context engine ---
    model_context_window: int = 0  # 0 = use legacy fixed budget / explicit override
    model_max_output_tokens: int = 4096
    context_max_tokens: int = 0  # 0 = auto-budget from model window, fallback legacy 120k
    compaction_threshold_tokens: int = 0  # 0 = derive from ratio
    compaction_threshold_ratio: float = 0.80
    context_reserve_ratio: float = 0.10
    artifact_summary_max_chars: int = 2000
    # KB injected as system prompt. Default 4k (previously 30k) — the official
    # M2.7 docs warn that large system prompts degrade performance.
    # Top-2 medium docs + cheat sheet ~= 4k tokens fit with headroom.
    # Overridden per-agent via `agent.yaml.kb_max_tokens`.
    knowledge_base_max_tokens: int = 4_000
    # Absolute cap on the number of docs included (filter applied after ranking).
    kb_top_n_docs: int = 2
    # Penalty on the `load_priority` of `load_when: [ranked]` docs so they do
    # not enter the top_n branch "by default", only when they rank very high.
    kb_ranked_penalty: int = 10
    # Similarity threshold below which the ranking is not trustworthy and the
    # frontmatter `load_priority` is used as a tie-break.
    kb_similarity_floor: float = 0.6
    working_context_max_items: int = 20
    # Previously 2000 (calibrated for MiniMax 200k). Raised for DeepSeek 1M context.
    tool_raw_output_max_chars: int = 16000
    conversation_recent_raw_limit: int = 2
    task_history_max_entries: int = 20
    task_history_max_tokens: int = 1500
    # Token budget for the recent_messages window persisted in the session.
    # Without it the window grows unbounded and pushes the compactor into its
    # destructive step (collapsing blocks and losing tool_use ids).
    # 0 = no limit.
    recent_messages_max_tokens: int = 60_000

    # --- MCP ---
    mcp_config_path: str = ""  # Path to mcp.json; empty = legacy single-server mode
    mcp_server_command: str = ""  # Legacy: single server command
    mcp_server_args: list[str] = Field(default_factory=list)
    mcp_timeout_seconds: float = 30.0
    mcp_startup_timeout_seconds: float = 10.0

    # --- Pricing (per 1M tokens) ---
    cost_per_1m_input: float = 2.50
    cost_per_1m_output: float = 15.00

    # --- Orchestrator ---
    max_execution_steps: int = 25
    subagent_max_steps: int = 30
    max_execution_timeout_seconds: float = 300.0  # 5 min global timeout

    # --- SSE ---
    sse_keepalive_seconds: float = 15.0

    model_config = {"env_prefix": "AGENTIC_", "env_file": ".env", "extra": "ignore"}

    @property
    def reserve_tokens(self) -> int:
        """Return the tokens held back from the configured model window.

        This is ``model_context_window * context_reserve_ratio`` truncated to
        an int and clamped at 0; it is 0 when no model window is configured.
        """
        if self.model_context_window <= 0:
            return 0
        return max(0, int(self.model_context_window * self.context_reserve_ratio))

    @property
    def effective_context_budget(self) -> int:
        """Return the static context budget in tokens.

        Resolution order: an explicit ``context_max_tokens`` override, then the
        budget derived from ``model_context_window``, then the legacy fixed
        budget of 120k tokens.
        """
        if self.context_max_tokens > 0:
            return self.context_max_tokens

        if self.model_context_window > 0:
            # Same `window - max_output - reserve` formula, kept in one place.
            return self.budget_for_window(self.model_context_window)

        return 120_000

    @property
    def effective_compaction_threshold(self) -> int:
        """Return the compaction threshold for the static context budget."""
        return self.compaction_threshold_for(self.effective_context_budget)

    def budget_for_window(self, window: int, max_output: int | None = None) -> int:
        """Return the context budget for the REAL window of the active model.

        Uses the formula ``window - max_output - reserve``, where ``reserve``
        is ``window * context_reserve_ratio``; negative ``max_output`` and
        ``reserve`` values are treated as 0 and the result is at least 1. An
        explicit override (``context_max_tokens``) always wins, but applying it
        is the caller's job.

        Args:
            window: Context window of the model used for this turn. If it is
                not positive, the static ``effective_context_budget`` is
                returned instead.
            max_output: Output tokens to subtract; ``None`` means use
                ``model_max_output_tokens``.
        """
        if window <= 0:
            return self.effective_context_budget
        out = self.model_max_output_tokens if max_output is None else max_output
        reserve = int(window * self.context_reserve_ratio)
        return max(1, window - max(0, out) - max(0, reserve))

    def compaction_threshold_for(self, budget: int) -> int:
        """Return the compaction threshold for the given budget.

        A positive ``compaction_threshold_tokens`` is used directly, capped at
        ``budget``. Otherwise the threshold is
        ``budget * compaction_threshold_ratio`` truncated to an int, and at
        least 1.
        """
        if self.compaction_threshold_tokens > 0:
            return min(self.compaction_threshold_tokens, budget)
        return max(1, int(budget * self.compaction_threshold_ratio))


# Module-level settings instance, created once when this module is imported.
# Values are read from AGENTIC_-prefixed environment variables and the .env
# file, as configured in Settings.model_config.
settings = Settings()