feat: fallback provider chain for auxiliary compression model

When the primary auxiliary compression model fails the minimum context-length check (64K floor), iterate through the user's fallback_providers chain instead of raising ValueError and killing the session. - check_compression_model_feasibility: catch ValueError from MINIMUM_CONTEXT_LENGTH rejection, try each fallback provider in order, store the first suitable one as _compression_fallback on the ContextCompressor - ContextCompressor._generate_summary: when _compression_fallback is set, override main_runtime (provider/model/base_url/api_key) so summaries route to the fallback provider - When all fallbacks are exhausted, emit a clear warning and continue without summaries (same as 'no auxiliary LLM' path) Fixes the session-killing crash when switching to a model whose provider's auto-detected compression model has <64K context.
2026-05-29 14:52:31 +00:00
12 changed files with 279 additions and 644 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -2831,74 +2831,16 @@ def _try_main_agent_model_fallback(
    return client, resolved_model or main_model, label


-def _coerce_positive_int(value: Any) -> Optional[int]:
-    try:
-        parsed = int(value)
-    except (TypeError, ValueError):
-        return None
-    return parsed if parsed > 0 else None
-
-
-def _estimate_auxiliary_request_tokens(messages: list, max_tokens: Optional[int] = None) -> int:
-    """Rough token estimate for local auxiliary context-window checks."""
-    try:
-        from agent.model_metadata import estimate_messages_tokens_rough
-        input_tokens = estimate_messages_tokens_rough(messages or [])
-    except Exception:
-        input_tokens = 0
-        for msg in messages or []:
-            content = msg.get("content", "") if isinstance(msg, dict) else str(msg)
-            input_tokens += max(1, len(str(content)) // 4)
-    return input_tokens + (_coerce_positive_int(max_tokens) or 0)
-
-
-def _context_length_error(
-    *,
-    task: str,
-    provider: str,
-    model: Optional[str],
-    context_length: Optional[int],
-    messages: list,
-    max_tokens: Optional[int],
-) -> Optional[ValueError]:
-    ctx = _coerce_positive_int(context_length)
-    if not ctx:
-        return None
-    estimated = _estimate_auxiliary_request_tokens(messages, max_tokens)
-    if estimated <= ctx:
-        return None
-    return ValueError(
-        f"Auxiliary {task or 'call'} request needs ~{estimated} tokens, "
-        f"exceeding configured context_length={ctx} for "
-        f"{provider or 'auto'}/{model or 'default'}"
-    )
-
-
-def _is_context_length_error(exc: Exception) -> bool:
-    text = str(exc).lower()
-    return (
-        "context_length" in text
-        or "context length" in text
-        or "context window" in text
-        or "too many tokens" in text
-        or "exceeding configured context" in text
-        or "exceeds the max_model_len" in text
-    )
-
-
 def _try_configured_fallback_chain(
    task: str,
    failed_provider: str,
    reason: str = "error",
-    failed_model: Optional[str] = None,
-    messages: Optional[list] = None,
-    max_tokens: Optional[int] = None,
 ) -> Tuple[Optional[Any], Optional[str], str]:
    """Try user-configured fallback_chain for a specific auxiliary task.

    Reads auxiliary.<task>.fallback_chain from config.yaml and tries each
    entry in order.  Each entry must have at least ``provider``; ``model``,
-    ``base_url``, ``api_key``, and ``context_length`` are optional.
+    ``base_url``, and ``api_key`` are optional.

    Returns:
        (client, model, provider_label) or (None, None, "") if no fallback.
@ -2911,46 +2853,21 @@ def _try_configured_fallback_chain(
    if not chain or not isinstance(chain, list):
        return None, None, ""

-    skip_provider = failed_provider.lower().strip()
-    skip_model = str(failed_model or "").lower().strip()
+    skip = failed_provider.lower().strip()
    tried = []

    for i, entry in enumerate(chain):
        if not isinstance(entry, dict):
            continue
        fb_provider = str(entry.get("provider", "")).strip()
-        if not fb_provider:
+        if not fb_provider or fb_provider.lower() == skip:
            continue
        fb_model = str(entry.get("model", "")).strip() or None
-        # Skip only the exact failed provider+model pair. Same provider with a
-        # different model is a valid self-healing rung (e.g. opencode_go
-        # deepseek-v4-pro -> opencode_go gpt-5.5).
-        if fb_provider.lower() == skip_provider and (
-            not skip_model or (fb_model or "").lower() == skip_model
-        ):
-            continue
        fb_base_url = str(entry.get("base_url", "")).strip() or None
        fb_api_key = str(entry.get("api_key", "")).strip() or None

        label = f"fallback_chain[{i}]({fb_provider})"

-        if messages is not None:
-            context_err = _context_length_error(
-                task=task,
-                provider=fb_provider,
-                model=fb_model,
-                context_length=entry.get("context_length"),
-                messages=messages,
-                max_tokens=max_tokens,
-            )
-            if context_err is not None:
-                logger.info(
-                    "Auxiliary %s: skipping %s (%s) because it also exceeds context_length: %s",
-                    task, label, fb_model or "default", context_err,
-                )
-                tried.append(label)
-                continue
-
        try:
            fb_client = _resolve_single_provider(
                fb_provider, fb_model, fb_base_url, fb_api_key)
@ -2972,6 +2889,7 @@ def _try_configured_fallback_chain(
        )
    return None, None, ""

+
 def _resolve_single_provider(
    provider: str,
    model: Optional[str] = None,
@ -4971,17 +4889,6 @@ def call_llm(
    # Handle unsupported temperature, max_tokens vs max_completion_tokens retry,
    # then payment fallback.
    try:
-        task_context = _get_auxiliary_task_config(task).get("context_length") if task else None
-        context_err = _context_length_error(
-            task=task or "call",
-            provider=resolved_provider,
-            model=final_model,
-            context_length=task_context,
-            messages=messages,
-            max_tokens=max_tokens,
-        )
-        if context_err is not None:
-            raise context_err
        return _validate_llm_response(
            client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
@ -5165,7 +5072,6 @@ def call_llm(
            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
-            or _is_context_length_error(first_err)
        )
        # Respect explicit provider choice for transient errors (auth, request
        # validation, etc.) but allow fallback when the provider clearly cannot
@ -5176,11 +5082,7 @@ def call_llm(
        is_auto = resolved_provider in {"auto", "", None}
        # Capacity errors bypass the explicit-provider gate: the provider
        # literally cannot serve this request regardless of user intent.
-        is_capacity_error = (
-            _is_payment_error(first_err)
-            or _is_connection_error(first_err)
-            or _is_context_length_error(first_err)
-        )
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
@ -5193,8 +5095,6 @@ def call_llm(
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
-            elif _is_context_length_error(first_err):
-                reason = "context length"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s: %s on %s (%s), trying fallback",
@ -5212,8 +5112,7 @@ def call_llm(
                    resolved_provider, task, reason=reason)
            else:
                fb_client, fb_model, fb_label = _try_configured_fallback_chain(
-                    task, resolved_provider or "auto", reason=reason,
-                    failed_model=final_model, messages=messages, max_tokens=max_tokens)
+                    task, resolved_provider or "auto", reason=reason)
                if fb_client is None:
                    fb_client, fb_model, fb_label = _try_main_agent_model_fallback(
                        resolved_provider, task, reason=reason)
@ -5396,17 +5295,6 @@ async def async_call_llm(
        kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])

    try:
-        task_context = _get_auxiliary_task_config(task).get("context_length") if task else None
-        context_err = _context_length_error(
-            task=task or "call",
-            provider=resolved_provider,
-            model=final_model,
-            context_length=task_context,
-            messages=messages,
-            max_tokens=max_tokens,
-        )
-        if context_err is not None:
-            raise context_err
        return _validate_llm_response(
            await client.chat.completions.create(**kwargs), task)
    except Exception as first_err:
@ -5558,17 +5446,12 @@ async def async_call_llm(
            _is_payment_error(first_err)
            or _is_connection_error(first_err)
            or _is_rate_limit_error(first_err)
-            or _is_context_length_error(first_err)
        )
        # Capacity errors (payment/quota/connection) bypass the explicit-provider
        # gate — the provider cannot serve the request regardless of user intent.
        # See #26803: daily token quota must fall back like a 402 credit error.
        is_auto = resolved_provider in {"auto", "", None}
-        is_capacity_error = (
-            _is_payment_error(first_err)
-            or _is_connection_error(first_err)
-            or _is_context_length_error(first_err)
-        )
+        is_capacity_error = _is_payment_error(first_err) or _is_connection_error(first_err)
        if should_fallback and (is_auto or is_capacity_error):
            if _is_payment_error(first_err):
                reason = "payment error"
@ -5577,8 +5460,6 @@ async def async_call_llm(
                )
            elif _is_rate_limit_error(first_err):
                reason = "rate limit"
-            elif _is_context_length_error(first_err):
-                reason = "context length"
            else:
                reason = "connection error"
            logger.info("Auxiliary %s (async): %s on %s (%s), trying fallback",
@ -5595,8 +5476,7 @@ async def async_call_llm(
                    resolved_provider, task, reason=reason)
            else:
                fb_client, fb_model, fb_label = _try_configured_fallback_chain(
-                    task, resolved_provider or "auto", reason=reason,
-                    failed_model=final_model, messages=messages, max_tokens=max_tokens)
+                    task, resolved_provider or "auto", reason=reason)
                if fb_client is None:
                    fb_client, fb_model, fb_label = _try_main_agent_model_fallback(
                        resolved_provider, task, reason=reason)
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -580,6 +580,13 @@ class ContextCompressor(ContextEngine):

        self.summary_model = summary_model_override or ""

+        # Compression-model fallback: set by check_compression_model_feasibility
+        # when the primary aux compression model fails the minimum context check.
+        # If set, _generate_summary uses this provider/model for the LLM call
+        # instead of the main compressor attributes.  Dict keys:
+        #   provider, model, base_url, api_key
+        self._compression_fallback: Optional[Dict[str, str]] = None
+
        # Stores the previous compaction summary for iterative updates
        self._previous_summary: Optional[str] = None
        # Anti-thrashing: track whether last compression was effective
@ -1069,6 +1076,20 @@ The user has requested that this compaction PRIORITISE preserving all informatio
            }
            if self.summary_model:
                call_kwargs["model"] = self.summary_model
+            # Compression-model fallback: when the primary aux compression
+            # model was rejected for insufficient context, the feasibility
+            # check stored a replacement provider/model here.  Override the
+            # entire main_runtime so call_llm routes the summary request to
+            # the fallback provider instead of the main one.
+            if self._compression_fallback:
+                _fb = self._compression_fallback
+                call_kwargs["main_runtime"] = {
+                    "model": _fb["model"],
+                    "provider": _fb["provider"],
+                    "base_url": _fb.get("base_url", ""),
+                    "api_key": _fb.get("api_key", ""),
+                    "api_mode": _fb.get("api_mode", self.api_mode),
+                }
            response = call_llm(**call_kwargs)
            content = response.choices[0].message.content
            # Handle cases where content is not a string (e.g., dict from llama.cpp)
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@ -221,9 +221,101 @@ def check_compression_model_feasibility(agent: Any) -> None:
                new_threshold,
            )
    except ValueError:
-        # Hard rejections (aux below minimum context) must propagate
-        # so the session refuses to start.
-        raise
+        # Primary compression model failed the minimum context check
+        # (context_length < MINIMUM_CONTEXT_LENGTH).  Before giving up,
+        # try the user's fallback provider chain so a model switch or
+        # provider outage doesn't silently disable compression.
+        _fallback_chain = getattr(agent, '_fallback_chain', None) or []
+        _tried = [f"{aux_model} ({_aux_cfg_provider or 'auto'}): {aux_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"]
+
+        for _fb_entry in _fallback_chain:
+            _fb_provider = _fb_entry.get("provider", "")
+            _fb_model = _fb_entry.get("model", "")
+            if not _fb_provider or not _fb_model:
+                continue
+
+            try:
+                from agent.auxiliary_client import resolve_provider_client
+
+                _fb_client, _fb_resolved_model = resolve_provider_client(
+                    _fb_provider,
+                    _fb_model,
+                    explicit_base_url=_fb_entry.get("base_url", ""),
+                    explicit_api_key=_fb_entry.get("api_key", ""),
+                    main_runtime=agent._current_main_runtime(),
+                )
+                if _fb_client is None or not _fb_resolved_model:
+                    _tried.append(f"{_fb_model} ({_fb_provider}): unavailable")
+                    continue
+
+                _fb_base_url = str(getattr(_fb_client, "base_url", ""))
+                _fb_api_key_raw = getattr(_fb_client, "api_key", "")
+                _fb_api_key = (
+                    ""
+                    if callable(_fb_api_key_raw) and not isinstance(_fb_api_key_raw, str)
+                    else str(_fb_api_key_raw or "")
+                )
+
+                _fb_context = get_model_context_length(
+                    _fb_resolved_model,
+                    base_url=_fb_base_url,
+                    api_key=_fb_api_key,
+                    provider=_fb_provider,
+                    custom_providers=getattr(agent, "_custom_providers", None),
+                )
+
+                if _fb_context and _fb_context < MINIMUM_CONTEXT_LENGTH:
+                    _tried.append(
+                        f"{_fb_resolved_model} ({_fb_provider}): "
+                        f"{_fb_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"
+                    )
+                    continue
+
+                # ── Found a suitable fallback ──────────────────────────
+                logger.warning(
+                    "Compression model %s (%s) has only %d token context "
+                    "(minimum %d).  Falling back to %s (%s) with %d token context.",
+                    aux_model, _aux_cfg_provider or "auto", aux_context,
+                    MINIMUM_CONTEXT_LENGTH, _fb_resolved_model, _fb_provider,
+                    _fb_context or 0,
+                )
+
+                agent.context_compressor._compression_fallback = {
+                    "provider": _fb_provider,
+                    "model": _fb_resolved_model,
+                    "base_url": _fb_base_url,
+                    "api_key": _fb_api_key,
+                }
+
+                _msg = (
+                    f"⚠ Compression model {aux_model} has only "
+                    f"{aux_context:,} token context (minimum "
+                    f"{MINIMUM_CONTEXT_LENGTH:,} required).  "
+                    f"Falling back to {_fb_resolved_model} ({_fb_provider}) "
+                    f"for summaries."
+                )
+                agent._compression_warning = _msg
+                agent._emit_status(_msg)
+                return
+
+            except Exception as _fb_err:
+                _tried.append(f"{_fb_model} ({_fb_provider}): {_fb_err}")
+                continue
+
+        # No fallback worked — warn and let compression run without
+        # summaries (same behavior as 'no auxiliary LLM' above).
+        _all_tried = "; ".join(_tried)
+        _msg = (
+            f"⚠ No suitable compression model available.  "
+            f"Tried: {_all_tried}.  "
+            f"Compression will drop middle turns without summaries.  "
+            f"Run `hermes setup` or set "
+            f"auxiliary.compression.model in config.yaml."
+        )
+        agent._compression_warning = _msg
+        agent._emit_status(_msg)
+        logger.warning("Compression model fallback exhausted: %s", _all_tried)
+        return
    except Exception as exc:
        logger.debug(
            "Compression feasibility check failed (non-fatal): %s", exc
--- a/agent/session_stats.py
+++ b/agent/session_stats.py
@ -1,129 +0,0 @@
-"""Session telemetry collectors for the /stats dashboard."""
-
-from __future__ import annotations
-
-import logging
-from typing import Any, Dict, Optional
-
-logger = logging.getLogger(__name__)
-
-_TOKEN_FIELDS = (
-    "input_tokens",
-    "output_tokens",
-    "cache_read_tokens",
-    "cache_write_tokens",
-    "reasoning_tokens",
-)
-
-
-def _coerce_int(value: Any, default: int = 0) -> int:
-    try:
-        return int(value or 0)
-    except (TypeError, ValueError):
-        return default
-
-
-def _sum_tokens(row: Any) -> int:
-    if not row:
-        return 0
-    total = 0
-    for field in _TOKEN_FIELDS:
-        try:
-            value = row.get(field) if hasattr(row, "get") else row[field]
-        except Exception:
-            value = 0
-        total += _coerce_int(value)
-    return total
-
-
-def _query_one(session_db: Any, sql: str, params: tuple = ()) -> Optional[dict]:
-    conn = getattr(session_db, "_conn", None)
-    if conn is None:
-        return None
-    cur = conn.execute(sql, params)
-    row = cur.fetchone()
-    return dict(row) if row is not None else None
-
-
-def collect_context_stats(*, agent: Any = None, session_db: Any = None, session_id: str | None = None) -> Dict[str, Any]:
-    """Return current model/provider/context telemetry from live agent + SessionDB."""
-    model = getattr(agent, "model", None) or "unknown"
-    provider = getattr(agent, "provider", None) or "unknown"
-    context_length = _coerce_int(getattr(getattr(agent, "context_compressor", None), "context_length", 0))
-    threshold_tokens = _coerce_int(getattr(getattr(agent, "context_compressor", None), "threshold_tokens", 0))
-    total_tokens = _coerce_int(getattr(agent, "session_total_tokens", 0))
-
-    if session_db is not None and session_id and total_tokens <= 0:
-        try:
-            row = session_db.get_session(session_id)
-            total_tokens = _sum_tokens(row)
-            if row and (model == "unknown"):
-                model = row.get("model") or model
-        except Exception as exc:
-            logger.debug("Failed to read current session token totals: %s", exc, exc_info=True)
-
-    usage_percent = (total_tokens / context_length * 100.0) if context_length else None
-    fallback_chain = []
-    for entry in getattr(agent, "_fallback_chain", []) or []:
-        if isinstance(entry, dict):
-            fb_provider = str(entry.get("provider") or "").strip()
-            fb_model = str(entry.get("model") or "").strip()
-            if fb_provider or fb_model:
-                fallback_chain.append({"provider": fb_provider, "model": fb_model})
-
-    return {
-        "model": model,
-        "provider": provider,
-        "context_length": context_length,
-        "threshold_tokens": threshold_tokens,
-        "total_tokens": total_tokens,
-        "usage_percent": usage_percent,
-        "fallback_chain": fallback_chain,
-    }
-
-
-def collect_semantic_rle_stats(session_db: Any = None) -> Dict[str, Any]:
-    """Approximate compression/RLE savings from real SessionDB compression chains.
-
-    Hermes persists context-compression continuations as sessions whose parent
-    ended with ``end_reason='compression'``. We derive counts and token deltas
-    from those persisted parent/child rows instead of inventing counters.
-    """
-    if session_db is None or getattr(session_db, "_conn", None) is None:
-        return {"sessions_compressed": 0, "compression_ratio": None, "avg_tokens_saved": 0, "source": "SessionDB unavailable"}
-
-    try:
-        row = _query_one(
-            session_db,
-            """
-            SELECT COUNT(*) AS n,
-                   COALESCE(SUM(input_tokens + output_tokens + cache_read_tokens + cache_write_tokens + reasoning_tokens), 0) AS parent_tokens
-            FROM sessions
-            WHERE end_reason = 'compression'
-            """,
-        ) or {}
-        compressed = _coerce_int(row.get("n"))
-        parent_tokens = _coerce_int(row.get("parent_tokens"))
-
-        child = _query_one(
-            session_db,
-            """
-            SELECT COALESCE(SUM(c.input_tokens + c.output_tokens + c.cache_read_tokens + c.cache_write_tokens + c.reasoning_tokens), 0) AS child_tokens
-            FROM sessions p
-            JOIN sessions c ON c.parent_session_id = p.id
-            WHERE p.end_reason = 'compression'
-            """,
-        ) or {}
-        child_tokens = _coerce_int(child.get("child_tokens"))
-    except Exception as exc:
-        logger.debug("Failed to collect compression stats: %s", exc, exc_info=True)
-        return {"sessions_compressed": 0, "compression_ratio": None, "avg_tokens_saved": 0, "source": "SessionDB query failed"}
-
-    saved = max(parent_tokens - child_tokens, 0)
-    ratio = (child_tokens / parent_tokens) if parent_tokens else None
-    return {
-        "sessions_compressed": compressed,
-        "compression_ratio": ratio,
-        "avg_tokens_saved": int(saved / compressed) if compressed else 0,
-        "source": "SessionDB compression chains",
-    }
--- a/agent/skill_stats.py
+++ b/agent/skill_stats.py
@ -1,84 +0,0 @@
-"""Skill and curator telemetry collectors for the /stats dashboard."""
-
-from __future__ import annotations
-
-from datetime import datetime, timedelta, timezone
-from pathlib import Path
-from typing import Any, Dict, List
-
-from hermes_constants import get_hermes_home
-from tools.skill_usage import (
-    STATE_ARCHIVED,
-    activity_count,
-    latest_activity_at,
-    load_usage,
-)
-
-
-def _parse_dt(value: Any):
-    if not value:
-        return None
-    try:
-        dt = datetime.fromisoformat(str(value))
-    except (TypeError, ValueError):
-        return None
-    if dt.tzinfo is None:
-        dt = dt.replace(tzinfo=timezone.utc)
-    return dt
-
-
-def _activity_count(record: Dict[str, Any]) -> int:
-    return activity_count(record)
-
-
-def collect_skill_stats(limit: int = 5) -> Dict[str, Any]:
-    usage = load_usage()
-    rows: List[Dict[str, Any]] = []
-    for name, record in usage.items():
-        if not isinstance(record, dict):
-            continue
-        count = _activity_count(record)
-        rows.append({
-            "name": str(name),
-            "activity_count": count,
-            "use_count": int(record.get("use_count") or 0),
-            "view_count": int(record.get("view_count") or 0),
-            "patch_count": int(record.get("patch_count") or 0),
-            "last_activity_at": latest_activity_at(record),
-            "state": record.get("state") or "active",
-        })
-    rows.sort(key=lambda r: (r["activity_count"], r["name"]), reverse=True)
-    return {"top_skills": rows[:limit], "usage_records": len(rows)}
-
-
-def collect_curator_prunes(days: int = 7, limit: int = 3) -> Dict[str, Any]:
-    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
-    usage = load_usage()
-    archived = []
-    for name, record in usage.items():
-        if not isinstance(record, dict):
-            continue
-        if record.get("state") != STATE_ARCHIVED:
-            continue
-        ts = record.get("archived_at") or record.get("last_patched_at") or record.get("created_at")
-        dt = _parse_dt(ts)
-        if dt is not None and dt < cutoff:
-            continue
-        archived.append({"name": str(name), "archived_at": ts})
-
-    # Also inspect the archive directory so manually restored/old usage sidecars
-    # still have a real filesystem source for the dashboard.
-    archive_dir = get_hermes_home() / "skills" / ".archive"
-    if archive_dir.exists():
-        for path in archive_dir.iterdir():
-            if not path.is_dir():
-                continue
-            try:
-                dt = datetime.fromtimestamp(path.stat().st_mtime, timezone.utc)
-            except OSError:
-                continue
-            if dt >= cutoff and not any(row["name"] == path.name for row in archived):
-                archived.append({"name": path.name, "archived_at": dt.isoformat()})
-
-    archived.sort(key=lambda r: str(r.get("archived_at") or ""), reverse=True)
-    return {"recent_prunes": archived[:limit], "recent_prune_count": len(archived), "days": days}
--- a/agent/stats_dashboard.py
+++ b/agent/stats_dashboard.py
@ -1,76 +0,0 @@
-"""Renderer for the Telegram-friendly /stats dashboard."""
-
-from __future__ import annotations
-
-from typing import Any
-
-from agent.session_stats import collect_context_stats, collect_semantic_rle_stats
-from agent.skill_stats import collect_curator_prunes, collect_skill_stats
-from agent.system_health import collect_system_health
-
-
-def _fmt_int(value: Any) -> str:
-    try:
-        return f"{int(value):,}"
-    except (TypeError, ValueError):
-        return "0"
-
-
-def _fmt_pct(value: Any) -> str:
-    if value is None:
-        return "unknown"
-    try:
-        return f"{float(value):.1f}%"
-    except (TypeError, ValueError):
-        return "unknown"
-
-
-def _fallback_text(chain: list[dict]) -> str:
-    if not chain:
-        return "none"
-    return " → ".join(
-        f"{item.get('model') or '?'} ({item.get('provider') or '?'})"
-        for item in chain
-    )
-
-
-def format_stats_dashboard(*, agent: Any = None, session_db: Any = None, session_id: str | None = None, started_at: Any = None, start_monotonic: float | None = None) -> str:
-    context = collect_context_stats(agent=agent, session_db=session_db, session_id=session_id)
-    rle = collect_semantic_rle_stats(session_db=session_db)
-    skills = collect_skill_stats(limit=5)
-    prunes = collect_curator_prunes(days=7, limit=3)
-    health = collect_system_health(started_at=started_at, start_monotonic=start_monotonic)
-    cron = health.get("cron") or {}
-
-    lines = [
-        "📊 Hermes stats",
-        "",
-        f"• Model: {context['model']} ({context['provider']})",
-        f"• Fallback: {_fallback_text(context.get('fallback_chain') or [])}",
-        f"• Context: {_fmt_int(context.get('total_tokens'))}/{_fmt_int(context.get('context_length'))} tokens ({_fmt_pct(context.get('usage_percent'))})",
-        f"• Semantic RLE: {rle.get('sessions_compressed', 0)} sessions · ratio {_fmt_pct((rle.get('compression_ratio') or 0) * 100 if rle.get('compression_ratio') is not None else None)} · avg saved {_fmt_int(rle.get('avg_tokens_saved'))} tokens",
-        "",
-        "• Top skills:",
-    ]
-
-    top = skills.get("top_skills") or []
-    if top:
-        for row in top[:5]:
-            lines.append(f"  - {row['name']}: {row['activity_count']} activity ({row['use_count']} use / {row['view_count']} view / {row['patch_count']} patch)")
-    else:
-        lines.append("  - no skill usage telemetry yet")
-
-    lines.append("• Gardener prunes (7d):")
-    recent_prunes = prunes.get("recent_prunes") or []
-    if recent_prunes:
-        for row in recent_prunes[:3]:
-            stamp = str(row.get("archived_at") or "unknown").split("T", 1)[0]
-            lines.append(f"  - {row.get('name')}: {stamp}")
-    else:
-        lines.append("  - none")
-
-    lines.extend([
-        f"• Nightly/cron 24h: {cron.get('runs', 0)} runs · {cron.get('ok', 0)} ok · {cron.get('error', 0)} errors · {cron.get('health_checks', 0)} health checks",
-        f"• Uptime/version: {health.get('uptime')} · v{health.get('version')} · pid {health.get('pid')}",
-    ])
-    return "\n".join(lines)
--- a/agent/system_health.py
+++ b/agent/system_health.py
@ -1,90 +0,0 @@
-"""System health and cron telemetry collectors for /stats."""
-
-from __future__ import annotations
-
-import os
-import time
-from datetime import datetime, timedelta, timezone
-from typing import Any, Dict
-
-
-def _parse_dt(value: Any):
-    if not value:
-        return None
-    try:
-        dt = datetime.fromisoformat(str(value))
-    except (TypeError, ValueError):
-        return None
-    if dt.tzinfo is None:
-        dt = dt.replace(tzinfo=timezone.utc)
-    return dt
-
-
-def format_duration(seconds: int | float | None) -> str:
-    if seconds is None:
-        return "unknown"
-    seconds = max(0, int(seconds))
-    days, rem = divmod(seconds, 86400)
-    hours, rem = divmod(rem, 3600)
-    minutes, _ = divmod(rem, 60)
-    if days:
-        return f"{days}d {hours}h"
-    if hours:
-        return f"{hours}h {minutes}m"
-    return f"{minutes}m"
-
-
-def collect_cron_activity(hours: int = 24) -> Dict[str, Any]:
-    cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
-    try:
-        from cron.jobs import list_jobs
-        jobs = list_jobs(include_disabled=True)
-    except Exception:
-        jobs = []
-
-    recent = []
-    ok = error = health_checks = 0
-    for job in jobs:
-        if not isinstance(job, dict):
-            continue
-        dt = _parse_dt(job.get("last_run_at"))
-        if dt is None or dt < cutoff:
-            continue
-        status = str(job.get("last_status") or "unknown")
-        if status == "ok":
-            ok += 1
-        elif status == "error":
-            error += 1
-        haystack = " ".join(str(job.get(k) or "") for k in ("name", "prompt", "id")).lower()
-        if "health" in haystack or "doctor" in haystack:
-            health_checks += 1
-        recent.append({"id": job.get("id"), "name": job.get("name"), "status": status, "last_run_at": job.get("last_run_at")})
-    recent.sort(key=lambda r: str(r.get("last_run_at") or ""), reverse=True)
-    return {"hours": hours, "runs": len(recent), "ok": ok, "error": error, "health_checks": health_checks, "recent": recent[:5]}
-
-
-def collect_system_health(*, started_at: Any = None, start_monotonic: float | None = None) -> Dict[str, Any]:
-    try:
-        from hermes_cli import __version__ as version
-    except Exception:
-        version = "unknown"
-
-    uptime_seconds = None
-    if start_monotonic is not None:
-        uptime_seconds = time.monotonic() - float(start_monotonic)
-    else:
-        dt = started_at
-        if isinstance(dt, (int, float)):
-            uptime_seconds = time.time() - float(dt)
-        elif isinstance(dt, datetime):
-            if dt.tzinfo is None:
-                dt = dt.replace(tzinfo=timezone.utc)
-            uptime_seconds = (datetime.now(timezone.utc) - dt).total_seconds()
-
-    return {
-        "version": version,
-        "pid": os.getpid(),
-        "uptime_seconds": int(uptime_seconds) if uptime_seconds is not None else None,
-        "uptime": format_duration(uptime_seconds),
-        "cron": collect_cron_activity(hours=24),
-    }
--- a/cli.py
+++ b/cli.py
@ -5988,24 +5988,6 @@ class HermesCLI:

        self._console_print("\n".join(lines), highlight=False, markup=False)
    
-    def _handle_stats_command(self):
-        """Show comprehensive system stats — model, skills, curator, cron, uptime."""
-        try:
-            from agent.stats_dashboard import format_stats_dashboard
-        except ImportError as exc:
-            self._console_print(f"stats module unavailable: {exc}", highlight=False, markup=False)
-            return
-
-        agent = getattr(self, "agent", None)
-        uptime_start = getattr(self, "session_start", None)
-        dashboard = format_stats_dashboard(
-            agent=agent,
-            session_db=getattr(self, "_session_db", None),
-            session_id=self.session_id,
-            started_at=uptime_start,
-        )
-        self._console_print(dashboard, highlight=False, markup=False)
-
    def _fast_command_available(self) -> bool:
        try:
            from hermes_cli.models import model_supports_fast_mode
@ -8507,8 +8489,6 @@ class HermesCLI:
                self._handle_skills_command(cmd_original)
        elif canonical == "platforms":
            self._show_gateway_status()
-        elif canonical == "stats":
-            self._handle_stats_command()
        elif canonical == "status":
            self._show_session_status()
        elif canonical == "statusbar":
--- a/gateway/run.py
+++ b/gateway/run.py
@ -7470,9 +7470,6 @@ class GatewayRunner:
        if canonical == "status":
            return await self._handle_status_command(event)

-        if canonical == "stats":
-            return await self._handle_stats_command(event)
-
        if canonical == "agents":
            return await self._handle_agents_command(event)

@ -9710,30 +9707,6 @@ class GatewayRunner:

        return "\n".join(lines)

-    async def _handle_stats_command(self, event: MessageEvent) -> str:
-        """Handle /stats command — comprehensive system telemetry dashboard."""
-        try:
-            from agent.stats_dashboard import format_stats_dashboard
-        except ImportError as exc:
-            logger.debug("stats module unavailable: %s", exc)
-            return "stats module unavailable"
-
-        source = event.source
-        session_entry = self.session_store.get_or_create_session(source)
-        session_key = self._session_key_for_source(source)
-
-        # Try to get the running agent for this session
-        agent = self._running_agents.get(session_key)
-        started_at = self._running_agents_ts.get(session_key)
-
-        dashboard = format_stats_dashboard(
-            agent=agent,
-            session_db=self._session_db,
-            session_id=session_entry.session_id,
-            started_at=started_at,
-        )
-        return dashboard
-
    async def _handle_agents_command(self, event: MessageEvent) -> str:
        """Handle /agents command - list active agents and running tasks."""
        from tools.process_registry import format_uptime_short, process_registry
--- a/hermes_cli/commands.py
+++ b/hermes_cli/commands.py
@ -107,7 +107,6 @@ COMMAND_REGISTRY: list[CommandDef] = [
    CommandDef("subgoal", "Add or manage extra criteria on the active goal", "Session",
               args_hint="[text | remove N | clear]"),
    CommandDef("status", "Show session info", "Session"),
-    CommandDef("stats", "Show comprehensive system stats — model, skills, curator, cron, uptime", "Info"),
    CommandDef("whoami", "Show your slash command access (admin / user)", "Info"),
    CommandDef("profile", "Show active profile name and home directory", "Info"),
    CommandDef("sethome", "Set this chat as the home channel", "Session",
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@ -1402,83 +1402,6 @@ class TestAuxiliaryFallbackLayering:

        assert main_client.chat.completions.create.called

-    def test_context_length_failure_uses_configured_chain_same_provider_different_model(self, monkeypatch):
-        """Local auxiliary context_length failure should self-heal via fallback_chain."""
-        monkeypatch.setenv("OPENCODE_GO_API_KEY", "go-key")
-
-        primary_client = MagicMock()
-        chain_client = MagicMock()
-        chain_client.chat.completions.create.return_value = MagicMock(choices=[
-            MagicMock(message=MagicMock(content="from opencode gpt fallback"))
-        ])
-
-        task_cfg = {
-            "provider": "opencode_go",
-            "model": "deepseek-v4-pro",
-            "context_length": 100,
-            "fallback_chain": [
-                {"provider": "opencode_go", "model": "gpt-5.5", "context_length": 10000},
-            ],
-        }
-
-        def resolve_single(provider, model=None, base_url=None, api_key=None):
-            assert provider == "opencode_go"
-            assert model == "gpt-5.5"
-            return chain_client
-
-        with patch("agent.auxiliary_client._get_cached_client",
-                   return_value=(primary_client, "deepseek-v4-pro")), \
-             patch("agent.auxiliary_client._resolve_task_provider_model",
-                   return_value=("opencode_go", "deepseek-v4-pro", None, None, None)), \
-             patch("agent.auxiliary_client._get_auxiliary_task_config",
-                   return_value=task_cfg), \
-             patch("agent.auxiliary_client._resolve_single_provider",
-                   side_effect=resolve_single), \
-             patch("agent.auxiliary_client._try_main_agent_model_fallback") as main_fb:
-            result = call_llm(
-                task="compression",
-                messages=[{"role": "user", "content": "x" * 1000}],
-                max_tokens=2000,
-            )
-
-        primary_client.chat.completions.create.assert_not_called()
-        chain_client.chat.completions.create.assert_called_once()
-        main_fb.assert_not_called()
-        assert result.choices[0].message.content == "from opencode gpt fallback"
-
-    def test_configured_chain_skips_too_small_fallback_context(self):
-        """fallback_chain should continue past entries that cannot fit the request."""
-        from agent.auxiliary_client import _try_configured_fallback_chain
-
-        too_small = MagicMock()
-        fits = MagicMock()
-        task_cfg = {
-            "fallback_chain": [
-                {"provider": "custom", "model": "tiny", "context_length": 100},
-                {"provider": "custom", "model": "gemma-local", "context_length": 10000},
-            ]
-        }
-
-        def resolve_single(provider, model=None, base_url=None, api_key=None):
-            return too_small if model == "tiny" else fits
-
-        with patch("agent.auxiliary_client._get_auxiliary_task_config",
-                   return_value=task_cfg), \
-             patch("agent.auxiliary_client._resolve_single_provider",
-                   side_effect=resolve_single):
-            client, model, label = _try_configured_fallback_chain(
-                "compression",
-                "opencode_go",
-                reason="context length",
-                failed_model="deepseek-v4-pro",
-                messages=[{"role": "user", "content": "x" * 1000}],
-                max_tokens=2000,
-            )
-
-        assert client is fits
-        assert model == "gemma-local"
-        assert label == "fallback_chain[1](custom)"
-
    def test_warning_emitted_when_all_fallbacks_exhausted(self, monkeypatch, caplog):
        """When chain AND main model both fail, a user-visible warning fires before re-raise."""
        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@ -57,6 +57,7 @@ def _make_agent(
    compressor = MagicMock(spec=ContextCompressor)
    compressor.context_length = main_context
    compressor.threshold_tokens = int(main_context * threshold_percent)
+    compressor._compression_fallback = None
    agent.context_compressor = compressor

    return agent
@ -101,24 +102,169 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
@patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
-    """Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
-    refuses to start (ValueError), mirroring the main-model rejection."""
+    """When aux context < MINIMUM_CONTEXT_LENGTH (64K) and no fallback
+    providers are configured, a warning is emitted and compression will
+    operate without summaries.  Previously this raised ValueError; now it
+    degrades gracefully so a model switch doesn't kill the session."""
    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
    mock_client = MagicMock()
    mock_client.base_url = "https://openrouter.ai/api/v1"
    mock_client.api_key = "sk-aux"
    mock_get_client.return_value = (mock_client, "tiny-aux-model")

-    agent._emit_status = lambda msg: None
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)

-    with pytest.raises(ValueError) as exc_info:
+    # No fallback chain → should warn, not raise
+    agent._fallback_chain = []
+    agent._check_compression_model_feasibility()
+
+    assert len(messages) == 1
+    assert "No suitable compression model" in messages[0]
+    assert "tiny-aux-model" in messages[0]
+    assert "32,768" in messages[0]
+    assert "64,000" in messages[0]
+    assert agent._compression_warning is not None
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_falls_back_to_chain_when_aux_below_minimum(mock_get_client, mock_ctx_len):
+    """When the primary aux model fails the context-length floor, the
+    feasibility check tries each fallback provider in order, using the
+    first one that meets MINIMUM_CONTEXT_LENGTH."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    # Primary aux model: too small (32K)
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny-aux-model")
+
+    # Fallback chain: two providers, first one meets the floor
+    agent._fallback_chain = [
+        {"provider": "opencode_go", "model": "deepseek-v4-pro"},
+        {"provider": "custom", "model": "gemma-local",
+         "base_url": "http://127.0.0.1:8081/v1", "api_key": "no-key"},
+    ]
+
+    # Mock resolve_provider_client for the fallback resolution
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://api.opencode.ai/v1"
+    mock_fb_client.api_key = "sk-fallback"
+
+    # get_model_context_length: first return 32K (primary fail),
+    # then return 128K (fallback success)
+    mock_ctx_len.side_effect = [32_768, 128_000]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               return_value=(mock_fb_client, "deepseek-v4-pro")) as mock_resolve:
        agent._check_compression_model_feasibility()

-    err = str(exc_info.value)
-    assert "tiny-aux-model" in err
-    assert "32,768" in err
-    assert "64,000" in err
-    assert "below the minimum" in err
+    # Should have resolved the fallback provider
+    mock_resolve.assert_called_once()
+    # First two positional args: provider, model
+    assert mock_resolve.call_args[0][0] == "opencode_go"
+    assert mock_resolve.call_args[0][1] == "deepseek-v4-pro"
+
+    # Warning should mention the fallback choice
+    assert len(messages) == 1
+    assert "Falling back to" in messages[0]
+    assert "deepseek-v4-pro" in messages[0]
+    assert "opencode_go" in messages[0]
+
+    # Fallback dict stored on compressor
+    fb = agent.context_compressor._compression_fallback
+    assert fb is not None
+    assert fb["provider"] == "opencode_go"
+    assert fb["model"] == "deepseek-v4-pro"
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_falls_back_past_unavailable_provider(mock_get_client, mock_ctx_len):
+    """When the first fallback provider is unavailable, skip it and
+    try the next one."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny")
+
+    # Fallback chain: first unavailable, second works
+    agent._fallback_chain = [
+        {"provider": "broken-provider", "model": "broken-model"},
+        {"provider": "opencode_go", "model": "deepseek-v4-pro"},
+    ]
+
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://api.opencode.ai/v1"
+    mock_fb_client.api_key = "sk-fallback"
+
+    # Primary: 32K (fail), broken-provider: unavailable, opencode_go: 128K
+    mock_ctx_len.side_effect = [32_768, None, 128_000]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    # First resolve returns None (unavailable), second returns client
+    mock_resolve_values = [(None, None), (mock_fb_client, "deepseek-v4-pro")]
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               side_effect=mock_resolve_values) as mock_resolve:
+        agent._check_compression_model_feasibility()
+
+    # Should have tried both fallbacks
+    assert mock_resolve.call_count == 2
+
+    # Should succeed with the second fallback
+    fb = agent.context_compressor._compression_fallback
+    assert fb is not None
+    assert fb["provider"] == "opencode_go"
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_warns_when_all_fallbacks_exhausted(mock_get_client, mock_ctx_len):
+    """When every fallback provider also fails the context floor or is
+    unavailable, emit a warning and degrade to no-summary mode without
+    raising."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny-main")
+
+    agent._fallback_chain = [
+        {"provider": "small-provider", "model": "small-model"},
+    ]
+
+    # Fallback also too small
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://small.api/v1"
+    mock_fb_client.api_key = "sk-small"
+    mock_ctx_len.side_effect = [32_768, 16_384]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+    # Mock compressor won't have _compression_fallback until set —
+    # initialize it so the final assertion works.
+    agent.context_compressor._compression_fallback = None
+
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               return_value=(mock_fb_client, "small-model")):
+        agent._check_compression_model_feasibility()
+
+    assert len(messages) == 1
+    assert "No suitable compression model" in messages[0]
+    assert "small-model" in messages[0]
+    assert agent._compression_warning is not None
+    # No fallback on compressor
+    assert agent.context_compressor._compression_fallback is None


@patch("agent.model_metadata.get_model_context_length", return_value=200_000)