From 504954dcee60f7c5e6eaf41ef93a104041088383 Mon Sep 17 00:00:00 2001
From: Anton Palgunov <toxblh@gmail.com>
Date: Fri, 29 May 2026 14:52:31 +0000
Subject: [PATCH] feat: fallback provider chain for auxiliary compression model

When the primary auxiliary compression model fails the minimum
context-length check (64K floor), iterate through the user's
fallback_providers chain instead of raising ValueError and
killing the session.

- check_compression_model_feasibility: catch ValueError from
  MINIMUM_CONTEXT_LENGTH rejection, try each fallback provider
  in order, store the first suitable one as
  _compression_fallback on the ContextCompressor
- ContextCompressor._generate_summary: when _compression_fallback
  is set, override main_runtime (provider/model/base_url/api_key/
  api_mode) so summaries route to the fallback provider
- When all fallbacks are exhausted, emit a clear warning and
  continue without summaries (same as 'no auxiliary LLM' path)

Fixes the session-killing crash when switching to a model whose
provider's auto-detected compression model has <64K context.
---
 agent/context_compressor.py                   |  21 +++
 agent/conversation_compression.py             |  98 ++++++++++-
 .../run_agent/test_compression_feasibility.py | 164 +++++++++++++++++-
 3 files changed, 271 insertions(+), 12 deletions(-)

diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 49907e2c3..ba5fcf596 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -580,6 +580,13 @@ class ContextCompressor(ContextEngine):
 
         self.summary_model = summary_model_override or ""
 
+        # Compression-model fallback: set by check_compression_model_feasibility
+        # when the primary aux compression model fails the minimum context check.
+        # If set, _generate_summary uses this provider/model for the LLM call
+        # instead of the main compressor attributes.  Dict keys:
+        #   provider, model, base_url, api_key (api_mode is optional)
+        self._compression_fallback: Optional[Dict[str, str]] = None
+
         # Stores the previous compaction summary for iterative updates
         self._previous_summary: Optional[str] = None
         # Anti-thrashing: track whether last compression was effective
@@ -1069,6 +1076,20 @@ The user has requested that this compaction PRIORITISE preserving all informatio
             }
             if self.summary_model:
                 call_kwargs["model"] = self.summary_model
+            # Compression-model fallback: when the primary aux compression
+            # model was rejected for insufficient context, the feasibility
+            # check stored a replacement provider/model here.  Override the
+            # entire main_runtime so call_llm routes the summary request to
+            # the fallback provider instead of the main one.
+            if self._compression_fallback:
+                _fb = self._compression_fallback
+                call_kwargs["main_runtime"] = {
+                    "model": _fb["model"],
+                    "provider": _fb["provider"],
+                    "base_url": _fb.get("base_url", ""),
+                    "api_key": _fb.get("api_key", ""),
+                    "api_mode": _fb.get("api_mode", self.api_mode),
+                }
             response = call_llm(**call_kwargs)
             content = response.choices[0].message.content
             # Handle cases where content is not a string (e.g., dict from llama.cpp)
diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py
index a620f343e..5e0099c4f 100644
--- a/agent/conversation_compression.py
+++ b/agent/conversation_compression.py
@@ -221,9 +221,101 @@ def check_compression_model_feasibility(agent: Any) -> None:
                 new_threshold,
             )
     except ValueError:
-        # Hard rejections (aux below minimum context) must propagate
-        # so the session refuses to start.
-        raise
+        # Primary compression model failed the minimum context check
+        # (context_length < MINIMUM_CONTEXT_LENGTH).  Instead of refusing
+        # to start the session, try the user's fallback provider chain
+        # for a model that is large enough to generate summaries.
+        _fallback_chain = getattr(agent, '_fallback_chain', None) or []
+        _tried = [f"{aux_model} ({_aux_cfg_provider or 'auto'}): {aux_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"]
+
+        for _fb_entry in _fallback_chain:
+            _fb_provider = _fb_entry.get("provider", "")
+            _fb_model = _fb_entry.get("model", "")
+            if not _fb_provider or not _fb_model:
+                continue
+
+            try:
+                from agent.auxiliary_client import resolve_provider_client
+
+                _fb_client, _fb_resolved_model = resolve_provider_client(
+                    _fb_provider,
+                    _fb_model,
+                    explicit_base_url=_fb_entry.get("base_url", ""),
+                    explicit_api_key=_fb_entry.get("api_key", ""),
+                    main_runtime=agent._current_main_runtime(),
+                )
+                if _fb_client is None or not _fb_resolved_model:
+                    _tried.append(f"{_fb_model} ({_fb_provider}): unavailable")
+                    continue
+
+                _fb_base_url = str(getattr(_fb_client, "base_url", ""))
+                _fb_api_key_raw = getattr(_fb_client, "api_key", "")
+                _fb_api_key = (
+                    ""
+                    if callable(_fb_api_key_raw) and not isinstance(_fb_api_key_raw, str)
+                    else str(_fb_api_key_raw or "")
+                )
+
+                _fb_context = get_model_context_length(
+                    _fb_resolved_model,
+                    base_url=_fb_base_url,
+                    api_key=_fb_api_key,
+                    provider=_fb_provider,
+                    custom_providers=getattr(agent, "_custom_providers", None),
+                )
+
+                if _fb_context and _fb_context < MINIMUM_CONTEXT_LENGTH:
+                    _tried.append(
+                        f"{_fb_resolved_model} ({_fb_provider}): "
+                        f"{_fb_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"
+                    )
+                    continue
+
+                # ── Found a suitable fallback ──────────────────────────
+                logger.warning(
+                    "Compression model %s (%s) has only %d token context "
+                    "(minimum %d).  Falling back to %s (%s) with %d token context.",
+                    aux_model, _aux_cfg_provider or "auto", aux_context,
+                    MINIMUM_CONTEXT_LENGTH, _fb_resolved_model, _fb_provider,
+                    _fb_context or 0,
+                )
+
+                agent.context_compressor._compression_fallback = {
+                    "provider": _fb_provider,
+                    "model": _fb_resolved_model,
+                    "base_url": _fb_base_url,
+                    "api_key": _fb_api_key,
+                }
+
+                _msg = (
+                    f"⚠ Compression model {aux_model} has only "
+                    f"{aux_context:,} token context (minimum "
+                    f"{MINIMUM_CONTEXT_LENGTH:,} required).  "
+                    f"Falling back to {_fb_resolved_model} ({_fb_provider}) "
+                    f"for summaries."
+                )
+                agent._compression_warning = _msg
+                agent._emit_status(_msg)
+                return
+
+            except Exception as _fb_err:
+                _tried.append(f"{_fb_model} ({_fb_provider}): {_fb_err}")
+                continue
+
+        # No fallback worked — warn and let compression run without
+        # summaries (same behavior as 'no auxiliary LLM' above).
+        _all_tried = "; ".join(_tried)
+        _msg = (
+            f"⚠ No suitable compression model available.  "
+            f"Tried: {_all_tried}.  "
+            f"Compression will drop middle turns without summaries.  "
+            f"Run `hermes setup` or set "
+            f"auxiliary.compression.model in config.yaml."
+        )
+        agent._compression_warning = _msg
+        agent._emit_status(_msg)
+        logger.warning("Compression model fallback exhausted: %s", _all_tried)
+        return
     except Exception as exc:
         logger.debug(
             "Compression feasibility check failed (non-fatal): %s", exc
diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py
index 3be0f0235..008cb6f32 100644
--- a/tests/run_agent/test_compression_feasibility.py
+++ b/tests/run_agent/test_compression_feasibility.py
@@ -57,6 +57,7 @@ def _make_agent(
     compressor = MagicMock(spec=ContextCompressor)
     compressor.context_length = main_context
     compressor.threshold_tokens = int(main_context * threshold_percent)
+    compressor._compression_fallback = None
     agent.context_compressor = compressor
 
     return agent
@@ -101,24 +102,169 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
 @patch("agent.model_metadata.get_model_context_length", return_value=32_768)
 @patch("agent.auxiliary_client.get_text_auxiliary_client")
 def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
-    """Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
-    refuses to start (ValueError), mirroring the main-model rejection."""
+    """When aux context < MINIMUM_CONTEXT_LENGTH (64K) and no fallback
+    providers are configured, a warning is emitted and compression will
+    operate without summaries.  Previously this raised ValueError; now it
+    degrades gracefully so a model switch doesn't kill the session."""
     agent = _make_agent(main_context=200_000, threshold_percent=0.50)
     mock_client = MagicMock()
     mock_client.base_url = "https://openrouter.ai/api/v1"
     mock_client.api_key = "sk-aux"
     mock_get_client.return_value = (mock_client, "tiny-aux-model")
 
-    agent._emit_status = lambda msg: None
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
 
-    with pytest.raises(ValueError) as exc_info:
+    # No fallback chain → should warn, not raise
+    agent._fallback_chain = []
+    agent._check_compression_model_feasibility()
+
+    assert len(messages) == 1
+    assert "No suitable compression model" in messages[0]
+    assert "tiny-aux-model" in messages[0]
+    assert "32,768" in messages[0]
+    assert "64,000" in messages[0]
+    assert agent._compression_warning is not None
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_falls_back_to_chain_when_aux_below_minimum(mock_get_client, mock_ctx_len):
+    """When the primary aux model fails the context-length floor, the
+    feasibility check tries each fallback provider in order, using the
+    first one that meets MINIMUM_CONTEXT_LENGTH."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    # Primary aux model: too small (32K)
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny-aux-model")
+
+    # Fallback chain: two providers, first one meets the floor
+    agent._fallback_chain = [
+        {"provider": "opencode_go", "model": "deepseek-v4-pro"},
+        {"provider": "custom", "model": "gemma-local",
+         "base_url": "http://127.0.0.1:8081/v1", "api_key": "no-key"},
+    ]
+
+    # Mock resolve_provider_client for the fallback resolution
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://api.opencode.ai/v1"
+    mock_fb_client.api_key = "sk-fallback"
+
+    # get_model_context_length: first return 32K (primary fail),
+    # then return 128K (fallback success)
+    mock_ctx_len.side_effect = [32_768, 128_000]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               return_value=(mock_fb_client, "deepseek-v4-pro")) as mock_resolve:
         agent._check_compression_model_feasibility()
 
-    err = str(exc_info.value)
-    assert "tiny-aux-model" in err
-    assert "32,768" in err
-    assert "64,000" in err
-    assert "below the minimum" in err
+    # Should have resolved the fallback provider
+    mock_resolve.assert_called_once()
+    # First two positional args: provider, model
+    assert mock_resolve.call_args[0][0] == "opencode_go"
+    assert mock_resolve.call_args[0][1] == "deepseek-v4-pro"
+
+    # Warning should mention the fallback choice
+    assert len(messages) == 1
+    assert "Falling back to" in messages[0]
+    assert "deepseek-v4-pro" in messages[0]
+    assert "opencode_go" in messages[0]
+
+    # Fallback dict stored on compressor
+    fb = agent.context_compressor._compression_fallback
+    assert fb is not None
+    assert fb["provider"] == "opencode_go"
+    assert fb["model"] == "deepseek-v4-pro"
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_falls_back_past_unavailable_provider(mock_get_client, mock_ctx_len):
+    """When the first fallback provider is unavailable, skip it and
+    try the next one."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny")
+
+    # Fallback chain: first unavailable, second works
+    agent._fallback_chain = [
+        {"provider": "broken-provider", "model": "broken-model"},
+        {"provider": "opencode_go", "model": "deepseek-v4-pro"},
+    ]
+
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://api.opencode.ai/v1"
+    mock_fb_client.api_key = "sk-fallback"
+
+    # Only resolved providers get a context lookup: primary 32K, opencode_go 128K
+    mock_ctx_len.side_effect = [32_768, 128_000]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+
+    # First resolve returns None (unavailable), second returns client
+    mock_resolve_values = [(None, None), (mock_fb_client, "deepseek-v4-pro")]
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               side_effect=mock_resolve_values) as mock_resolve:
+        agent._check_compression_model_feasibility()
+
+    # Both fallbacks are resolved; only the second reaches the context lookup
+    assert mock_resolve.call_count == 2
+    assert mock_ctx_len.call_count == 2
+
+    fb = agent.context_compressor._compression_fallback
+    assert fb is not None
+    assert fb["provider"] == "opencode_go"
+
+
+@patch("agent.model_metadata.get_model_context_length")
+@patch("agent.auxiliary_client.get_text_auxiliary_client")
+def test_warns_when_all_fallbacks_exhausted(mock_get_client, mock_ctx_len):
+    """When every fallback provider also fails the context floor or is
+    unavailable, emit a warning and degrade to no-summary mode without
+    raising."""
+    agent = _make_agent(main_context=200_000, threshold_percent=0.50)
+
+    mock_primary_client = MagicMock()
+    mock_primary_client.base_url = "https://openrouter.ai/api/v1"
+    mock_primary_client.api_key = "sk-aux"
+    mock_get_client.return_value = (mock_primary_client, "tiny-main")
+
+    agent._fallback_chain = [
+        {"provider": "small-provider", "model": "small-model"},
+    ]
+
+    # Fallback also too small
+    mock_fb_client = MagicMock()
+    mock_fb_client.base_url = "https://small.api/v1"
+    mock_fb_client.api_key = "sk-small"
+    mock_ctx_len.side_effect = [32_768, 16_384]
+
+    messages = []
+    agent._emit_status = lambda msg: messages.append(msg)
+    # _make_agent already sets _compression_fallback to None; reset it here
+    # so the final assertion visibly starts from a known "no fallback" state.
+    agent.context_compressor._compression_fallback = None
+
+    with patch("agent.auxiliary_client.resolve_provider_client",
+               return_value=(mock_fb_client, "small-model")):
+        agent._check_compression_model_feasibility()
+
+    assert len(messages) == 1
+    assert "No suitable compression model" in messages[0]
+    assert "small-model" in messages[0]
+    assert agent._compression_warning is not None
+    # No fallback on compressor
+    assert agent.context_compressor._compression_fallback is None
 
 
 @patch("agent.model_metadata.get_model_context_length", return_value=200_000)