From 504954dcee60f7c5e6eaf41ef93a104041088383 Mon Sep 17 00:00:00 2001 From: Anton Palgunov Date: Fri, 29 May 2026 14:52:31 +0000 Subject: [PATCH] feat: fallback provider chain for auxiliary compression model When the primary auxiliary compression model fails the minimum context-length check (64K floor), iterate through the user's fallback_providers chain instead of raising ValueError and killing the session. - check_compression_model_feasibility: catch ValueError from MINIMUM_CONTEXT_LENGTH rejection, try each fallback provider in order, store the first suitable one as _compression_fallback on the ContextCompressor - ContextCompressor._generate_summary: when _compression_fallback is set, override main_runtime (provider/model/base_url/api_key) so summaries route to the fallback provider - When all fallbacks are exhausted, emit a clear warning and continue without summaries (same as 'no auxiliary LLM' path) Fixes the session-killing crash when switching to a model whose provider's auto-detected compression model has <64K context. --- agent/context_compressor.py | 21 +++ agent/conversation_compression.py | 98 ++++++++++- .../run_agent/test_compression_feasibility.py | 164 +++++++++++++++++- 3 files changed, 271 insertions(+), 12 deletions(-) diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 49907e2c3..ba5fcf596 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -580,6 +580,13 @@ class ContextCompressor(ContextEngine): self.summary_model = summary_model_override or "" + # Compression-model fallback: set by check_compression_model_feasibility + # when the primary aux compression model fails the minimum context check. + # If set, _generate_summary uses this provider/model for the LLM call + # instead of the main compressor attributes. 
Dict keys: + # provider, model, base_url, api_key + self._compression_fallback: Optional[Dict[str, str]] = None + # Stores the previous compaction summary for iterative updates self._previous_summary: Optional[str] = None # Anti-thrashing: track whether last compression was effective @@ -1069,6 +1076,20 @@ The user has requested that this compaction PRIORITISE preserving all informatio } if self.summary_model: call_kwargs["model"] = self.summary_model + # Compression-model fallback: when the primary aux compression + # model was rejected for insufficient context, the feasibility + # check stored a replacement provider/model here. Override the + # entire main_runtime so call_llm routes the summary request to + # the fallback provider instead of the main one. + if self._compression_fallback: + _fb = self._compression_fallback + call_kwargs["main_runtime"] = { + "model": _fb["model"], + "provider": _fb["provider"], + "base_url": _fb.get("base_url", ""), + "api_key": _fb.get("api_key", ""), + "api_mode": _fb.get("api_mode", self.api_mode), + } response = call_llm(**call_kwargs) content = response.choices[0].message.content # Handle cases where content is not a string (e.g., dict from llama.cpp) diff --git a/agent/conversation_compression.py b/agent/conversation_compression.py index a620f343e..5e0099c4f 100644 --- a/agent/conversation_compression.py +++ b/agent/conversation_compression.py @@ -221,9 +221,101 @@ def check_compression_model_feasibility(agent: Any) -> None: new_threshold, ) except ValueError: - # Hard rejections (aux below minimum context) must propagate - # so the session refuses to start. - raise + # Primary compression model failed the minimum context check + # (context_length < MINIMUM_CONTEXT_LENGTH). Before giving up, + # try the user's fallback provider chain so a model switch no + # longer kills the session by propagating this ValueError.
+ _fallback_chain = getattr(agent, '_fallback_chain', None) or [] + _tried = [f"{aux_model} ({_aux_cfg_provider or 'auto'}): {aux_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"] + + for _fb_entry in _fallback_chain: + _fb_provider = _fb_entry.get("provider", "") + _fb_model = _fb_entry.get("model", "") + if not _fb_provider or not _fb_model: + continue + + try: + from agent.auxiliary_client import resolve_provider_client + + _fb_client, _fb_resolved_model = resolve_provider_client( + _fb_provider, + _fb_model, + explicit_base_url=_fb_entry.get("base_url", ""), + explicit_api_key=_fb_entry.get("api_key", ""), + main_runtime=agent._current_main_runtime(), + ) + if _fb_client is None or not _fb_resolved_model: + _tried.append(f"{_fb_model} ({_fb_provider}): unavailable") + continue + + _fb_base_url = str(getattr(_fb_client, "base_url", "")) + _fb_api_key_raw = getattr(_fb_client, "api_key", "") + _fb_api_key = ( + "" + if callable(_fb_api_key_raw) and not isinstance(_fb_api_key_raw, str) + else str(_fb_api_key_raw or "") + ) + + _fb_context = get_model_context_length( + _fb_resolved_model, + base_url=_fb_base_url, + api_key=_fb_api_key, + provider=_fb_provider, + custom_providers=getattr(agent, "_custom_providers", None), + ) + + if _fb_context and _fb_context < MINIMUM_CONTEXT_LENGTH: + _tried.append( + f"{_fb_resolved_model} ({_fb_provider}): " + f"{_fb_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}" + ) + continue + + # ── Found a suitable fallback ────────────────────────── + logger.warning( + "Compression model %s (%s) has only %d token context " + "(minimum %d). 
Falling back to %s (%s) with %d token context.", + aux_model, _aux_cfg_provider or "auto", aux_context, + MINIMUM_CONTEXT_LENGTH, _fb_resolved_model, _fb_provider, + _fb_context or 0, + ) + + agent.context_compressor._compression_fallback = { + "provider": _fb_provider, + "model": _fb_resolved_model, + "base_url": _fb_base_url, + "api_key": _fb_api_key, + } + + _msg = ( + f"⚠ Compression model {aux_model} has only " + f"{aux_context:,} token context (minimum " + f"{MINIMUM_CONTEXT_LENGTH:,} required). " + f"Falling back to {_fb_resolved_model} ({_fb_provider}) " + f"for summaries." + ) + agent._compression_warning = _msg + agent._emit_status(_msg) + return + + except Exception as _fb_err: + _tried.append(f"{_fb_model} ({_fb_provider}): {_fb_err}") + continue + + # No fallback worked — warn and let compression run without + # summaries (same behavior as 'no auxiliary LLM' above). + _all_tried = "; ".join(_tried) + _msg = ( + f"⚠ No suitable compression model available. " + f"Tried: {_all_tried}. " + f"Compression will drop middle turns without summaries. " + f"Run `hermes setup` or set " + f"auxiliary.compression.model in config.yaml." 
+ ) + agent._compression_warning = _msg + agent._emit_status(_msg) + logger.warning("Compression model fallback exhausted: %s", _all_tried) + return except Exception as exc: logger.debug( "Compression feasibility check failed (non-fatal): %s", exc diff --git a/tests/run_agent/test_compression_feasibility.py b/tests/run_agent/test_compression_feasibility.py index 3be0f0235..008cb6f32 100644 --- a/tests/run_agent/test_compression_feasibility.py +++ b/tests/run_agent/test_compression_feasibility.py @@ -57,6 +57,7 @@ def _make_agent( compressor = MagicMock(spec=ContextCompressor) compressor.context_length = main_context compressor.threshold_tokens = int(main_context * threshold_percent) + compressor._compression_fallback = None agent.context_compressor = compressor return agent @@ -101,24 +102,169 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien @patch("agent.model_metadata.get_model_context_length", return_value=32_768) @patch("agent.auxiliary_client.get_text_auxiliary_client") def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len): - """Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session - refuses to start (ValueError), mirroring the main-model rejection.""" + """When aux context < MINIMUM_CONTEXT_LENGTH (64K) and no fallback + providers are configured, a warning is emitted and compression will + operate without summaries. 
Previously this raised ValueError; now it + degrades gracefully so a model switch doesn't kill the session.""" agent = _make_agent(main_context=200_000, threshold_percent=0.50) mock_client = MagicMock() mock_client.base_url = "https://openrouter.ai/api/v1" mock_client.api_key = "sk-aux" mock_get_client.return_value = (mock_client, "tiny-aux-model") - agent._emit_status = lambda msg: None + messages = [] + agent._emit_status = lambda msg: messages.append(msg) - with pytest.raises(ValueError) as exc_info: + # No fallback chain → should warn, not raise + agent._fallback_chain = [] + agent._check_compression_model_feasibility() + + assert len(messages) == 1 + assert "No suitable compression model" in messages[0] + assert "tiny-aux-model" in messages[0] + assert "32,768" in messages[0] + assert "64,000" in messages[0] + assert agent._compression_warning is not None + + +@patch("agent.model_metadata.get_model_context_length") +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_falls_back_to_chain_when_aux_below_minimum(mock_get_client, mock_ctx_len): + """When the primary aux model fails the context-length floor, the + feasibility check tries each fallback provider in order, using the + first one that meets MINIMUM_CONTEXT_LENGTH.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.50) + + # Primary aux model: too small (32K) + mock_primary_client = MagicMock() + mock_primary_client.base_url = "https://openrouter.ai/api/v1" + mock_primary_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_primary_client, "tiny-aux-model") + + # Fallback chain: two providers, first one meets the floor + agent._fallback_chain = [ + {"provider": "opencode_go", "model": "deepseek-v4-pro"}, + {"provider": "custom", "model": "gemma-local", + "base_url": "http://127.0.0.1:8081/v1", "api_key": "no-key"}, + ] + + # Mock resolve_provider_client for the fallback resolution + mock_fb_client = MagicMock() + mock_fb_client.base_url = 
"https://api.opencode.ai/v1" + mock_fb_client.api_key = "sk-fallback" + + # get_model_context_length: first return 32K (primary fail), + # then return 128K (fallback success) + mock_ctx_len.side_effect = [32_768, 128_000] + + messages = [] + agent._emit_status = lambda msg: messages.append(msg) + + with patch("agent.auxiliary_client.resolve_provider_client", + return_value=(mock_fb_client, "deepseek-v4-pro")) as mock_resolve: agent._check_compression_model_feasibility() - err = str(exc_info.value) - assert "tiny-aux-model" in err - assert "32,768" in err - assert "64,000" in err - assert "below the minimum" in err + # Should have resolved the fallback provider + mock_resolve.assert_called_once() + # First two positional args: provider, model + assert mock_resolve.call_args[0][0] == "opencode_go" + assert mock_resolve.call_args[0][1] == "deepseek-v4-pro" + + # Warning should mention the fallback choice + assert len(messages) == 1 + assert "Falling back to" in messages[0] + assert "deepseek-v4-pro" in messages[0] + assert "opencode_go" in messages[0] + + # Fallback dict stored on compressor + fb = agent.context_compressor._compression_fallback + assert fb is not None + assert fb["provider"] == "opencode_go" + assert fb["model"] == "deepseek-v4-pro" + + +@patch("agent.model_metadata.get_model_context_length") +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_falls_back_past_unavailable_provider(mock_get_client, mock_ctx_len): + """When the first fallback provider is unavailable, skip it and + try the next one.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.50) + + mock_primary_client = MagicMock() + mock_primary_client.base_url = "https://openrouter.ai/api/v1" + mock_primary_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_primary_client, "tiny") + + # Fallback chain: first unavailable, second works + agent._fallback_chain = [ + {"provider": "broken-provider", "model": "broken-model"}, + {"provider": 
"opencode_go", "model": "deepseek-v4-pro"}, + ] + + mock_fb_client = MagicMock() + mock_fb_client.base_url = "https://api.opencode.ai/v1" + mock_fb_client.api_key = "sk-fallback" + + # Primary: 32K (fail); broken-provider: no context lookup; opencode_go: 128K + mock_ctx_len.side_effect = [32_768, 128_000] + + messages = [] + agent._emit_status = lambda msg: messages.append(msg) + + # First resolve returns None (unavailable), second returns client + mock_resolve_values = [(None, None), (mock_fb_client, "deepseek-v4-pro")] + with patch("agent.auxiliary_client.resolve_provider_client", + side_effect=mock_resolve_values) as mock_resolve: + agent._check_compression_model_feasibility() + + # Should have tried both fallbacks + assert mock_resolve.call_count == 2 + + # Should succeed with the second fallback + fb = agent.context_compressor._compression_fallback + assert fb is not None + assert fb["provider"] == "opencode_go" + + +@patch("agent.model_metadata.get_model_context_length") +@patch("agent.auxiliary_client.get_text_auxiliary_client") +def test_warns_when_all_fallbacks_exhausted(mock_get_client, mock_ctx_len): + """When every fallback provider also fails the context floor or is + unavailable, emit a warning and degrade to no-summary mode without + raising.""" + agent = _make_agent(main_context=200_000, threshold_percent=0.50) + + mock_primary_client = MagicMock() + mock_primary_client.base_url = "https://openrouter.ai/api/v1" + mock_primary_client.api_key = "sk-aux" + mock_get_client.return_value = (mock_primary_client, "tiny-main") + + agent._fallback_chain = [ + {"provider": "small-provider", "model": "small-model"}, + ] + + # Fallback also too small + mock_fb_client = MagicMock() + mock_fb_client.base_url = "https://small.api/v1" + mock_fb_client.api_key = "sk-small" + mock_ctx_len.side_effect = [32_768, 16_384] + + messages = [] + agent._emit_status = lambda msg: messages.append(msg) + # _make_agent already sets _compression_fallback to None; reset it + # 
here so the final assertion does not depend on that helper. + agent.context_compressor._compression_fallback = None + + with patch("agent.auxiliary_client.resolve_provider_client", + return_value=(mock_fb_client, "small-model")): + agent._check_compression_model_feasibility() + + assert len(messages) == 1 + assert "No suitable compression model" in messages[0] + assert "small-model" in messages[0] + assert agent._compression_warning is not None + # No fallback on compressor + assert agent.context_compressor._compression_fallback is None @patch("agent.model_metadata.get_model_context_length", return_value=200_000)