feat: fallback provider chain for auxiliary compression model
Some checks failed
OSV-Scanner / Scan lockfiles (push) Has been cancelled
Some checks failed
OSV-Scanner / Scan lockfiles (push) Has been cancelled
When the primary auxiliary compression model fails the minimum context-length check (64K floor), iterate through the user's fallback_providers chain instead of raising ValueError and killing the session.

- check_compression_model_feasibility: catch the ValueError raised by the MINIMUM_CONTEXT_LENGTH rejection, try each fallback provider in order, and store the first suitable one as _compression_fallback on the ContextCompressor.
- ContextCompressor._generate_summary: when _compression_fallback is set, override main_runtime (provider/model/base_url/api_key) so summaries route to the fallback provider.
- When all fallbacks are exhausted, emit a clear warning and continue without summaries (same as the 'no auxiliary LLM' path).

Fixes the session-killing crash when switching to a model whose provider's auto-detected compression model has <64K context.
This commit is contained in:
parent
2517917de3
commit
504954dcee
@ -580,6 +580,13 @@ class ContextCompressor(ContextEngine):
|
|||||||
|
|
||||||
self.summary_model = summary_model_override or ""
|
self.summary_model = summary_model_override or ""
|
||||||
|
|
||||||
|
# Compression-model fallback: set by check_compression_model_feasibility
|
||||||
|
# when the primary aux compression model fails the minimum context check.
|
||||||
|
# If set, _generate_summary uses this provider/model for the LLM call
|
||||||
|
# instead of the main compressor attributes. Dict keys:
|
||||||
|
# provider, model, base_url, api_key
|
||||||
|
self._compression_fallback: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
# Stores the previous compaction summary for iterative updates
|
# Stores the previous compaction summary for iterative updates
|
||||||
self._previous_summary: Optional[str] = None
|
self._previous_summary: Optional[str] = None
|
||||||
# Anti-thrashing: track whether last compression was effective
|
# Anti-thrashing: track whether last compression was effective
|
||||||
@ -1069,6 +1076,20 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
|||||||
}
|
}
|
||||||
if self.summary_model:
|
if self.summary_model:
|
||||||
call_kwargs["model"] = self.summary_model
|
call_kwargs["model"] = self.summary_model
|
||||||
|
# Compression-model fallback: when the primary aux compression
|
||||||
|
# model was rejected for insufficient context, the feasibility
|
||||||
|
# check stored a replacement provider/model here. Override the
|
||||||
|
# entire main_runtime so call_llm routes the summary request to
|
||||||
|
# the fallback provider instead of the main one.
|
||||||
|
if self._compression_fallback:
|
||||||
|
_fb = self._compression_fallback
|
||||||
|
call_kwargs["main_runtime"] = {
|
||||||
|
"model": _fb["model"],
|
||||||
|
"provider": _fb["provider"],
|
||||||
|
"base_url": _fb.get("base_url", ""),
|
||||||
|
"api_key": _fb.get("api_key", ""),
|
||||||
|
"api_mode": _fb.get("api_mode", self.api_mode),
|
||||||
|
}
|
||||||
response = call_llm(**call_kwargs)
|
response = call_llm(**call_kwargs)
|
||||||
content = response.choices[0].message.content
|
content = response.choices[0].message.content
|
||||||
# Handle cases where content is not a string (e.g., dict from llama.cpp)
|
# Handle cases where content is not a string (e.g., dict from llama.cpp)
|
||||||
|
|||||||
@ -221,9 +221,101 @@ def check_compression_model_feasibility(agent: Any) -> None:
|
|||||||
new_threshold,
|
new_threshold,
|
||||||
)
|
)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# Hard rejections (aux below minimum context) must propagate
|
# Primary compression model failed the minimum context check
|
||||||
# so the session refuses to start.
|
# (context_length < MINIMUM_CONTEXT_LENGTH). Before giving up,
|
||||||
raise
|
# try the user's fallback provider chain so a model switch or
|
||||||
|
# provider outage doesn't silently disable compression.
|
||||||
|
_fallback_chain = getattr(agent, '_fallback_chain', None) or []
|
||||||
|
_tried = [f"{aux_model} ({_aux_cfg_provider or 'auto'}): {aux_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"]
|
||||||
|
|
||||||
|
for _fb_entry in _fallback_chain:
|
||||||
|
_fb_provider = _fb_entry.get("provider", "")
|
||||||
|
_fb_model = _fb_entry.get("model", "")
|
||||||
|
if not _fb_provider or not _fb_model:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
from agent.auxiliary_client import resolve_provider_client
|
||||||
|
|
||||||
|
_fb_client, _fb_resolved_model = resolve_provider_client(
|
||||||
|
_fb_provider,
|
||||||
|
_fb_model,
|
||||||
|
explicit_base_url=_fb_entry.get("base_url", ""),
|
||||||
|
explicit_api_key=_fb_entry.get("api_key", ""),
|
||||||
|
main_runtime=agent._current_main_runtime(),
|
||||||
|
)
|
||||||
|
if _fb_client is None or not _fb_resolved_model:
|
||||||
|
_tried.append(f"{_fb_model} ({_fb_provider}): unavailable")
|
||||||
|
continue
|
||||||
|
|
||||||
|
_fb_base_url = str(getattr(_fb_client, "base_url", ""))
|
||||||
|
_fb_api_key_raw = getattr(_fb_client, "api_key", "")
|
||||||
|
_fb_api_key = (
|
||||||
|
""
|
||||||
|
if callable(_fb_api_key_raw) and not isinstance(_fb_api_key_raw, str)
|
||||||
|
else str(_fb_api_key_raw or "")
|
||||||
|
)
|
||||||
|
|
||||||
|
_fb_context = get_model_context_length(
|
||||||
|
_fb_resolved_model,
|
||||||
|
base_url=_fb_base_url,
|
||||||
|
api_key=_fb_api_key,
|
||||||
|
provider=_fb_provider,
|
||||||
|
custom_providers=getattr(agent, "_custom_providers", None),
|
||||||
|
)
|
||||||
|
|
||||||
|
if _fb_context and _fb_context < MINIMUM_CONTEXT_LENGTH:
|
||||||
|
_tried.append(
|
||||||
|
f"{_fb_resolved_model} ({_fb_provider}): "
|
||||||
|
f"{_fb_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ── Found a suitable fallback ──────────────────────────
|
||||||
|
logger.warning(
|
||||||
|
"Compression model %s (%s) has only %d token context "
|
||||||
|
"(minimum %d). Falling back to %s (%s) with %d token context.",
|
||||||
|
aux_model, _aux_cfg_provider or "auto", aux_context,
|
||||||
|
MINIMUM_CONTEXT_LENGTH, _fb_resolved_model, _fb_provider,
|
||||||
|
_fb_context or 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
agent.context_compressor._compression_fallback = {
|
||||||
|
"provider": _fb_provider,
|
||||||
|
"model": _fb_resolved_model,
|
||||||
|
"base_url": _fb_base_url,
|
||||||
|
"api_key": _fb_api_key,
|
||||||
|
}
|
||||||
|
|
||||||
|
_msg = (
|
||||||
|
f"⚠ Compression model {aux_model} has only "
|
||||||
|
f"{aux_context:,} token context (minimum "
|
||||||
|
f"{MINIMUM_CONTEXT_LENGTH:,} required). "
|
||||||
|
f"Falling back to {_fb_resolved_model} ({_fb_provider}) "
|
||||||
|
f"for summaries."
|
||||||
|
)
|
||||||
|
agent._compression_warning = _msg
|
||||||
|
agent._emit_status(_msg)
|
||||||
|
return
|
||||||
|
|
||||||
|
except Exception as _fb_err:
|
||||||
|
_tried.append(f"{_fb_model} ({_fb_provider}): {_fb_err}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# No fallback worked — warn and let compression run without
|
||||||
|
# summaries (same behavior as 'no auxiliary LLM' above).
|
||||||
|
_all_tried = "; ".join(_tried)
|
||||||
|
_msg = (
|
||||||
|
f"⚠ No suitable compression model available. "
|
||||||
|
f"Tried: {_all_tried}. "
|
||||||
|
f"Compression will drop middle turns without summaries. "
|
||||||
|
f"Run `hermes setup` or set "
|
||||||
|
f"auxiliary.compression.model in config.yaml."
|
||||||
|
)
|
||||||
|
agent._compression_warning = _msg
|
||||||
|
agent._emit_status(_msg)
|
||||||
|
logger.warning("Compression model fallback exhausted: %s", _all_tried)
|
||||||
|
return
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Compression feasibility check failed (non-fatal): %s", exc
|
"Compression feasibility check failed (non-fatal): %s", exc
|
||||||
|
|||||||
@ -57,6 +57,7 @@ def _make_agent(
|
|||||||
compressor = MagicMock(spec=ContextCompressor)
|
compressor = MagicMock(spec=ContextCompressor)
|
||||||
compressor.context_length = main_context
|
compressor.context_length = main_context
|
||||||
compressor.threshold_tokens = int(main_context * threshold_percent)
|
compressor.threshold_tokens = int(main_context * threshold_percent)
|
||||||
|
compressor._compression_fallback = None
|
||||||
agent.context_compressor = compressor
|
agent.context_compressor = compressor
|
||||||
|
|
||||||
return agent
|
return agent
|
||||||
@ -101,24 +102,169 @@ def test_auto_corrects_threshold_when_aux_context_below_threshold(mock_get_clien
|
|||||||
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
@patch("agent.model_metadata.get_model_context_length", return_value=32_768)
|
||||||
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||||
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
|
def test_rejects_aux_below_minimum_context(mock_get_client, mock_ctx_len):
|
||||||
"""Hard floor: aux context < MINIMUM_CONTEXT_LENGTH (64K) → session
|
"""When aux context < MINIMUM_CONTEXT_LENGTH (64K) and no fallback
|
||||||
refuses to start (ValueError), mirroring the main-model rejection."""
|
providers are configured, a warning is emitted and compression will
|
||||||
|
operate without summaries. Previously this raised ValueError; now it
|
||||||
|
degrades gracefully so a model switch doesn't kill the session."""
|
||||||
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||||
mock_client = MagicMock()
|
mock_client = MagicMock()
|
||||||
mock_client.base_url = "https://openrouter.ai/api/v1"
|
mock_client.base_url = "https://openrouter.ai/api/v1"
|
||||||
mock_client.api_key = "sk-aux"
|
mock_client.api_key = "sk-aux"
|
||||||
mock_get_client.return_value = (mock_client, "tiny-aux-model")
|
mock_get_client.return_value = (mock_client, "tiny-aux-model")
|
||||||
|
|
||||||
agent._emit_status = lambda msg: None
|
messages = []
|
||||||
|
agent._emit_status = lambda msg: messages.append(msg)
|
||||||
|
|
||||||
with pytest.raises(ValueError) as exc_info:
|
# No fallback chain → should warn, not raise
|
||||||
|
agent._fallback_chain = []
|
||||||
|
agent._check_compression_model_feasibility()
|
||||||
|
|
||||||
|
assert len(messages) == 1
|
||||||
|
assert "No suitable compression model" in messages[0]
|
||||||
|
assert "tiny-aux-model" in messages[0]
|
||||||
|
assert "32,768" in messages[0]
|
||||||
|
assert "64,000" in messages[0]
|
||||||
|
assert agent._compression_warning is not None
|
||||||
|
|
||||||
|
|
||||||
|
@patch("agent.model_metadata.get_model_context_length")
|
||||||
|
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||||
|
def test_falls_back_to_chain_when_aux_below_minimum(mock_get_client, mock_ctx_len):
|
||||||
|
"""When the primary aux model fails the context-length floor, the
|
||||||
|
feasibility check tries each fallback provider in order, using the
|
||||||
|
first one that meets MINIMUM_CONTEXT_LENGTH."""
|
||||||
|
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||||
|
|
||||||
|
# Primary aux model: too small (32K)
|
||||||
|
mock_primary_client = MagicMock()
|
||||||
|
mock_primary_client.base_url = "https://openrouter.ai/api/v1"
|
||||||
|
mock_primary_client.api_key = "sk-aux"
|
||||||
|
mock_get_client.return_value = (mock_primary_client, "tiny-aux-model")
|
||||||
|
|
||||||
|
# Fallback chain: two providers, first one meets the floor
|
||||||
|
agent._fallback_chain = [
|
||||||
|
{"provider": "opencode_go", "model": "deepseek-v4-pro"},
|
||||||
|
{"provider": "custom", "model": "gemma-local",
|
||||||
|
"base_url": "http://127.0.0.1:8081/v1", "api_key": "no-key"},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Mock resolve_provider_client for the fallback resolution
|
||||||
|
mock_fb_client = MagicMock()
|
||||||
|
mock_fb_client.base_url = "https://api.opencode.ai/v1"
|
||||||
|
mock_fb_client.api_key = "sk-fallback"
|
||||||
|
|
||||||
|
# get_model_context_length: first return 32K (primary fail),
|
||||||
|
# then return 128K (fallback success)
|
||||||
|
mock_ctx_len.side_effect = [32_768, 128_000]
|
||||||
|
|
||||||
|
messages = []
|
||||||
|
agent._emit_status = lambda msg: messages.append(msg)
|
||||||
|
|
||||||
|
with patch("agent.auxiliary_client.resolve_provider_client",
|
||||||
|
return_value=(mock_fb_client, "deepseek-v4-pro")) as mock_resolve:
|
||||||
agent._check_compression_model_feasibility()
|
agent._check_compression_model_feasibility()
|
||||||
|
|
||||||
err = str(exc_info.value)
|
# Should have resolved the fallback provider
|
||||||
assert "tiny-aux-model" in err
|
mock_resolve.assert_called_once()
|
||||||
assert "32,768" in err
|
# First two positional args: provider, model
|
||||||
assert "64,000" in err
|
assert mock_resolve.call_args[0][0] == "opencode_go"
|
||||||
assert "below the minimum" in err
|
assert mock_resolve.call_args[0][1] == "deepseek-v4-pro"
|
||||||
|
|
||||||
|
# Warning should mention the fallback choice
|
||||||
|
assert len(messages) == 1
|
||||||
|
assert "Falling back to" in messages[0]
|
||||||
|
assert "deepseek-v4-pro" in messages[0]
|
||||||
|
assert "opencode_go" in messages[0]
|
||||||
|
|
||||||
|
# Fallback dict stored on compressor
|
||||||
|
fb = agent.context_compressor._compression_fallback
|
||||||
|
assert fb is not None
|
||||||
|
assert fb["provider"] == "opencode_go"
|
||||||
|
assert fb["model"] == "deepseek-v4-pro"
|
||||||
|
|
||||||
|
|
||||||
|
@patch("agent.model_metadata.get_model_context_length")
|
||||||
|
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||||
|
def test_falls_back_past_unavailable_provider(mock_get_client, mock_ctx_len):
|
||||||
|
"""When the first fallback provider is unavailable, skip it and
|
||||||
|
try the next one."""
|
||||||
|
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||||
|
|
||||||
|
mock_primary_client = MagicMock()
|
||||||
|
mock_primary_client.base_url = "https://openrouter.ai/api/v1"
|
||||||
|
mock_primary_client.api_key = "sk-aux"
|
||||||
|
mock_get_client.return_value = (mock_primary_client, "tiny")
|
||||||
|
|
||||||
|
# Fallback chain: first unavailable, second works
|
||||||
|
agent._fallback_chain = [
|
||||||
|
{"provider": "broken-provider", "model": "broken-model"},
|
||||||
|
{"provider": "opencode_go", "model": "deepseek-v4-pro"},
|
||||||
|
]
|
||||||
|
|
||||||
|
mock_fb_client = MagicMock()
|
||||||
|
mock_fb_client.base_url = "https://api.opencode.ai/v1"
|
||||||
|
mock_fb_client.api_key = "sk-fallback"
|
||||||
|
|
||||||
|
# Primary: 32K (fail), broken-provider: unavailable, opencode_go: 128K
|
||||||
|
mock_ctx_len.side_effect = [32_768, None, 128_000]
|
||||||
|
|
||||||
|
messages = []
|
||||||
|
agent._emit_status = lambda msg: messages.append(msg)
|
||||||
|
|
||||||
|
# First resolve returns None (unavailable), second returns client
|
||||||
|
mock_resolve_values = [(None, None), (mock_fb_client, "deepseek-v4-pro")]
|
||||||
|
with patch("agent.auxiliary_client.resolve_provider_client",
|
||||||
|
side_effect=mock_resolve_values) as mock_resolve:
|
||||||
|
agent._check_compression_model_feasibility()
|
||||||
|
|
||||||
|
# Should have tried both fallbacks
|
||||||
|
assert mock_resolve.call_count == 2
|
||||||
|
|
||||||
|
# Should succeed with the second fallback
|
||||||
|
fb = agent.context_compressor._compression_fallback
|
||||||
|
assert fb is not None
|
||||||
|
assert fb["provider"] == "opencode_go"
|
||||||
|
|
||||||
|
|
||||||
|
@patch("agent.model_metadata.get_model_context_length")
|
||||||
|
@patch("agent.auxiliary_client.get_text_auxiliary_client")
|
||||||
|
def test_warns_when_all_fallbacks_exhausted(mock_get_client, mock_ctx_len):
|
||||||
|
"""When every fallback provider also fails the context floor or is
|
||||||
|
unavailable, emit a warning and degrade to no-summary mode without
|
||||||
|
raising."""
|
||||||
|
agent = _make_agent(main_context=200_000, threshold_percent=0.50)
|
||||||
|
|
||||||
|
mock_primary_client = MagicMock()
|
||||||
|
mock_primary_client.base_url = "https://openrouter.ai/api/v1"
|
||||||
|
mock_primary_client.api_key = "sk-aux"
|
||||||
|
mock_get_client.return_value = (mock_primary_client, "tiny-main")
|
||||||
|
|
||||||
|
agent._fallback_chain = [
|
||||||
|
{"provider": "small-provider", "model": "small-model"},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Fallback also too small
|
||||||
|
mock_fb_client = MagicMock()
|
||||||
|
mock_fb_client.base_url = "https://small.api/v1"
|
||||||
|
mock_fb_client.api_key = "sk-small"
|
||||||
|
mock_ctx_len.side_effect = [32_768, 16_384]
|
||||||
|
|
||||||
|
messages = []
|
||||||
|
agent._emit_status = lambda msg: messages.append(msg)
|
||||||
|
# Mock compressor won't have _compression_fallback until set —
|
||||||
|
# initialize it so the final assertion works.
|
||||||
|
agent.context_compressor._compression_fallback = None
|
||||||
|
|
||||||
|
with patch("agent.auxiliary_client.resolve_provider_client",
|
||||||
|
return_value=(mock_fb_client, "small-model")):
|
||||||
|
agent._check_compression_model_feasibility()
|
||||||
|
|
||||||
|
assert len(messages) == 1
|
||||||
|
assert "No suitable compression model" in messages[0]
|
||||||
|
assert "small-model" in messages[0]
|
||||||
|
assert agent._compression_warning is not None
|
||||||
|
# No fallback on compressor
|
||||||
|
assert agent.context_compressor._compression_fallback is None
|
||||||
|
|
||||||
|
|
||||||
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
|
@patch("agent.model_metadata.get_model_context_length", return_value=200_000)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user