fix(agent): consult supports_vision override in auto-mode routing
The contributor PR (#17936) only patched the strip path in `_model_supports_vision()`. The auto-mode router in `agent/image_routing._lookup_supports_vision` still only read models.dev, so a custom-provider model declared as vision-capable would still get its images routed through vision_analyze in the default `agent.image_input_mode: auto` setting. Users had to set both `supports_vision: true` AND `image_input_mode: native` to bypass the text pipeline. Single-knob behavior now: `supports_vision: true` alone is enough in auto mode. The strip path and the routing path consult the same resolver. - Extract override resolution into `_supports_vision_override()` in agent/image_routing.py and wire it into `_lookup_supports_vision()`. - Refactor `run_agent._model_supports_vision` to call the same helper (DRY, single source of truth for the resolution order). - Strict YAML boolean coercion: `supports_vision: "false"` (quoted — a common YAML mistake) no longer coerces to True via bool() truthiness. Recognised tokens: true/false/yes/no/on/off/1/0 plus real bools and 0/1. Unrecognised values return None and fall through to models.dev. - Add @CNSeniorious000 to AUTHOR_MAP for release attribution. Tests: 26 new (TestCoerceCapabilityBool, TestSupportsVisionOverride, TestLookupSupportsVisionOverride, TestAutoModeRespectsOverride). Existing contributor tests + image_routing + vision_native_fast_path + native_image_buffer_isolation all green (92/92).
This commit is contained in:
parent
1c76689b28
commit
32aea113f0
@ -46,6 +46,84 @@ logger = logging.getLogger(__name__)
|
|||||||
_VALID_MODES = frozenset({"auto", "native", "text"})
|
_VALID_MODES = frozenset({"auto", "native", "text"})
|
||||||
|
|
||||||
|
|
||||||
|
# Strict YAML/JSON boolean coercion for capability overrides.
|
||||||
|
#
|
||||||
|
# ``bool("false")`` is True in Python because non-empty strings are truthy, so
|
||||||
|
# a user writing ``supports_vision: "false"`` (quoted — a common YAML mistake)
|
||||||
|
# would silently enable native vision routing on a model that can't actually
|
||||||
|
# handle it. Accept only the values YAML 1.1 / 1.2 treat as booleans, plus
|
||||||
|
# real ``bool`` and integer 0/1. Anything else returns None so the caller
|
||||||
|
# falls through to models.dev rather than honouring garbage.
|
||||||
|
_TRUE_TOKENS = frozenset({"true", "yes", "on", "1"})
|
||||||
|
_FALSE_TOKENS = frozenset({"false", "no", "off", "0"})
|
||||||
|
|
||||||
|
|
||||||
|
def _coerce_capability_bool(raw: Any) -> Optional[bool]:
|
||||||
|
"""Return True/False for recognised boolean values, None otherwise."""
|
||||||
|
if isinstance(raw, bool):
|
||||||
|
return raw
|
||||||
|
if isinstance(raw, int):
|
||||||
|
if raw in (0, 1):
|
||||||
|
return bool(raw)
|
||||||
|
return None
|
||||||
|
if isinstance(raw, str):
|
||||||
|
s = raw.strip().lower()
|
||||||
|
if s in _TRUE_TOKENS:
|
||||||
|
return True
|
||||||
|
if s in _FALSE_TOKENS:
|
||||||
|
return False
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _supports_vision_override(
|
||||||
|
cfg: Optional[Dict[str, Any]],
|
||||||
|
provider: str,
|
||||||
|
model: str,
|
||||||
|
) -> Optional[bool]:
|
||||||
|
"""Resolve user-declared vision capability from config.yaml.
|
||||||
|
|
||||||
|
Resolution order, first hit wins:
|
||||||
|
1. ``model.supports_vision`` (top-level shortcut for the active model)
|
||||||
|
2. ``providers.<provider>.models.<model>.supports_vision``
|
||||||
|
(named custom providers — ``provider`` may be the runtime-resolved
|
||||||
|
value ``"custom"`` and/or the user-declared name under
|
||||||
|
``model.provider``; both are tried)
|
||||||
|
|
||||||
|
Returns None when no override is set, so the caller falls through to
|
||||||
|
models.dev. Returns False explicitly only when the user wrote a
|
||||||
|
recognised boolean false token.
|
||||||
|
"""
|
||||||
|
if not isinstance(cfg, dict):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# 1. Top-level shortcut
|
||||||
|
model_cfg_raw = cfg.get("model")
|
||||||
|
model_cfg: Dict[str, Any] = model_cfg_raw if isinstance(model_cfg_raw, dict) else {}
|
||||||
|
top = _coerce_capability_bool(model_cfg.get("supports_vision"))
|
||||||
|
if top is not None:
|
||||||
|
return top
|
||||||
|
|
||||||
|
# 2. Per-provider, per-model. Named custom providers (e.g. "my-vllm")
|
||||||
|
# get rewritten to provider="custom" at runtime
|
||||||
|
# (hermes_cli/runtime_provider.py:_resolve_named_custom_runtime), so the
|
||||||
|
# config still holds the user-declared name under model.provider. Try
|
||||||
|
# both as candidate provider keys.
|
||||||
|
config_provider = str(model_cfg.get("provider") or "").strip()
|
||||||
|
providers_raw = cfg.get("providers")
|
||||||
|
providers_cfg: Dict[str, Any] = providers_raw if isinstance(providers_raw, dict) else {}
|
||||||
|
for p in dict.fromkeys(filter(None, (provider, config_provider))):
|
||||||
|
entry_raw = providers_cfg.get(p)
|
||||||
|
entry: Dict[str, Any] = entry_raw if isinstance(entry_raw, dict) else {}
|
||||||
|
models_raw = entry.get("models")
|
||||||
|
models_cfg: Dict[str, Any] = models_raw if isinstance(models_raw, dict) else {}
|
||||||
|
per_model_raw = models_cfg.get(model)
|
||||||
|
per_model: Dict[str, Any] = per_model_raw if isinstance(per_model_raw, dict) else {}
|
||||||
|
coerced = _coerce_capability_bool(per_model.get("supports_vision"))
|
||||||
|
if coerced is not None:
|
||||||
|
return coerced
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _coerce_mode(raw: Any) -> str:
|
def _coerce_mode(raw: Any) -> str:
|
||||||
"""Normalize a config value into one of the valid modes."""
|
"""Normalize a config value into one of the valid modes."""
|
||||||
if not isinstance(raw, str):
|
if not isinstance(raw, str):
|
||||||
@ -81,8 +159,20 @@ def _explicit_aux_vision_override(cfg: Optional[Dict[str, Any]]) -> bool:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _lookup_supports_vision(provider: str, model: str) -> Optional[bool]:
|
def _lookup_supports_vision(
|
||||||
"""Return True/False if we can resolve caps, None if unknown."""
|
provider: str,
|
||||||
|
model: str,
|
||||||
|
cfg: Optional[Dict[str, Any]] = None,
|
||||||
|
) -> Optional[bool]:
|
||||||
|
"""Return True/False if we can resolve caps, None if unknown.
|
||||||
|
|
||||||
|
Consults the user's ``supports_vision`` override in config.yaml first
|
||||||
|
(so custom/local models declared as vision-capable don't fall through to
|
||||||
|
text routing in ``auto`` mode), then falls back to models.dev.
|
||||||
|
"""
|
||||||
|
override = _supports_vision_override(cfg, provider, model)
|
||||||
|
if override is not None:
|
||||||
|
return override
|
||||||
if not provider or not model:
|
if not provider or not model:
|
||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
@ -123,7 +213,7 @@ def decide_image_input_mode(
|
|||||||
if _explicit_aux_vision_override(cfg):
|
if _explicit_aux_vision_override(cfg):
|
||||||
return "text"
|
return "text"
|
||||||
|
|
||||||
supports = _lookup_supports_vision(provider, model)
|
supports = _lookup_supports_vision(provider, model, cfg)
|
||||||
if supports is True:
|
if supports is True:
|
||||||
return "native"
|
return "native"
|
||||||
return "text"
|
return "text"
|
||||||
|
|||||||
24
run_agent.py
24
run_agent.py
@ -3201,7 +3201,7 @@ class AIAgent:
|
|||||||
messages (for non-vision models) or let the provider adapter handle
|
messages (for non-vision models) or let the provider adapter handle
|
||||||
them natively (for vision-capable models).
|
them natively (for vision-capable models).
|
||||||
|
|
||||||
Resolution order:
|
Resolution order (see ``agent.image_routing._supports_vision_override``):
|
||||||
1. ``model.supports_vision`` (top-level, single-model shortcut)
|
1. ``model.supports_vision`` (top-level, single-model shortcut)
|
||||||
2. ``providers.<provider>.models.<model>.supports_vision``
|
2. ``providers.<provider>.models.<model>.supports_vision``
|
||||||
3. models.dev capability lookup
|
3. models.dev capability lookup
|
||||||
@ -3209,28 +3209,12 @@ class AIAgent:
|
|||||||
misclassified as non-vision and have their images stripped.
|
misclassified as non-vision and have their images stripped.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from hermes_cli.config import cfg_get, load_config
|
from hermes_cli.config import load_config
|
||||||
|
from agent.image_routing import _lookup_supports_vision
|
||||||
cfg = load_config()
|
cfg = load_config()
|
||||||
provider = (getattr(self, "provider", "") or "").strip()
|
provider = (getattr(self, "provider", "") or "").strip()
|
||||||
model = (getattr(self, "model", "") or "").strip()
|
model = (getattr(self, "model", "") or "").strip()
|
||||||
# self.provider is the runtime-resolved value, which is rewritten
|
return _lookup_supports_vision(provider, model, cfg) is True
|
||||||
# to "custom" for named custom providers (see
|
|
||||||
# hermes_cli/runtime_provider.py:_resolve_named_custom_runtime),
|
|
||||||
# while the config still holds the user-declared name (e.g.
|
|
||||||
# "my-vllm") under model.provider. Try both as provider keys.
|
|
||||||
config_provider = str(cfg_get(cfg, "model", "provider") or "").strip()
|
|
||||||
candidates = [("model", "supports_vision")]
|
|
||||||
for p in dict.fromkeys(filter(None, (provider, config_provider))):
|
|
||||||
candidates.append(("providers", p, "models", model, "supports_vision"))
|
|
||||||
for keys in candidates:
|
|
||||||
override = cfg_get(cfg, *keys)
|
|
||||||
if override is not None:
|
|
||||||
return bool(override)
|
|
||||||
from agent.models_dev import get_model_capabilities
|
|
||||||
if not provider or not model:
|
|
||||||
return False
|
|
||||||
caps = get_model_capabilities(provider, model)
|
|
||||||
return bool(caps and caps.supports_vision)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
@ -47,6 +47,7 @@ ACP_REGISTRY_MANIFEST = REPO_ROOT / "acp_registry" / "agent.json"
|
|||||||
AUTHOR_MAP = {
|
AUTHOR_MAP = {
|
||||||
# teknium (multiple emails)
|
# teknium (multiple emails)
|
||||||
"teknium1@gmail.com": "teknium1",
|
"teknium1@gmail.com": "teknium1",
|
||||||
|
"me@promplate.dev": "CNSeniorious000",
|
||||||
"erhanyasarx@gmail.com": "erhnysr",
|
"erhanyasarx@gmail.com": "erhnysr",
|
||||||
"30366221+WorldWriter@users.noreply.github.com": "WorldWriter",
|
"30366221+WorldWriter@users.noreply.github.com": "WorldWriter",
|
||||||
"dafeng@DafengdeMacBook-Pro.local": "WorldWriter",
|
"dafeng@DafengdeMacBook-Pro.local": "WorldWriter",
|
||||||
|
|||||||
@ -9,8 +9,11 @@ from unittest.mock import patch
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from agent.image_routing import (
|
from agent.image_routing import (
|
||||||
|
_coerce_capability_bool,
|
||||||
_coerce_mode,
|
_coerce_mode,
|
||||||
_explicit_aux_vision_override,
|
_explicit_aux_vision_override,
|
||||||
|
_lookup_supports_vision,
|
||||||
|
_supports_vision_override,
|
||||||
build_native_content_parts,
|
build_native_content_parts,
|
||||||
decide_image_input_mode,
|
decide_image_input_mode,
|
||||||
)
|
)
|
||||||
@ -125,6 +128,168 @@ class TestDecideImageInputMode:
|
|||||||
assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text"
|
assert decide_image_input_mode("xiaomi", "mimo-v2.5-pro", {}) == "text"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _coerce_capability_bool ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestCoerceCapabilityBool:
|
||||||
|
def test_real_bool_passes_through(self):
|
||||||
|
assert _coerce_capability_bool(True) is True
|
||||||
|
assert _coerce_capability_bool(False) is False
|
||||||
|
|
||||||
|
def test_int_0_and_1(self):
|
||||||
|
assert _coerce_capability_bool(1) is True
|
||||||
|
assert _coerce_capability_bool(0) is False
|
||||||
|
|
||||||
|
def test_other_ints_return_none(self):
|
||||||
|
assert _coerce_capability_bool(2) is None
|
||||||
|
assert _coerce_capability_bool(-1) is None
|
||||||
|
|
||||||
|
def test_yaml_true_tokens(self):
|
||||||
|
for s in ("true", "TRUE", "True", "yes", "on", "1", " true "):
|
||||||
|
assert _coerce_capability_bool(s) is True
|
||||||
|
|
||||||
|
def test_yaml_false_tokens(self):
|
||||||
|
for s in ("false", "FALSE", "False", "no", "off", "0", " false "):
|
||||||
|
assert _coerce_capability_bool(s) is False
|
||||||
|
|
||||||
|
def test_quoted_false_does_not_silently_become_true(self):
|
||||||
|
# Regression: bool("false") is True in Python. A user writing
|
||||||
|
# supports_vision: "false" must NOT enable native vision routing.
|
||||||
|
assert _coerce_capability_bool("false") is False
|
||||||
|
|
||||||
|
def test_unrecognised_strings_return_none(self):
|
||||||
|
# None == fall through to models.dev, not a silent truthy.
|
||||||
|
assert _coerce_capability_bool("maybe") is None
|
||||||
|
assert _coerce_capability_bool("") is None
|
||||||
|
assert _coerce_capability_bool("definitely") is None
|
||||||
|
|
||||||
|
def test_other_types_return_none(self):
|
||||||
|
assert _coerce_capability_bool(None) is None
|
||||||
|
assert _coerce_capability_bool([]) is None
|
||||||
|
assert _coerce_capability_bool({}) is None
|
||||||
|
assert _coerce_capability_bool(1.5) is None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _supports_vision_override ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestSupportsVisionOverride:
|
||||||
|
def test_no_cfg_returns_none(self):
|
||||||
|
assert _supports_vision_override(None, "custom", "my-llava") is None
|
||||||
|
assert _supports_vision_override({}, "custom", "my-llava") is None
|
||||||
|
|
||||||
|
def test_top_level_shortcut_wins(self):
|
||||||
|
cfg = {"model": {"supports_vision": True}}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is True
|
||||||
|
|
||||||
|
def test_top_level_false_propagates(self):
|
||||||
|
cfg = {"model": {"supports_vision": False}}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is False
|
||||||
|
|
||||||
|
def test_per_provider_per_model_via_runtime_name(self):
|
||||||
|
cfg = {
|
||||||
|
"providers": {
|
||||||
|
"custom": {"models": {"my-llava": {"supports_vision": True}}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is True
|
||||||
|
|
||||||
|
def test_per_provider_per_model_via_config_name(self):
|
||||||
|
# Named custom provider — runtime self.provider == "custom", config
|
||||||
|
# holds the original name under model.provider.
|
||||||
|
cfg = {
|
||||||
|
"model": {"provider": "my-vllm"},
|
||||||
|
"providers": {
|
||||||
|
"my-vllm": {"models": {"my-llava": {"supports_vision": True}}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is True
|
||||||
|
|
||||||
|
def test_quoted_false_string_in_yaml_does_not_enable(self):
|
||||||
|
# Real-world: user writes supports_vision: "false" (quoted).
|
||||||
|
cfg = {"model": {"supports_vision": "false"}}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is False
|
||||||
|
|
||||||
|
def test_unrecognised_value_falls_through(self):
|
||||||
|
cfg = {"model": {"supports_vision": "maybe"}}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is None
|
||||||
|
|
||||||
|
def test_no_override_returns_none(self):
|
||||||
|
cfg = {"model": {"default": "my-llava"}}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is None
|
||||||
|
|
||||||
|
def test_malformed_sections_are_ignored(self):
|
||||||
|
# User accidentally wrote a string where a section was expected —
|
||||||
|
# don't blow up, just fall through.
|
||||||
|
cfg = {"model": "some-string", "providers": ["not-a-dict"]}
|
||||||
|
assert _supports_vision_override(cfg, "custom", "my-llava") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── _lookup_supports_vision (override-aware) ────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestLookupSupportsVisionOverride:
|
||||||
|
def test_config_override_short_circuits_models_dev(self):
|
||||||
|
# Config says True, models.dev says None — config wins.
|
||||||
|
cfg = {"model": {"supports_vision": True}}
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert _lookup_supports_vision("custom", "my-llava", cfg) is True
|
||||||
|
|
||||||
|
def test_config_override_false_beats_vision_capable_models_dev(self):
|
||||||
|
# User explicitly disables vision on a models.dev-vision-capable model.
|
||||||
|
fake_caps = type("Caps", (), {"supports_vision": True})()
|
||||||
|
cfg = {"model": {"supports_vision": False}}
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):
|
||||||
|
assert _lookup_supports_vision("anthropic", "claude-sonnet-4", cfg) is False
|
||||||
|
|
||||||
|
def test_no_override_falls_back_to_models_dev(self):
|
||||||
|
fake_caps = type("Caps", (), {"supports_vision": True})()
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=fake_caps):
|
||||||
|
assert _lookup_supports_vision("anthropic", "claude-sonnet-4", {}) is True
|
||||||
|
|
||||||
|
def test_no_override_no_models_dev_entry_returns_none(self):
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert _lookup_supports_vision("custom", "my-llava", {}) is None
|
||||||
|
|
||||||
|
def test_cfg_none_falls_back_to_models_dev(self):
|
||||||
|
# Caller didn't pass cfg at all — old call sites must still work.
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert _lookup_supports_vision("openrouter", "x", None) is None
|
||||||
|
|
||||||
|
|
||||||
|
# ─── decide_image_input_mode with auto + override ────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutoModeRespectsOverride:
|
||||||
|
def test_auto_native_for_custom_with_supports_vision_true(self):
|
||||||
|
# The motivating bug: Qwen3.6 on local llama.cpp via provider=custom.
|
||||||
|
# Without the override, auto falls back to text. With it, auto picks
|
||||||
|
# native — no need to also set agent.image_input_mode: native.
|
||||||
|
cfg = {"model": {"supports_vision": True}}
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "native"
|
||||||
|
|
||||||
|
def test_auto_text_for_custom_with_supports_vision_false(self):
|
||||||
|
cfg = {"model": {"supports_vision": False}}
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert decide_image_input_mode("custom", "some-text-only", cfg) == "text"
|
||||||
|
|
||||||
|
def test_auto_text_for_custom_with_no_override(self):
|
||||||
|
# Unchanged baseline: unknown custom model → text.
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert decide_image_input_mode("custom", "unknown", {}) == "text"
|
||||||
|
|
||||||
|
def test_explicit_aux_vision_override_still_wins(self):
|
||||||
|
# If the user has configured a dedicated vision aux backend, respect
|
||||||
|
# it even when supports_vision: true is also set.
|
||||||
|
cfg = {
|
||||||
|
"model": {"supports_vision": True},
|
||||||
|
"auxiliary": {"vision": {"provider": "openrouter", "model": "gemini-2.5-pro"}},
|
||||||
|
}
|
||||||
|
with patch("agent.models_dev.get_model_capabilities", return_value=None):
|
||||||
|
assert decide_image_input_mode("custom", "qwen3.6-35b", cfg) == "text"
|
||||||
|
|
||||||
|
|
||||||
# ─── build_native_content_parts ──────────────────────────────────────────────
|
# ─── build_native_content_parts ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user