feat: session improvements — crash-context, compression fallback, semantic RLE, telegram voice/test coverage

This commit is contained in:
Anton Palgunov 2026-05-29 15:40:24 +00:00
parent 2517917de3
commit f95f2daa4b
22 changed files with 1791 additions and 34 deletions

View File

@ -943,6 +943,10 @@ def restore_primary_runtime(agent) -> bool:
agent._fallback_activated = False
agent._fallback_index = 0
agent._emit_status(
f"✅ Primary model recovered — switching back: "
f"{agent.model} via {agent.provider}"
)
logger.info(
"Primary runtime restored for new turn: %s (%s)",
agent.model, agent.provider,

View File

@ -846,6 +846,8 @@ def _extract_responses_message_text(item: Any) -> str:
chunks: List[str] = []
for part in content:
if part is None:
continue
ptype = getattr(part, "type", None)
if ptype not in {"output_text", "text"}:
continue
@ -861,6 +863,8 @@ def _extract_responses_reasoning_text(item: Any) -> str:
if isinstance(summary, list):
chunks: List[str] = []
for part in summary:
if part is None:
continue
text = getattr(part, "text", None)
if isinstance(text, str) and text:
chunks.append(text)
@ -920,7 +924,13 @@ def _normalize_codex_response(response: Any) -> tuple[Any, str]:
saw_commentary_phase = False
saw_final_answer_phase = False
for item in output:
for idx, item in enumerate(output):
if item is None:
logger.warning(
"Codex response output contains None item (item #%d of %d); skipping.",
idx, len(output),
)
continue
item_type = getattr(item, "type", None)
item_status = getattr(item, "status", None)
if isinstance(item_status, str):

View File

@ -335,7 +335,15 @@ def run_codex_stream(agent, api_kwargs: dict, client: Any = None, on_first_delta
)
return agent._run_codex_create_stream_fallback(api_kwargs, client=active_client)
raise
except Exception as exc:
# Catch-all for unexpected errors during stream processing
# (e.g. TypeError when a response field is None that we try to iterate).
# Log with full traceback so we can diagnose, then re-raise.
logger.exception(
"Codex Responses stream raised unexpected %s: %s. %s",
type(exc).__name__, exc, agent._client_log_context(),
)
raise
def run_codex_create_stream_fallback(agent, api_kwargs: dict, client: Any = None):

View File

@ -580,6 +580,13 @@ class ContextCompressor(ContextEngine):
self.summary_model = summary_model_override or ""
# Compression-model fallback: set by check_compression_model_feasibility
# when the primary aux compression model fails the minimum context check.
# If set, _generate_summary uses this provider/model for the LLM call
# instead of the main compressor attributes. Dict keys:
# provider, model, base_url, api_key
self._compression_fallback: Optional[Dict[str, str]] = None
# Stores the previous compaction summary for iterative updates
self._previous_summary: Optional[str] = None
# Anti-thrashing: track whether last compression was effective
@ -1069,6 +1076,20 @@ The user has requested that this compaction PRIORITISE preserving all informatio
}
if self.summary_model:
call_kwargs["model"] = self.summary_model
# Compression-model fallback: when the primary aux compression
# model was rejected for insufficient context, the feasibility
# check stored a replacement provider/model here. Override the
# entire main_runtime so call_llm routes the summary request to
# the fallback provider instead of the main one.
if self._compression_fallback:
_fb = self._compression_fallback
call_kwargs["main_runtime"] = {
"model": _fb["model"],
"provider": _fb["provider"],
"base_url": _fb.get("base_url", ""),
"api_key": _fb.get("api_key", ""),
"api_mode": _fb.get("api_mode", self.api_mode),
}
response = call_llm(**call_kwargs)
content = response.choices[0].message.content
# Handle cases where content is not a string (e.g., dict from llama.cpp)

View File

@ -221,9 +221,101 @@ def check_compression_model_feasibility(agent: Any) -> None:
new_threshold,
)
except ValueError:
# Hard rejections (aux below minimum context) must propagate
# so the session refuses to start.
raise
# Primary compression model failed the minimum context check
# (context_length < MINIMUM_CONTEXT_LENGTH). Before giving up,
# try the user's fallback provider chain so a model switch or
# provider outage doesn't silently disable compression.
_fallback_chain = getattr(agent, '_fallback_chain', None) or []
_tried = [f"{aux_model} ({_aux_cfg_provider or 'auto'}): {aux_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"]
for _fb_entry in _fallback_chain:
_fb_provider = _fb_entry.get("provider", "")
_fb_model = _fb_entry.get("model", "")
if not _fb_provider or not _fb_model:
continue
try:
from agent.auxiliary_client import resolve_provider_client
_fb_client, _fb_resolved_model = resolve_provider_client(
_fb_provider,
_fb_model,
explicit_base_url=_fb_entry.get("base_url", ""),
explicit_api_key=_fb_entry.get("api_key", ""),
main_runtime=agent._current_main_runtime(),
)
if _fb_client is None or not _fb_resolved_model:
_tried.append(f"{_fb_model} ({_fb_provider}): unavailable")
continue
_fb_base_url = str(getattr(_fb_client, "base_url", ""))
_fb_api_key_raw = getattr(_fb_client, "api_key", "")
_fb_api_key = (
""
if callable(_fb_api_key_raw) and not isinstance(_fb_api_key_raw, str)
else str(_fb_api_key_raw or "")
)
_fb_context = get_model_context_length(
_fb_resolved_model,
base_url=_fb_base_url,
api_key=_fb_api_key,
provider=_fb_provider,
custom_providers=getattr(agent, "_custom_providers", None),
)
if _fb_context and _fb_context < MINIMUM_CONTEXT_LENGTH:
_tried.append(
f"{_fb_resolved_model} ({_fb_provider}): "
f"{_fb_context:,} ctx < {MINIMUM_CONTEXT_LENGTH:,}"
)
continue
# ── Found a suitable fallback ──────────────────────────
logger.warning(
"Compression model %s (%s) has only %d token context "
"(minimum %d). Falling back to %s (%s) with %d token context.",
aux_model, _aux_cfg_provider or "auto", aux_context,
MINIMUM_CONTEXT_LENGTH, _fb_resolved_model, _fb_provider,
_fb_context or 0,
)
agent.context_compressor._compression_fallback = {
"provider": _fb_provider,
"model": _fb_resolved_model,
"base_url": _fb_base_url,
"api_key": _fb_api_key,
}
_msg = (
f"⚠ Compression model {aux_model} has only "
f"{aux_context:,} token context (minimum "
f"{MINIMUM_CONTEXT_LENGTH:,} required). "
f"Falling back to {_fb_resolved_model} ({_fb_provider}) "
f"for summaries."
)
agent._compression_warning = _msg
agent._emit_status(_msg)
return
except Exception as _fb_err:
_tried.append(f"{_fb_model} ({_fb_provider}): {_fb_err}")
continue
# No fallback worked — warn and let compression run without
# summaries (same behavior as 'no auxiliary LLM' above).
_all_tried = "; ".join(_tried)
_msg = (
f"⚠ No suitable compression model available. "
f"Tried: {_all_tried}. "
f"Compression will drop middle turns without summaries. "
f"Run `hermes setup` or set "
f"auxiliary.compression.model in config.yaml."
)
agent._compression_warning = _msg
agent._emit_status(_msg)
logger.warning("Compression model fallback exhausted: %s", _all_tried)
return
except Exception as exc:
logger.debug(
"Compression feasibility check failed (non-fatal): %s", exc

View File

@ -1814,6 +1814,14 @@ def run_conversation(
break
except Exception as api_error:
# Log full traceback immediately so we can diagnose what broke.
# Many recovery paths below can swallow or transform the error;
# this guarantees we always have the original stack trace.
logger.exception(
"%sAPI call raised exception (attempt %s/%s, thread=%s): %s",
agent.log_prefix, retry_count + 1, max_retries,
agent._thread_identity(), api_error,
)
# Stop spinner before printing error messages
if thinking_spinner:
thinking_spinner.stop("(╥_╥) error, retrying...")
@ -2822,6 +2830,10 @@ def run_conversation(
# provider/network failure (malformed response body,
# truncated stream, routing layer corruption), not a
# local programming bug, and should be retried (#14782).
# Exclude TypeError when it originates from SDK response
# processing (e.g. 'NoneType' object is not iterable when
# a server sends output: null) — that's a transient API/SDK
# bug, not a local programming error. (#14950)
is_local_validation_error = (
isinstance(api_error, (ValueError, TypeError))
and not isinstance(
@ -2835,6 +2847,14 @@ def run_conversation(
# ssl.SSLError explicitly so the error classifier's
# retryable=True mapping takes effect instead.
and not isinstance(api_error, ssl.SSLError)
# TypeError from the SDK is a transient bug in API
# response processing (e.g. NoneType not iterable when
# server returns output: null). Do NOT treat it as a
# local programming error — it should be retried.
and not (
isinstance(api_error, TypeError)
and not isinstance(api_error, (UnicodeEncodeError, ssl.SSLError))
)
)
# ``FailoverReason.billing`` (HTTP 402) is NOT in this
# exclusion set. By the time we reach this block:
@ -2887,6 +2907,22 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix}❌ Non-retryable client error (HTTP {status_code}). Aborting.", force=True)
agent._vprint(f"{agent.log_prefix} 🔌 Provider: {_provider} Model: {_model}", force=True)
agent._vprint(f"{agent.log_prefix} 🌐 Endpoint: {_base}", force=True)
# Surface local validation bugs (TypeError, ValueError) with a
# traceback pointer so the user can find the root cause in logs.
if isinstance(api_error, (TypeError, ValueError)) and not isinstance(
api_error, (UnicodeEncodeError, json.JSONDecodeError, ssl.SSLError)
):
_log_dir = os.path.join(os.path.expanduser("~"), ".hermes", "logs")
agent._vprint(
f"{agent.log_prefix} 🐛 This is a local code error (not a provider failure). "
f"Full traceback: {_log_dir}/errors.log",
force=True,
)
# Also emit a status so gateway/cron users see it
agent._emit_status(
f"🐛 Local bug: {agent._summarize_api_error(api_error)} "
f"— check {_log_dir}/errors.log"
)
# Actionable guidance for common auth errors
if classified.is_auth or classified.reason == FailoverReason.billing:
if _provider in {"openai-codex", "xai-oauth", "nous"} and status_code == 401:
@ -2917,7 +2953,7 @@ def run_conversation(
agent._vprint(f"{agent.log_prefix} • Check credits: https://openrouter.ai/settings/credits", force=True)
else:
agent._vprint(f"{agent.log_prefix} 💡 This type of error won't be fixed by retrying.", force=True)
logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}")
logger.error(f"{agent.log_prefix}Non-retryable client error: {api_error}", exc_info=True)
# Skip session persistence when the error is likely
# context-overflow related (status 400 + large session).
# Persisting the failed user message would make the

View File

@ -213,24 +213,44 @@ class ResponsesApiTransport(ProviderTransport):
_normalize_codex_response,
)
import logging
_logger = logging.getLogger(__name__)
# _normalize_codex_response returns (SimpleNamespace, finish_reason_str)
msg, finish_reason = _normalize_codex_response(response)
try:
msg, finish_reason = _normalize_codex_response(response)
except Exception:
_logger.exception(
"Failed to normalize Codex response (response type=%s, has_output=%s)",
type(response).__name__,
hasattr(response, "output"),
)
raise
tool_calls = None
if msg and msg.tool_calls:
tool_calls = []
for tc in msg.tool_calls:
provider_data = {}
if hasattr(tc, "call_id") and tc.call_id:
provider_data["call_id"] = tc.call_id
if hasattr(tc, "response_item_id") and tc.response_item_id:
provider_data["response_item_id"] = tc.response_item_id
tool_calls.append(ToolCall(
id=tc.id if hasattr(tc, "id") else (tc.function.name if hasattr(tc, "function") else None),
name=tc.function.name if hasattr(tc, "function") else getattr(tc, "name", ""),
arguments=tc.function.arguments if hasattr(tc, "function") else getattr(tc, "arguments", "{}"),
provider_data=provider_data or None,
))
_raw_tc = msg.tool_calls
if not hasattr(_raw_tc, "__iter__"):
# Defensive: tool_calls should be a list; log and skip if not iterable.
_logger.warning(
"Codex response tool_calls is not iterable (type=%s, value=%r); "
"skipping tool call extraction.",
type(_raw_tc).__name__, _raw_tc,
)
else:
tool_calls = []
for tc in _raw_tc:
provider_data = {}
if hasattr(tc, "call_id") and tc.call_id:
provider_data["call_id"] = tc.call_id
if hasattr(tc, "response_item_id") and tc.response_item_id:
provider_data["response_item_id"] = tc.response_item_id
tool_calls.append(ToolCall(
id=tc.id if hasattr(tc, "id") else (tc.function.name if hasattr(tc, "function") else None),
name=tc.function.name if hasattr(tc, "function") else getattr(tc, "name", ""),
arguments=tc.function.arguments if hasattr(tc, "function") else getattr(tc, "arguments", "{}"),
provider_data=provider_data or None,
))
# Extract reasoning items for provider_data
provider_data = {}

View File

@ -0,0 +1,156 @@
# Semantic RLE Context Engine experiment
Status: experimental MVP, not enabled by default.
## Hypothesis
For very long Telegram chats, a deterministic context engine with:
- a verbatim hot tail;
- a semantic factual ledger for older turns;
- explicit stale/superseded facts;
- credential references instead of raw secrets;
- retrieval notes for manual compression focus;
will reduce "misses" versus the current lossy compression path, especially when users correct facts over time ("server X" -> "server Y") or return to obligations created many turns ago.
## MVP design
Plugin: `plugins/context_engine/semantic_rle/`.
Compression output shape:
1. original system messages;
2. `system` summary block named `Semantic RLE context ledger`;
3. last `hot_tail_messages` non-system messages preserved verbatim.
The ledger is deterministic and local-only. No cloud LLM is called by the plugin.
Ledger sections:
- Active facts
- Decisions
- Obligations
- Superseded facts
- Unresolved questions
- Credential refs
- Retrieval notes
Best-effort redaction:
- key/value secrets (`api_key=...`, `token: ...`, `Authorization: ...`) become stable `credential_ref:credential:<hash>` references;
- token-like long strings become credential refs;
- IPv4-like strings become `[REDACTED_IP]`.
## What counts as a miss
A run has a miss when the assistant, after compaction, does any of these:
1. uses an inactive/superseded fact as current;
2. loses a still-active obligation or TODO;
3. says it does not know a fact that was in the compacted cold history;
4. exposes raw fake credentials or sensitive IP-like strings from cold history;
5. answers from an older decision when a newer decision superseded it;
6. needs user correction for information that the ledger retained.
## Baseline
Baseline should be the same Telegram session corpus and prompt set with:
```yaml
context:
engine: compressor
```
Record:
- number of manual corrections per 100 turns;
- number of stale-fact answers per scenario;
- number of obligation/TODO misses;
- raw secret leakage count in model-visible compressed context;
- compressed message count and rough token count;
- whether final hot-tail messages remain byte-for-byte unchanged.
## First A/B plan
1. Select 5-10 long Telegram transcripts with known corrections and recurring tasks.
2. Replay fixed query checkpoints against baseline `compressor` and experimental `semantic_rle`.
3. Keep model/provider/toolsets constant.
4. For each checkpoint, force compression before asking the evaluation query.
5. Score blind if possible: correct/current, stale, missing, unsafe leak, or ambiguous.
## Test scenarios
- Hot tail preservation: last N messages must be unchanged.
- Server supersession: `server alpha` followed by `server beta` should keep beta active and mark alpha superseded.
- Fake token redaction: no raw fake token appears in compressed output; only credential refs.
- IP-like redaction: raw IPv4-like strings do not appear in the ledger.
- Obligations: old `todo`/`надо` messages survive as obligations.
- Unresolved questions: old question markers survive as unresolved questions.
- Deterministic failure: if extraction raises, return original messages rather than dropping context.
- Discovery/config: plugin can be discovered and explicitly loaded, but is not globally enabled by adding it to the repo.
## Manual enablement for experiment only
Do not turn it on globally in commits. For a local experiment:
```bash
hermes config set context.engine semantic_rle
# restart the CLI session or gateway process you intentionally want to test
```
Return to default:
```bash
hermes config set context.engine compressor
```
Gateway note: do not restart production gateway just because the plugin exists. Restart only a deliberately selected experiment profile/session.
## Deterministic smoke-eval
Run:
```bash
python scripts/semantic_rle_eval.py
python scripts/semantic_rle_eval.py --json
```
Current invariant results on the checked-in synthetic corpus:
- `hot_tail_only_baseline`: 4/12 checks passed.
- `semantic_rle`: 12/12 checks passed.
Covered invariants:
- current fact retained after supersession;
- old fact marked superseded;
- old decision retained;
- old obligation retained;
- old unresolved question retained;
- cold fake token/IP redacted to refs/markers;
- hot tail preserved byte-for-byte.
This is only a deterministic preflight. It proves the plugin keeps the facts in model-visible context; it does **not** prove the LLM will use them correctly in live Telegram replay.
## Logical MVP boundary
Done for MVP:
- plugin discovery and explicit loading;
- ContextEngine ABC compatibility;
- local-only deterministic compression path;
- fail-closed compression error handling;
- hot tail preservation;
- factual ledger with active/superseded facts;
- decisions/obligations/questions sections;
- cold-history fake secret and IPv4 redaction;
- smoke-eval harness and tests.
Still intentionally out of MVP:
- Better fact keys: current MVP is regex-based and intentionally conservative.
- Persistence: ledger is in-memory only; no per-chat store yet.
- Retrieval tools: no `semantic_rle_search` tool yet.
- Multilingual extraction: Russian TODO/question markers are minimal.
- Live Telegram replay/scoring against real transcripts.

View File

@ -30,6 +30,7 @@ try:
Application,
CommandHandler,
CallbackQueryHandler,
MessageReactionHandler,
MessageHandler as TelegramMessageHandler,
ContextTypes,
filters,
@ -48,6 +49,7 @@ except ImportError:
Application = Any
CommandHandler = Any
CallbackQueryHandler = Any
MessageReactionHandler = Any
TelegramMessageHandler = Any
HTTPXRequest = Any
filters = None
@ -118,7 +120,7 @@ def check_telegram_requirements() -> bool:
"""
global TELEGRAM_AVAILABLE, Update, Bot, Message, InlineKeyboardButton
global InlineKeyboardMarkup, LinkPreviewOptions, Application
global CommandHandler, CallbackQueryHandler, TelegramMessageHandler
global CommandHandler, CallbackQueryHandler, MessageReactionHandler, TelegramMessageHandler
global ContextTypes, filters, ParseMode, ChatType, HTTPXRequest
if TELEGRAM_AVAILABLE:
return True
@ -137,6 +139,7 @@ def check_telegram_requirements() -> bool:
from telegram.ext import (
Application as _App, CommandHandler as _CH,
CallbackQueryHandler as _CQH,
MessageReactionHandler as _MRH,
MessageHandler as _MH,
ContextTypes as _CT, filters as _filters,
)
@ -153,6 +156,7 @@ def check_telegram_requirements() -> bool:
Application = _App
CommandHandler = _CH
CallbackQueryHandler = _CQH
MessageReactionHandler = _MRH
TelegramMessageHandler = _MH
ContextTypes = _CT
filters = _filters
@ -1581,6 +1585,14 @@ class TelegramAdapter(BasePlatformAdapter):
))
# Handle inline keyboard button callbacks (update prompts)
self._app.add_handler(CallbackQueryHandler(self._handle_callback_query))
# Handle user reactions to bot/news messages. The existing
# telegram.reactions flag controls the bot's own processing
# lifecycle reactions (👀/👍/👎); this handler is for inbound
# Telegram ``message_reaction`` updates so reaction-only feedback
# can reach the agent/session.
if MessageReactionHandler is not Any:
self._app.add_handler(MessageReactionHandler(self._handle_message_reaction))
# Start polling — retry initialize() for transient TLS resets
try:
@ -2073,6 +2085,46 @@ class TelegramAdapter(BasePlatformAdapter):
is_connect_timeout = self._looks_like_connect_timeout(e)
return SendResult(success=False, error=str(e), retryable=(is_connect_timeout or not is_timeout))
def _thread_metadata_for_voice_ack(self, event: MessageEvent) -> Optional[Dict[str, Any]]:
"""Build thread metadata for a platform-side pre-STT voice ACK.
The ACK is sent before gateway STT calls run, so it must stay entirely
platform-side and must not become an assistant/model turn. Reuse the
same Telegram thread/DM-topic metadata shape the runner uses for normal
replies so the short status lands in the same chat/topic.
"""
source = event.source
thread_id = getattr(source, "thread_id", None)
if thread_id is None:
return None
metadata: Dict[str, Any] = {"thread_id": thread_id}
if getattr(source, "platform", None) == Platform.TELEGRAM and getattr(source, "chat_type", None) == "dm":
metadata["telegram_dm_topic_reply_fallback"] = True
tid = str(thread_id)
if tid and tid not in {"", "1"}:
metadata["direct_messages_topic_id"] = tid
anchor = getattr(source, "message_id", None) or event.message_id
if anchor is not None:
metadata["telegram_reply_to_message_id"] = str(anchor)
return metadata
async def _send_voice_pre_stt_ack(self, event: MessageEvent) -> None:
"""Send a best-effort Telegram status before slow voice transcription."""
try:
result = await self.send(
event.source.chat_id,
"🎧 получил, распознаю",
reply_to=event.message_id,
metadata=self._thread_metadata_for_voice_ack(event),
)
if not result.success:
logger.warning(
"[Telegram] Pre-STT voice ACK was not delivered: %s",
result.error or "unknown error",
)
except Exception:
logger.exception("[Telegram] Failed to send pre-STT voice ACK; continuing transcription")
async def send_or_update_status(
self,
chat_id: str,
@ -4947,6 +4999,113 @@ class TelegramAdapter(BasePlatformAdapter):
"""
return getattr(update, "effective_message", None) or getattr(update, "message", None)
async def _handle_message_reaction(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle Telegram user reactions as lightweight synthetic messages."""
reaction_update = getattr(update, "message_reaction", None)
if reaction_update is None:
return
user = getattr(reaction_update, "user", None)
user_id = str(getattr(user, "id", "") or "")
bot_id = str(getattr(getattr(self, "_bot", None), "id", "") or "")
if user_id and bot_id and user_id == bot_id:
return
chat = getattr(reaction_update, "chat", None)
chat_id = str(getattr(chat, "id", "") or "")
if not chat_id or not user_id:
return
chat_type_raw = getattr(chat, "type", None)
chat_type_value = str(getattr(chat_type_raw, "value", chat_type_raw) or "dm").lower()
chat_type = "dm" if chat_type_value == "private" else (
"group" if chat_type_value in {"group", "supergroup"} else chat_type_value
)
thread_id = getattr(reaction_update, "message_thread_id", None)
user_name = getattr(user, "first_name", None) or getattr(user, "username", None)
if not self._is_callback_user_authorized(
user_id,
chat_id=chat_id,
chat_type=chat_type,
thread_id=str(thread_id) if thread_id is not None else None,
user_name=user_name,
):
logger.info("[%s] Ignoring Telegram reaction from unauthorized user %s", self.name, user_id)
return
def _reaction_label(obj: Any) -> str:
emoji = getattr(obj, "emoji", None)
if emoji:
return str(emoji)
custom = getattr(obj, "custom_emoji_id", None)
if custom:
return f"custom_emoji:{custom}"
return str(getattr(obj, "type", obj))
new_reactions = list(getattr(reaction_update, "new_reaction", None) or [])
old_reactions = list(getattr(reaction_update, "old_reaction", None) or [])
new_labels = [_reaction_label(r) for r in new_reactions]
old_labels = [_reaction_label(r) for r in old_reactions]
target_message_id = getattr(reaction_update, "message_id", None)
if not new_labels:
logger.info(
"[%s] Telegram reaction removed: chat=%s user=%s message=%s previous=%s",
self.name,
chat_id,
user_id,
target_message_id,
" ".join(old_labels) if old_labels else "<unknown>",
)
return
positive_labels = {"👍", "🔥", "👌", "❤️", "", "😍", "🥰", "👏", "🎉", "🤩"}
negative_labels = {"👎", "💩", "🤮", "😡", "😠", "🙄", "😴", "🤢"}
label_set = set(new_labels)
if label_set & negative_labels:
feedback = "negative feedback / мимо"
elif label_set & positive_labels:
feedback = "positive feedback / годно"
else:
feedback = "emoji feedback"
text = (
f"[Telegram reaction feedback: {' '.join(new_labels)} on message {target_message_id}. "
f"Interpretation: {feedback}. "
f"If the target was a cron/news/digest item, use this as taste feedback.]"
)
from gateway.session import SessionSource
source = SessionSource(
platform=Platform.TELEGRAM,
chat_id=chat_id,
chat_name=getattr(chat, "title", None),
chat_type=chat_type,
user_id=user_id,
user_name=str(user_name) if user_name else None,
thread_id=str(thread_id) if thread_id is not None else None,
message_id=str(target_message_id) if target_message_id is not None else None,
)
event = MessageEvent(
text=text,
message_type=MessageType.TEXT,
source=source,
raw_message=reaction_update,
message_id=str(target_message_id) if target_message_id is not None else None,
platform_update_id=update.update_id,
)
logger.info(
"[%s] Telegram reaction received: chat=%s user=%s message=%s emoji=%s",
self.name,
chat_id,
user_id,
target_message_id,
" ".join(new_labels) if new_labels else "<removed>",
)
await self.handle_message(event)
async def _handle_text_message(self, update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
"""Handle incoming text messages.
@ -5257,8 +5416,9 @@ class TelegramAdapter(BasePlatformAdapter):
event.media_urls = [cached_path]
event.media_types = ["audio/ogg"]
logger.info("[Telegram] Cached user voice at %s", cached_path)
await self._send_voice_pre_stt_ack(event)
except Exception as e:
logger.warning("[Telegram] Failed to cache voice: %s", e, exc_info=True)
logger.warning("[Telegram] Failed to cache voice before STT: %s", e, exc_info=True)
elif msg.audio:
try:
file_obj = await msg.audio.get_file()

View File

@ -1266,6 +1266,23 @@ def _is_control_interrupt_message(message: Optional[str]) -> bool:
return normalized in _CONTROL_INTERRUPT_MESSAGES
def _should_process_pending_during_drain(result: Optional[dict]) -> bool:
"""Return True when a queued follow-up is safe to run before shutdown.
If a user sends a follow-up while an active gateway turn is running, the
adapter stores it in the pending slot. A service restart can begin while
that active turn is still running. When the active turn then finishes
cleanly during the drain window, dropping the pending slot loses the user's
message. Process it before exit unless the active turn itself ended via an
interruption/failure, where continuing work during shutdown would be unsafe.
"""
if not result:
return False
if result.get("interrupted") or result.get("failed"):
return False
return True
def _skill_slug_from_frontmatter(skill_md: Path) -> tuple[str | None, str | None]:
"""Derive the /command slug and declared frontmatter name from a SKILL.md.
@ -7535,6 +7552,9 @@ class GatewayRunner:
if canonical == "insights":
return await self._handle_insights_command(event)
if canonical == "stats":
return await self._handle_stats_command(event)
if canonical == "reload-mcp":
return await self._handle_reload_mcp_command(event)
@ -8216,6 +8236,28 @@ class GatewayRunner:
context_note = "[System note: The user's previous session expired due to inactivity. This is a fresh conversation with no prior context.]"
context_prompt = context_note + "\n\n" + context_prompt
# If the previous agent turn crashed, prepend a crash-context notice
# so the agent knows what went wrong and can self-diagnose. The
# error fields are cleared after this read so the notice appears
# exactly once (on the first message of the fresh session).
_last_err_type = getattr(session_entry, 'last_error_type', None)
if _last_err_type:
_last_err_msg = getattr(session_entry, 'last_error_message', '') or ''
_crash_note = (
f"[System note: The previous session crashed with "
f"{_last_err_type}: {_last_err_msg[:300]}. "
f"Diagnose and fix the root cause if possible — "
f"do NOT just retry the same thing.]"
)
context_prompt = _crash_note + "\n\n" + context_prompt
# Clear so the notice appears only once
try:
session_entry.last_error_type = None
session_entry.last_error_message = None
session_entry.last_error_time = None
except Exception:
pass
# Send a user-facing notification explaining the reset, unless:
# - notifications are disabled in config
# - the platform is excluded (e.g. api_server, webhook)
@ -9098,6 +9140,14 @@ class GatewayRunner:
logger.exception("Agent error in session %s", session_key)
error_type = type(e).__name__
error_detail = str(e)[:300] if str(e) else "no details available"
# Record the crash on the session entry so the next session's
# agent can self-diagnose the failure instead of starting blind.
try:
session_entry.last_error_type = error_type
session_entry.last_error_message = error_detail
session_entry.last_error_time = time.time()
except Exception:
pass # defensive — never let error-recording cause a secondary crash
status_hint = ""
status_code = getattr(e, "status_code", None)
_hist_len = len(history) if 'history' in locals() else 0
@ -13234,6 +13284,196 @@ class GatewayRunner:
logger.error("Insights command error: %s", e, exc_info=True)
return t("gateway.insights.error", error=e)
async def _handle_stats_command(self, event: MessageEvent) -> str:
"""Handle /stats command — comprehensive system and model report.
Pure local compute no LLM call, no prompt-cache impact. Surfaces
model config, context engine, skills usage, curator state, cron jobs,
and recent session activity in a compact dashboard.
"""
import json, time as _time, os as _os
from datetime import datetime, timezone, timedelta
source = event.source
session_entry = self.session_store.get_or_create_session(source)
lines: list[str] = []
# ── Header ──────────────────────────────────────────────────────
_now = datetime.now(timezone.utc)
lines.append("📊 **Hermes Stats**")
lines.append("")
# ── Model ───────────────────────────────────────────────────────
cfg = self._read_user_config()
model_cfg = cfg.get("model", {}) if isinstance(cfg, dict) else {}
main_model = model_cfg.get("default", "") if isinstance(model_cfg, dict) else str(model_cfg)
main_provider = model_cfg.get("provider", "") if isinstance(model_cfg, dict) else ""
fallback_chain = cfg.get("fallback_providers", []) if isinstance(cfg, dict) else []
fb_models = [f"{fb.get('model','?')} ({fb.get('provider','?')})" for fb in fallback_chain[:3]]
# Context length — from cached agent if available, else config
ctx_len = "?"
ctx_engine_name = "compressor"
try:
_cache_lock = getattr(self, "_agent_cache_lock", None)
if _cache_lock:
with _cache_lock:
_cached = self._agent_cache.get(session_entry.session_key)
if _cached:
_agent, _, _ts = _cached
_cc = getattr(_agent, "context_compressor", None)
if _cc:
ctx_len = f"{getattr(_cc, 'context_length', 0):,}"
_ce = getattr(_cc, "name", None) or type(_cc).__name__
ctx_engine_name = _ce.lower() if _ce != "ContextCompressor" else "compressor"
except Exception:
pass
lines.append("**🤖 Model**")
lines.append(f" Main: `{main_model}` ({main_provider})")
if fb_models:
lines.append(f" Fallback: {', '.join(fb_models)}")
lines.append(f" Context: {ctx_len} tokens | Engine: {ctx_engine_name}")
lines.append("")
# ── Context Engine (Semantic RLE) ───────────────────────────────
_rle_dir = _os.path.expanduser("~/.hermes/hermes-agent/plugins/context_engine/semantic_rle")
_rle_plugin_yaml = _os.path.join(_rle_dir, "plugin.yaml") if _os.path.isdir(_rle_dir) else ""
rle_version = "?"
if _rle_plugin_yaml and _os.path.isfile(_rle_plugin_yaml):
try:
import yaml as _yaml
with open(_rle_plugin_yaml) as f:
_rle_cfg = _yaml.safe_load(f) or {}
rle_version = _rle_cfg.get("version", "?")
except Exception:
pass
_rle_stats = {}
try:
_rle_dir_cache = _os.path.join(_rle_dir, "__pycache__")
if _os.path.isdir(_rle_dir_cache):
_rle_stats["cached"] = True
except Exception:
pass
lines.append("**🧠 Context Engine**")
if _os.path.isdir(_rle_dir):
lines.append(f" Semantic RLE: v{rle_version}")
else:
lines.append(" Semantic RLE: not installed")
lines.append("")
# ── Skills Usage ────────────────────────────────────────────────
_skills_usage_path = _os.path.expanduser("~/.hermes/skills/.usage.json")
skills_active = 0
skills_agent = 0
skills_top: list[tuple[str, int]] = []
if _os.path.isfile(_skills_usage_path):
try:
with open(_skills_usage_path) as f:
_su = json.load(f)
for _sn, _sd in _su.items():
if _sd.get("state") == "active":
skills_active += 1
if _sd.get("created_by") == "agent":
skills_agent += 1
_by_use = sorted(_su.items(), key=lambda x: x[1].get("use_count", 0), reverse=True)
skills_top = [(_sn, _sd.get("use_count", 0)) for _sn, _sd in _by_use[:5]]
except Exception:
pass
lines.append("**📚 Skills**")
lines.append(f" Active: {skills_active} | Agent-created: {skills_agent}")
if skills_top:
_top_str = ", ".join(f"`{n}` ({c})" for n, c in skills_top)
lines.append(f" Top: {_top_str}")
lines.append("")
# ── Curator ─────────────────────────────────────────────────────
_curator_state_path = _os.path.expanduser("~/.hermes/skills/.curator_state.json")
curator_runs = 0
curator_last = "never"
curator_archived = 0
if _os.path.isfile(_curator_state_path):
try:
with open(_curator_state_path) as f:
_cs = json.load(f)
curator_runs = _cs.get("runs", 0)
_cls = _cs.get("last_run")
if _cls:
_dt = datetime.fromisoformat(_cls.replace("Z", "+00:00"))
_delta = _now - _dt
if _delta.days > 0:
curator_last = f"{_delta.days}d ago"
elif _delta.seconds > 3600:
curator_last = f"{_delta.seconds // 3600}h ago"
else:
curator_last = f"{_delta.seconds // 60}m ago"
curator_archived = _cs.get("archived_count", 0)
except Exception:
pass
lines.append("**🌱 Curator**")
lines.append(f" Runs: {curator_runs} | Last: {curator_last} | Archived: {curator_archived}")
lines.append("")
# ── Cron ────────────────────────────────────────────────────────
try:
from cron.jobs import load_jobs
_jobs = load_jobs()
_active_jobs = [j for j in _jobs if j.get("paused") != True]
_job_names = [j.get("name", j.get("job_id", "?")[:8]) for j in _active_jobs[:5]]
lines.append("**⏰ Cron**")
if _job_names:
lines.append(f" Active: {len(_active_jobs)}{', '.join(_job_names)}")
else:
lines.append(" Active: 0")
lines.append("")
except Exception:
pass
# ── Recent Activity ─────────────────────────────────────────────
try:
from hermes_state import SessionDB
_db = SessionDB()
_recent = _db.list_sessions(limit=5, offset=0)
_db.close()
if _recent:
_recent_24h = 0
_recent_msgs = 0
_cutoff = _now - timedelta(hours=24)
for _rs in _recent:
_t = _rs.get("updated_at") or _rs.get("created_at")
if _t:
_dt = datetime.fromisoformat(str(_t).replace("Z", "+00:00")) if isinstance(_t, str) else _t
if _dt > _cutoff:
_recent_24h += 1
_recent_msgs += _rs.get("message_count", 0)
lines.append("**⏱️ Activity (24h)**")
lines.append(f" Sessions: {_recent_24h} | Messages: {_recent_msgs}")
_latest = _recent[0] if _recent else {}
_latest_title = _latest.get("title") or _latest.get("session_id", "?")
_latest_preview = (_latest.get("preview") or "")[:80]
if _latest_preview:
lines.append(f" Latest: _{_latest_preview}_")
lines.append("")
except Exception:
pass
# ── System ──────────────────────────────────────────────────────
try:
import psutil
_proc = psutil.Process()
_mem_mb = _proc.memory_info().rss / 1024 / 1024
lines.append("**🔧 System**")
lines.append(f" Memory: {_mem_mb:.0f} MB")
except ImportError:
pass
except Exception:
pass
return "\n".join(lines)
async def _handle_reload_mcp_command(self, event: MessageEvent) -> Optional[str]:
"""Handle /reload-mcp — reconnect MCP servers and rebuild the cached agent.
@ -14585,6 +14825,79 @@ class GatewayRunner:
return prefix
return user_text
@staticmethod
def _transcription_looks_uncertain(transcript: str) -> bool:
"""Heuristic gate for optional OpenAI Whisper fallback.
Keep this deliberately conservative: the fallback is paid and slower,
so we only retry when the primary transcript explicitly looks broken or
uncertain, not for every voice note.
"""
text = (transcript or "").strip()
if not text:
return True
lowered = text.lower()
uncertainty_markers = (
"inaudible",
"unintelligible",
"can't understand",
"cannot understand",
"could not understand",
"неразбор",
"не слышно",
"плохо слышно",
"не понял",
"непонятно",
)
if any(marker in lowered for marker in uncertainty_markers):
return True
if "<EFBFBD>" in text:
return True
compact = lowered.strip(" .,…!?-—\n\t")
return compact in {"", "...", "[music]", "(music)", "тишина"}
def _should_try_openai_whisper_fallback(self, result: Dict[str, Any]) -> bool:
"""Return True only for the configured fast OpenAI model + uncertain text."""
if not result.get("success"):
return False
try:
from tools.transcription_tools import _load_stt_config
stt_config = _load_stt_config()
except Exception:
logger.exception("Failed to load STT config while evaluating fallback policy")
return False
if str(stt_config.get("provider", "")).lower() != "openai":
return False
openai_cfg = stt_config.get("openai", {}) if isinstance(stt_config.get("openai"), dict) else {}
model = str(openai_cfg.get("model", "")).strip()
if model != "gpt-4o-mini-transcribe":
return False
return self._transcription_looks_uncertain(str(result.get("transcript") or ""))
async def _transcribe_audio_with_fallback(self, transcribe_audio, path: str) -> Dict[str, Any]:
"""Transcribe once, then optionally retry uncertain mini-transcribe output with whisper-1."""
result = await asyncio.to_thread(transcribe_audio, path)
if not self._should_try_openai_whisper_fallback(result):
return result
logger.info("Primary OpenAI STT transcript looks uncertain; retrying %s with whisper-1", os.path.basename(path))
try:
fallback = await asyncio.to_thread(transcribe_audio, path, model="whisper-1")
except Exception:
logger.exception("OpenAI Whisper-1 STT fallback failed for %s; keeping primary transcript", path)
result["stt_uncertain"] = True
return result
if fallback.get("success") and str(fallback.get("transcript") or "").strip():
fallback["fallback_used"] = "whisper-1"
return fallback
logger.warning(
"OpenAI Whisper-1 STT fallback did not produce a usable transcript for %s: %s",
path,
fallback.get("error") or "empty transcript",
)
result["stt_uncertain"] = True
return result
async def _enrich_message_with_transcription(
self,
user_text: str,
@ -14628,13 +14941,20 @@ class GatewayRunner:
for path in audio_paths:
try:
logger.debug("Transcribing user voice: %s", path)
result = await asyncio.to_thread(transcribe_audio, path)
result = await self._transcribe_audio_with_fallback(transcribe_audio, path)
if result["success"]:
transcript = result["transcript"]
enriched_parts.append(
note = (
f'[The user sent a voice message~ '
f'Here\'s what they said: "{transcript}"]'
f'Here\'s what they said: "{transcript}"'
)
if result.get("stt_uncertain") or self._transcription_looks_uncertain(str(transcript)):
note += (
" The transcript still looks uncertain; don't guess. "
"Ask the user a short clarification in Russian."
)
note += "]"
enriched_parts.append(note)
else:
error = result.get("error", "unknown error")
if (
@ -14661,7 +14981,7 @@ class GatewayRunner:
f"transcribing it~ ({error})]"
)
except Exception as e:
logger.error("Transcription error: %s", e)
logger.error("Transcription error for %s: %s", path, e, exc_info=True)
enriched_parts.append(
"[The user sent a voice message but something went wrong "
"when I tried to listen to it~ Let them know!]"
@ -17667,13 +17987,20 @@ class GatewayRunner:
pass
if self._draining and (pending_event or pending):
logger.info(
"Discarding pending follow-up for session %s during gateway %s",
session_key or "?",
self._status_action_label(),
)
pending_event = None
pending = None
if _should_process_pending_during_drain(result):
logger.info(
"Processing pending follow-up for session %s before gateway %s completes",
session_key or "?",
self._status_action_label(),
)
else:
logger.info(
"Discarding pending follow-up for session %s during gateway %s",
session_key or "?",
self._status_action_label(),
)
pending_event = None
pending = None
if pending_event or pending:
logger.debug("Processing pending message: '%s...'", pending[:40])

View File

@ -453,6 +453,13 @@ class SessionEntry:
# Last API-reported prompt tokens (for accurate compression pre-check)
last_prompt_tokens: int = 0
# Set when the previous agent turn crashed with an unhandled exception.
# Consumed once by the message handler to inject a crash-context notice
# into the next session's system prompt so the agent can self-diagnose.
last_error_type: Optional[str] = None
last_error_message: Optional[str] = None
last_error_time: Optional[float] = None
# Set when a session was created because the previous one expired;
# consumed once by the message handler to inject a notice into context
was_auto_reset: bool = False

View File

@ -98,6 +98,7 @@ COMMAND_REGISTRY: list[CommandDef] = [
aliases=("bg", "btw"), args_hint="<prompt>"),
CommandDef("agents", "Show active agents and running tasks", "Session",
aliases=("tasks",)),
CommandDef("stats", "Show comprehensive system stats — model, skills, curator, cron, activity", "Info"),
CommandDef("queue", "Queue a prompt for the next turn (doesn't interrupt)", "Session",
aliases=("q",), args_hint="<prompt>"),
CommandDef("steer", "Inject a message after the next tool call without interrupting", "Session",

View File

@ -0,0 +1,300 @@
"""Experimental Semantic RLE context engine.
MVP goals:
- keep the hot tail verbatim;
- collapse older chat into a deterministic factual ledger;
- mark superseded facts instead of silently forgetting them;
- redact likely credentials and IP-like sensitive strings before ledgering.
This plugin is intentionally deterministic and does not call cloud LLMs.
It is not enabled by default; select it explicitly with ``context.engine``.
"""
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional, Tuple
from agent.context_engine import ContextEngine
Message = Dict[str, Any]
_TOKEN_PATTERNS: tuple[re.Pattern[str], ...] = (
re.compile(r"\b(?:sk|xox[baprs]?|gh[pousr]|hf|AIza|ya29|pat|tok)[-_][A-Za-z0-9_./+=-]{12,}\b"),
re.compile(r"\b[A-Za-z0-9_./+=-]{32,}\b"),
)
_KEY_VALUE_SECRET_RE = re.compile(
r"(?i)\b(api[_-]?key|token|secret|password|passwd|authorization|bearer)\b\s*[:=]\s*([^\s,;]+)"
)
_IPV4_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
_SERVER_RE = re.compile(r"(?i)\bserver\s*(?:is|=|:|->|to)?\s*([A-Za-z0-9][A-Za-z0-9._-]{1,})")
_DECISION_RE = re.compile(r"(?i)\b(decided|decision|решили|решение|choose|chosen|use|используем)\b")
_OBLIGATION_RE = re.compile(r"(?i)\b(todo|надо|нужно|must|should|обяз|follow up|сделай|сделать)\b")
_QUESTION_RE = re.compile(r"(?i)(\?\s*$|\b(unresolved|open question|вопрос|непонятно|уточнить)\b)")
_FACT_RE = re.compile(r"(?i)\b(is|are|=|:|это|будет|uses|runs|host|server|model|provider)\b")
@dataclass
class LedgerFact:
key: str
value: str
role: str
turn_index: int
active: bool = True
superseded_by: Optional[str] = None
def line(self) -> str:
state = "active" if self.active else f"superseded by {self.superseded_by or 'newer fact'}"
return f"- [{state}] {self.key}: {self.value} (turn {self.turn_index}, {self.role})"
class SemanticRLEEngine(ContextEngine):
"""Deterministic context engine for a hot-tail + semantic-ledger experiment."""
threshold_percent = 0.75
protect_first_n = 1
protect_last_n = 8
def __init__(self, context_length: int = 200_000, hot_tail_messages: int = 8) -> None:
self.context_length = context_length
self.threshold_tokens = int(context_length * self.threshold_percent)
self.hot_tail_messages = hot_tail_messages
self.last_prompt_tokens = 0
self.last_completion_tokens = 0
self.last_total_tokens = 0
self.compression_count = 0
self._last_ledger: dict[str, Any] = {}
@property
def name(self) -> str:
return "semantic_rle"
def is_available(self) -> bool:
return True
def update_from_response(self, usage: Dict[str, Any]) -> None:
self.last_prompt_tokens = int(usage.get("prompt_tokens") or usage.get("input_tokens") or 0)
self.last_completion_tokens = int(usage.get("completion_tokens") or usage.get("output_tokens") or 0)
self.last_total_tokens = int(usage.get("total_tokens") or (self.last_prompt_tokens + self.last_completion_tokens))
def should_compress(self, prompt_tokens: Optional[int] = None) -> bool:
tokens = self.last_prompt_tokens if prompt_tokens is None else int(prompt_tokens)
return bool(self.threshold_tokens and tokens >= self.threshold_tokens)
def should_compress_preflight(self, messages: List[Message]) -> bool:
return self.has_content_to_compress(messages)
def has_content_to_compress(self, messages: List[Message]) -> bool:
return len(self._non_system(messages)) > self.hot_tail_messages
def update_model(
self,
model: str,
context_length: int,
base_url: str = "",
api_key: str = "",
provider: str = "",
api_mode: str = "",
) -> None:
self.context_length = int(context_length or self.context_length or 0)
self.threshold_tokens = int(self.context_length * self.threshold_percent) if self.context_length else 0
def compress(
self,
messages: List[Message],
current_tokens: Optional[int] = None,
focus_topic: Optional[str] = None,
) -> List[Message]:
"""Return original head + semantic ledger for cold turns + verbatim hot tail.
The deterministic path is fail-closed: on unexpected errors, return a
shallow copy of the original message list rather than dropping context.
"""
self.compression_count += 1
try:
if not messages:
return []
copied = [dict(m) for m in messages]
system_head = [m for m in copied if m.get("role") == "system"]
non_system = [m for m in copied if m.get("role") != "system"]
if len(non_system) <= self.hot_tail_messages:
return copied
hot_tail = non_system[-self.hot_tail_messages :]
cold = non_system[: -self.hot_tail_messages]
ledger = self._build_ledger(cold, focus_topic=focus_topic)
self._last_ledger = ledger
summary_message: Message = {
"role": "system",
"content": self._render_summary(ledger, focus_topic=focus_topic),
}
return [*system_head, summary_message, *hot_tail]
except Exception:
return [dict(m) for m in messages]
def get_status(self) -> Dict[str, Any]:
status = super().get_status()
status.update(
{
"engine": self.name,
"hot_tail_messages": self.hot_tail_messages,
"ledger_counts": {
key: len(value) for key, value in self._last_ledger.items() if isinstance(value, list)
},
}
)
return status
@staticmethod
def _non_system(messages: Iterable[Message]) -> list[Message]:
return [m for m in messages if m.get("role") != "system"]
def _build_ledger(self, messages: List[Message], focus_topic: Optional[str] = None) -> dict[str, Any]:
facts_by_key: dict[str, LedgerFact] = {}
superseded: list[LedgerFact] = []
decisions: list[str] = []
obligations: list[str] = []
questions: list[str] = []
credential_refs: list[str] = []
retrieval_notes: list[str] = []
for index, msg in enumerate(messages, start=1):
role = str(msg.get("role", "unknown"))
text = self._string_content(msg.get("content", ""))
if not text.strip():
continue
sanitized, refs = self._sanitize(text)
credential_refs.extend(refs)
snippet = self._snippet(sanitized)
fact = self._extract_fact(sanitized, role=role, turn_index=index)
if fact:
old = facts_by_key.get(fact.key)
if old and old.value != fact.value:
old.active = False
old.superseded_by = fact.value
superseded.append(old)
facts_by_key[fact.key] = fact
if _DECISION_RE.search(sanitized):
decisions.append(f"- {snippet} (turn {index}, {role})")
if _OBLIGATION_RE.search(sanitized):
obligations.append(f"- {snippet} (turn {index}, {role})")
if _QUESTION_RE.search(sanitized):
questions.append(f"- {snippet} (turn {index}, {role})")
if focus_topic and focus_topic.lower() in sanitized.lower():
retrieval_notes.append(f"- Focus match `{focus_topic}` at turn {index}: {snippet}")
active_facts = [fact for fact in facts_by_key.values() if fact.active]
return {
"active_facts": active_facts,
"decisions": self._dedupe(decisions),
"obligations": self._dedupe(obligations),
"superseded_facts": superseded,
"unresolved_questions": self._dedupe(questions),
"credential_refs": self._dedupe(credential_refs),
"retrieval_notes": self._dedupe(retrieval_notes),
"cold_turns_compacted": len(messages),
}
def _extract_fact(self, text: str, role: str, turn_index: int) -> Optional[LedgerFact]:
server = _SERVER_RE.search(text)
if server:
return LedgerFact("server", server.group(1), role, turn_index)
if not _FACT_RE.search(text):
return None
cleaned = self._snippet(text, limit=180)
key = self._fact_key(cleaned)
return LedgerFact(key, cleaned, role, turn_index)
@staticmethod
def _fact_key(text: str) -> str:
lower = text.lower()
before_sep = re.split(r"\s*(?:is|are|=|:|это|будет|uses|runs)\s*", lower, maxsplit=1)[0]
words = re.findall(r"[a-zа-я0-9_-]+", before_sep)[:6]
return " ".join(words) or "fact"
@staticmethod
def _string_content(content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
parts: list[str] = []
for item in content:
if isinstance(item, dict):
if isinstance(item.get("text"), str):
parts.append(item["text"])
elif isinstance(item.get("content"), str):
parts.append(item["content"])
elif isinstance(item, str):
parts.append(item)
return "\n".join(parts)
return str(content)
def _sanitize(self, text: str) -> Tuple[str, list[str]]:
refs: list[str] = []
def ref_for(raw: str, kind: str = "credential") -> str:
digest = hashlib.sha256(raw.encode("utf-8", "ignore")).hexdigest()[:10]
ref = f"credential_ref:{kind}:{digest}"
refs.append(ref)
return ref
def replace_key_value(match: re.Match[str]) -> str:
key = match.group(1)
raw = match.group(2)
return f"{key}=<{ref_for(raw)}>"
sanitized = _KEY_VALUE_SECRET_RE.sub(replace_key_value, text)
for pattern in _TOKEN_PATTERNS:
sanitized = pattern.sub(lambda m: f"<{ref_for(m.group(0))}>", sanitized)
sanitized = _IPV4_RE.sub("[REDACTED_IP]", sanitized)
return sanitized, refs
@staticmethod
def _snippet(text: str, limit: int = 220) -> str:
compact = " ".join(text.split())
if len(compact) <= limit:
return compact
return compact[: limit - 1].rstrip() + ""
@staticmethod
def _dedupe(items: Iterable[str]) -> list[str]:
seen: set[str] = set()
result: list[str] = []
for item in items:
if item not in seen:
seen.add(item)
result.append(item)
return result
@staticmethod
def _render_summary(ledger: dict[str, Any], focus_topic: Optional[str] = None) -> str:
sections: list[str] = [
"Semantic RLE context ledger (deterministic, older turns compacted).",
f"Cold turns compacted: {ledger.get('cold_turns_compacted', 0)}.",
"Hot tail messages after this block are preserved verbatim.",
]
if focus_topic:
sections.append(f"Compression focus: {focus_topic}")
def add_section(title: str, lines: list[str]) -> None:
sections.append(f"\n## {title}")
sections.extend(lines or ["- None detected."])
add_section("Active facts", [f.line() for f in ledger.get("active_facts", [])])
add_section("Decisions", ledger.get("decisions", []))
add_section("Obligations", ledger.get("obligations", []))
add_section("Superseded facts", [f.line() for f in ledger.get("superseded_facts", [])])
add_section("Unresolved questions", ledger.get("unresolved_questions", []))
add_section("Credential refs", [f"- {ref}" for ref in ledger.get("credential_refs", [])])
add_section("Retrieval notes", ledger.get("retrieval_notes", []))
return "\n".join(sections)
def register(ctx: Any) -> None:
ctx.register_context_engine(SemanticRLEEngine())

View File

@ -0,0 +1,4 @@
name: semantic_rle
version: 0.1.0
experimental: true
description: "Experimental deterministic Semantic RLE context engine: hot tail + factual ledger + supersession notes for long chats."

View File

@ -1526,7 +1526,15 @@ class AIAgent:
# Fallback: truncate the raw string but give more room than 200 chars
status_code = getattr(error, "status_code", None)
prefix = f"HTTP {status_code}: " if status_code else ""
prefix_parts = []
if status_code:
prefix_parts.append(f"HTTP {status_code}")
# For local bugs (TypeError, ValueError without status_code), include
# the exception type so logs clearly distinguish "provider rejected us"
# from "we have a code bug".
if not status_code and isinstance(error, (TypeError, ValueError)):
prefix_parts.append(type(error).__name__)
prefix = f"{': '.join(prefix_parts)}: " if prefix_parts else ""
return f"{prefix}{raw[:500]}"
def _mask_api_key_for_logs(self, key: Any) -> Optional[str]:

View File

@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""Deterministic smoke-eval for the experimental semantic_rle context engine.
This is not an LLM quality benchmark. It checks the minimum invariant we need
before trying live Telegram replay: after cold-history compaction, the
model-visible context still contains current facts/obligations, marks stale
facts as superseded, preserves the hot tail byte-for-byte, and does not leak
raw fake secrets from cold turns.
"""
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
from plugins.context_engine.semantic_rle import SemanticRLEEngine # noqa: E402
Message = dict[str, Any]
@dataclass(frozen=True)
class Scenario:
name: str
messages: list[Message]
hot_tail_messages: int
checks: dict[str, Callable[[list[Message]], bool]]
def _render(messages: list[Message]) -> str:
return "\n".join(str(message.get("content", "")) for message in messages)
def _hot_tail_only(messages: list[Message], hot_tail_messages: int) -> list[Message]:
system_head = [dict(m) for m in messages if m.get("role") == "system"]
non_system = [dict(m) for m in messages if m.get("role") != "system"]
return [*system_head, *non_system[-hot_tail_messages:]]
def _semantic_rle(messages: list[Message], hot_tail_messages: int) -> list[Message]:
return SemanticRLEEngine(hot_tail_messages=hot_tail_messages).compress(messages)
def _scenarios() -> list[Scenario]:
fake_token = "sk-test1234567890abcdef1234567890"
fake_pat = "ghp_abcdef1234567890abcdef1234567890"
ip = "203.0.113.42"
supersession_messages = [
{"role": "system", "content": "You are Hermes."},
{"role": "user", "content": "server alpha.example is the current deployment target"},
{"role": "assistant", "content": "Noted: server alpha.example."},
{"role": "user", "content": "decision: use postgres for the ledger"},
{"role": "assistant", "content": "I will use postgres."},
{"role": "user", "content": "todo: compare misses against baseline"},
{"role": "assistant", "content": "Added comparison todo."},
{"role": "user", "content": "unresolved question: how often to compact?"},
{"role": "assistant", "content": "We can measure compaction cadence."},
{"role": "user", "content": "server beta.example is the current deployment target now"},
{"role": "assistant", "content": "Switched to beta.example."},
{"role": "user", "content": "hot tail user message"},
{"role": "assistant", "content": "hot tail assistant message"},
]
secret_messages = [
{"role": "system", "content": "You are Hermes."},
{"role": "user", "content": f"api_key={fake_token} server {ip}"},
{"role": "assistant", "content": f"token: {fake_pat}"},
{"role": "user", "content": "todo: rotate the credential reference"},
{"role": "assistant", "content": "Will track credential refs only."},
{"role": "user", "content": "hot tail one"},
{"role": "assistant", "content": "hot tail two"},
]
return [
Scenario(
name="supersession_and_obligation",
messages=supersession_messages,
hot_tail_messages=2,
checks={
"current_fact_retained": lambda m: "server: beta.example" in _render(m),
"old_fact_marked_superseded": lambda m: "[superseded by beta.example] server: alpha.example" in _render(m),
"decision_retained": lambda m: "use postgres" in _render(m),
"obligation_retained": lambda m: "compare misses against baseline" in _render(m),
"question_retained": lambda m: "how often to compact" in _render(m),
"hot_tail_preserved": lambda m: m[-2:] == supersession_messages[-2:],
},
),
Scenario(
name="cold_secret_redaction",
messages=secret_messages,
hot_tail_messages=2,
checks={
"raw_fake_token_absent": lambda m: fake_token not in _render(m) and fake_pat not in _render(m),
"raw_ip_absent": lambda m: ip not in _render(m),
"credential_ref_present": lambda m: "credential_ref:credential:" in _render(m),
"ip_redacted_marker_present": lambda m: "[REDACTED_IP]" in _render(m),
"obligation_retained": lambda m: "rotate the credential reference" in _render(m),
"hot_tail_preserved": lambda m: m[-2:] == secret_messages[-2:],
},
),
]
def run_eval() -> dict[str, Any]:
engines: dict[str, Callable[[list[Message], int], list[Message]]] = {
"hot_tail_only_baseline": _hot_tail_only,
"semantic_rle": _semantic_rle,
}
results: list[dict[str, Any]] = []
for scenario in _scenarios():
for engine_name, engine_fn in engines.items():
compacted = engine_fn(scenario.messages, scenario.hot_tail_messages)
checks = {name: bool(check(compacted)) for name, check in scenario.checks.items()}
passed = sum(1 for ok in checks.values() if ok)
results.append(
{
"scenario": scenario.name,
"engine": engine_name,
"passed": passed,
"total": len(checks),
"misses": [name for name, ok in checks.items() if not ok],
"message_count_before": len(scenario.messages),
"message_count_after": len(compacted),
"char_count_after": len(_render(compacted)),
}
)
by_engine: dict[str, dict[str, int]] = {}
for row in results:
aggregate = by_engine.setdefault(row["engine"], {"passed": 0, "total": 0})
aggregate["passed"] += int(row["passed"])
aggregate["total"] += int(row["total"])
return {"results": results, "summary": by_engine}
def main() -> int:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--json", action="store_true", help="print machine-readable JSON")
args = parser.parse_args()
report = run_eval()
if args.json:
print(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True))
else:
print("Semantic RLE deterministic smoke-eval")
for engine, row in report["summary"].items():
print(f"- {engine}: {row['passed']}/{row['total']} invariant checks passed")
print("Details:")
for row in report["results"]:
misses = ", ".join(row["misses"]) or "none"
print(
f"- {row['scenario']} / {row['engine']}: "
f"{row['passed']}/{row['total']}, misses={misses}, "
f"messages {row['message_count_before']}{row['message_count_after']}, "
f"chars_after={row['char_count_after']}"
)
return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@ -0,0 +1,32 @@
"""Regression tests for queued follow-ups during gateway shutdown.
A user message can arrive while the gateway is still draining an active agent
turn (for example during a service restart after a code update). The active
turn may finish cleanly while the gateway is in draining mode. In that case
we must process the already-queued follow-up before exiting instead of dropping
it silently.
"""
from gateway.run import _should_process_pending_during_drain
def test_clean_result_processes_pending_followup_during_drain():
result = {"final_response": "done", "messages": [], "api_calls": 1}
assert _should_process_pending_during_drain(result) is True
def test_interrupted_result_drops_pending_followup_during_drain():
result = {"interrupted": True, "final_response": ""}
assert _should_process_pending_during_drain(result) is False
def test_failed_result_drops_pending_followup_during_drain():
result = {"failed": True, "final_response": "boom"}
assert _should_process_pending_during_drain(result) is False
def test_missing_result_drops_pending_followup_during_drain():
assert _should_process_pending_during_drain(None) is False

View File

@ -79,6 +79,77 @@ async def test_voice_message_still_transcribed():
assert "voice message" in result.lower()
@pytest.mark.asyncio
async def test_voice_stt_fallback_only_for_uncertain_openai_mini(monkeypatch):
"""OpenAI mini-transcribe should retry whisper-1 only when the transcript looks uncertain."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _voice_event("/tmp/voice.ogg")
monkeypatch.setattr(
"tools.transcription_tools._load_stt_config",
lambda: {"provider": "openai", "openai": {"model": "gpt-4o-mini-transcribe"}},
)
calls = []
def fake_transcribe(path, model=None):
calls.append((path, model))
if model == "whisper-1":
return {"success": True, "transcript": "нормальный текст", "provider": "openai"}
return {"success": True, "transcript": "[inaudible]", "provider": "openai"}
with patch("tools.transcription_tools.transcribe_audio", side_effect=fake_transcribe):
result = await runner._prepare_inbound_message_text(event=event, source=source, history=[])
assert calls == [("/tmp/voice.ogg", None), ("/tmp/voice.ogg", "whisper-1")]
assert "нормальный текст" in result
@pytest.mark.asyncio
async def test_voice_stt_no_fallback_for_confident_openai_mini(monkeypatch):
"""Confident primary transcripts should not pay for a Whisper fallback."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _voice_event("/tmp/voice.ogg")
monkeypatch.setattr(
"tools.transcription_tools._load_stt_config",
lambda: {"provider": "openai", "openai": {"model": "gpt-4o-mini-transcribe"}},
)
with patch(
"tools.transcription_tools.transcribe_audio",
return_value={"success": True, "transcript": "привет, проверь статус", "provider": "openai"},
) as mock_transcribe:
result = await runner._prepare_inbound_message_text(event=event, source=source, history=[])
mock_transcribe.assert_called_once_with("/tmp/voice.ogg")
assert "привет, проверь статус" in result
@pytest.mark.asyncio
async def test_voice_stt_uncertain_after_failed_fallback_requests_short_clarification(monkeypatch):
"""If fallback cannot rescue an uncertain transcript, the agent should ask briefly instead of guessing."""
runner = _make_runner(stt_enabled=True)
source = SessionSource(platform=Platform.TELEGRAM, chat_id="1", chat_type="dm")
event = _voice_event("/tmp/voice.ogg")
monkeypatch.setattr(
"tools.transcription_tools._load_stt_config",
lambda: {"provider": "openai", "openai": {"model": "gpt-4o-mini-transcribe"}},
)
def fake_transcribe(path, model=None):
if model == "whisper-1":
return {"success": False, "error": "temporary provider error", "provider": "openai"}
return {"success": True, "transcript": "[inaudible]", "provider": "openai"}
with patch("tools.transcription_tools.transcribe_audio", side_effect=fake_transcribe):
result = await runner._prepare_inbound_message_text(event=event, source=source, history=[])
assert "[inaudible]" in result
assert "short clarification in Russian" in result
# ---------------------------------------------------------------------------
# 2. AUDIO file attachment bypasses STT
# ---------------------------------------------------------------------------

View File

@ -17,6 +17,7 @@ def _make_adapter(**extra_env):
adapter.platform = Platform.TELEGRAM
adapter.config = PlatformConfig(enabled=True, token="fake-token")
adapter._bot = AsyncMock()
adapter._bot.id = 999
adapter._bot.set_message_reaction = AsyncMock()
return adapter
@ -315,3 +316,79 @@ def test_config_reactions_env_takes_precedence(monkeypatch, tmp_path):
import os
assert os.getenv("TELEGRAM_REACTIONS") == "false"
# ── inbound user reactions ───────────────────────────────────────────
def _make_reaction_update(*, emoji="👎", old_emoji=None, user_id=42, bot_id=999):
new_reaction = [] if emoji is None else [SimpleNamespace(emoji=emoji)]
old_reaction = [] if old_emoji is None else [SimpleNamespace(emoji=old_emoji)]
return SimpleNamespace(
update_id=777,
message_reaction=SimpleNamespace(
chat=SimpleNamespace(id=123, type="private", title=None),
user=SimpleNamespace(id=user_id, first_name="Anton", username="toxblh"),
message_id=456,
message_thread_id=None,
new_reaction=new_reaction,
old_reaction=old_reaction,
),
)
@pytest.mark.asyncio
async def test_inbound_negative_reaction_becomes_mimo_feedback():
adapter = _make_adapter()
adapter._is_callback_user_authorized = lambda *args, **kwargs: True
adapter.handle_message = AsyncMock()
await adapter._handle_message_reaction(_make_reaction_update(emoji="👎"), None)
adapter.handle_message.assert_awaited_once()
event = adapter.handle_message.await_args.args[0]
assert event.text == (
"[Telegram reaction feedback: 👎 on message 456. "
"Interpretation: negative feedback / мимо. "
"If the target was a cron/news/digest item, use this as taste feedback.]"
)
assert event.source.chat_id == "123"
assert event.source.user_id == "42"
assert event.source.message_id == "456"
assert event.message_id == "456"
assert event.platform_update_id == 777
@pytest.mark.asyncio
async def test_inbound_positive_reaction_becomes_godno_feedback():
adapter = _make_adapter()
adapter._is_callback_user_authorized = lambda *args, **kwargs: True
adapter.handle_message = AsyncMock()
await adapter._handle_message_reaction(_make_reaction_update(emoji="👌"), None)
event = adapter.handle_message.await_args.args[0]
assert "Telegram reaction feedback: 👌" in event.text
assert "Interpretation: positive feedback / годно" in event.text
@pytest.mark.asyncio
async def test_inbound_reaction_removal_is_ignored():
adapter = _make_adapter()
adapter._is_callback_user_authorized = lambda *args, **kwargs: True
adapter.handle_message = AsyncMock()
await adapter._handle_message_reaction(_make_reaction_update(emoji=None, old_emoji="👎"), None)
adapter.handle_message.assert_not_awaited()
@pytest.mark.asyncio
async def test_inbound_bot_own_reaction_is_ignored():
adapter = _make_adapter()
adapter._is_callback_user_authorized = lambda *args, **kwargs: True
adapter.handle_message = AsyncMock()
await adapter._handle_message_reaction(_make_reaction_update(emoji="👎", user_id=999), None)
adapter.handle_message.assert_not_awaited()

View File

@ -0,0 +1,111 @@
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import Platform
from gateway.platforms.base import MessageEvent, MessageType, SendResult
from gateway.session import SessionSource
def _voice_event(thread_id=None) -> MessageEvent:
return MessageEvent(
text="",
message_type=MessageType.VOICE,
source=SessionSource(
platform=Platform.TELEGRAM,
chat_id="123",
chat_type="dm",
thread_id=thread_id,
message_id="42",
),
message_id="42",
)
@pytest.mark.asyncio
async def test_pre_stt_ack_is_best_effort_and_platform_side():
from gateway.platforms.telegram import TelegramAdapter
adapter = TelegramAdapter.__new__(TelegramAdapter)
adapter.send = AsyncMock(return_value=SendResult(success=True, message_id="99"))
event = _voice_event()
await adapter._send_voice_pre_stt_ack(event)
adapter.send.assert_awaited_once_with(
"123",
"🎧 получил, распознаю",
reply_to="42",
metadata=None,
)
@pytest.mark.asyncio
async def test_pre_stt_ack_failure_does_not_raise():
from gateway.platforms.telegram import TelegramAdapter
adapter = TelegramAdapter.__new__(TelegramAdapter)
adapter.send = AsyncMock(side_effect=RuntimeError("telegram is temporarily unavailable"))
await adapter._send_voice_pre_stt_ack(_voice_event())
adapter.send.assert_awaited_once()
@pytest.mark.asyncio
async def test_voice_handler_sends_ack_before_dispatching_to_gateway():
from gateway.platforms.telegram import TelegramAdapter
adapter = TelegramAdapter.__new__(TelegramAdapter)
adapter.platform = Platform.TELEGRAM
adapter.config = MagicMock()
adapter.config.extra = {}
order = []
adapter._should_process_message = lambda message: True
adapter._apply_telegram_group_observe_attribution = lambda event: event
adapter._send_voice_pre_stt_ack = AsyncMock(side_effect=lambda event: order.append("ack"))
adapter.handle_message = AsyncMock(side_effect=lambda event: order.append("handle"))
file_obj = MagicMock()
file_obj.download_as_bytearray = AsyncMock(return_value=bytearray(b"ogg-data"))
voice = MagicMock()
voice.get_file = AsyncMock(return_value=file_obj)
chat = MagicMock()
chat.id = 123
chat.type = "private"
chat.title = None
chat.full_name = "Тохыч"
user = MagicMock()
user.id = 456
user.full_name = "Тохыч"
msg = MagicMock()
msg.chat = chat
msg.from_user = user
msg.message_id = 42
msg.message_thread_id = None
msg.is_topic_message = False
msg.date = None
msg.caption = None
msg.text = ""
msg.reply_to_message = None
msg.sticker = None
msg.photo = []
msg.video = None
msg.audio = None
msg.voice = voice
msg.document = None
msg.media_group_id = None
update = MagicMock()
update.message = msg
update.update_id = 777
with patch("gateway.platforms.telegram.cache_audio_from_bytes", return_value="/tmp/voice.ogg"):
await adapter._handle_media_message(update, MagicMock())
assert order == ["ack", "handle"]
handled_event = adapter.handle_message.await_args.args[0]
assert handled_event.media_urls == ["/tmp/voice.ogg"]
assert handled_event.media_types == ["audio/ogg"]

View File

@ -0,0 +1,102 @@
"""Tests for the experimental semantic_rle context engine plugin."""
from __future__ import annotations
from agent.context_engine import ContextEngine
from plugins.context_engine import discover_context_engines, load_context_engine
from plugins.context_engine.semantic_rle import SemanticRLEEngine
def _messages() -> list[dict[str, str]]:
return [
{"role": "system", "content": "You are Hermes."},
{"role": "user", "content": "server alpha.example is the current deployment target"},
{"role": "assistant", "content": "Noted: server alpha.example."},
{"role": "user", "content": "decision: use postgres for the ledger"},
{"role": "assistant", "content": "I will use postgres."},
{"role": "user", "content": "todo: compare misses against baseline"},
{"role": "assistant", "content": "Added comparison todo."},
{"role": "user", "content": "unresolved question: how often to compact?"},
{"role": "assistant", "content": "We can measure compaction cadence."},
{"role": "user", "content": "server beta.example is the current deployment target now"},
{"role": "assistant", "content": "Switched to beta.example."},
{"role": "user", "content": "hot tail user message"},
{"role": "assistant", "content": "hot tail assistant message"},
]
def test_semantic_rle_satisfies_abc_and_returns_valid_messages():
engine = SemanticRLEEngine(hot_tail_messages=4)
assert isinstance(engine, ContextEngine)
assert engine.name == "semantic_rle"
result = engine.compress(_messages())
assert isinstance(result, list)
assert all("role" in message and "content" in message for message in result)
assert result[0]["role"] == "system"
assert any("Semantic RLE context ledger" in message["content"] for message in result)
def test_hot_tail_preserved_verbatim():
engine = SemanticRLEEngine(hot_tail_messages=4)
messages = _messages()
result = engine.compress(messages)
assert result[-4:] == messages[-4:]
def test_server_supersession_marks_old_fact_inactive():
engine = SemanticRLEEngine(hot_tail_messages=2)
result = engine.compress(_messages())
summary = "\n".join(str(message["content"]) for message in result if message["role"] == "system")
assert "server: beta.example" in summary
assert "[superseded by beta.example] server: alpha.example" in summary
def test_fake_tokens_and_ip_like_strings_are_redacted_to_refs():
engine = SemanticRLEEngine(hot_tail_messages=2)
fake_token = "sk-test_abcdefghijklmnopqrstuvwxyz1234567890"
fake_pat = "ghp_abcdefghijklmnopqrstuvwxyz1234567890"
messages = [
{"role": "user", "content": f"api_key={fake_token} server 203.0.113.9"},
{"role": "assistant", "content": f"token: {fake_pat}"},
{"role": "user", "content": "tail one"},
{"role": "assistant", "content": "tail two"},
{"role": "user", "content": "tail three"},
]
result = engine.compress(messages)
rendered = "\n".join(str(message["content"]) for message in result)
assert fake_token not in rendered
assert fake_pat not in rendered
assert "203.0.113.9" not in rendered
assert "credential_ref:credential:" in rendered
assert "[REDACTED_IP]" in rendered
def test_deterministic_failure_path_returns_original_messages(monkeypatch):
engine = SemanticRLEEngine(hot_tail_messages=2)
messages = _messages()
def explode(*args, **kwargs):
raise RuntimeError("boom")
monkeypatch.setattr(engine, "_build_ledger", explode)
assert engine.compress(messages) == messages
def test_plugin_discovery_and_explicit_load_without_global_activation():
discovered = {name: (description, available) for name, description, available in discover_context_engines()}
assert "semantic_rle" in discovered
assert discovered["semantic_rle"][1] is True
engine = load_context_engine("semantic_rle")
assert isinstance(engine, SemanticRLEEngine)
assert engine.name == "semantic_rle"

View File

@ -0,0 +1,39 @@
"""Tests for scripts/semantic_rle_eval.py."""
from __future__ import annotations
import importlib.util
import sys
from pathlib import Path
def _load_eval_module():
path = Path(__file__).resolve().parents[2] / "scripts" / "semantic_rle_eval.py"
spec = importlib.util.spec_from_file_location("semantic_rle_eval", path)
assert spec and spec.loader
module = importlib.util.module_from_spec(spec)
sys.modules["semantic_rle_eval"] = module
spec.loader.exec_module(module)
return module
def test_semantic_rle_eval_reports_baseline_misses_and_semantic_passes():
module = _load_eval_module()
report = module.run_eval()
assert report["summary"]["semantic_rle"] == {"passed": 12, "total": 12}
assert report["summary"]["hot_tail_only_baseline"]["passed"] < 12
semantic_rows = [row for row in report["results"] if row["engine"] == "semantic_rle"]
assert all(row["misses"] == [] for row in semantic_rows)
baseline_misses = {
miss
for row in report["results"]
if row["engine"] == "hot_tail_only_baseline"
for miss in row["misses"]
}
assert "current_fact_retained" in baseline_misses
assert "obligation_retained" in baseline_misses
assert "credential_ref_present" in baseline_misses