#!/usr/bin/env python3 """Deterministic smoke-eval for the experimental semantic_rle context engine. This is not an LLM quality benchmark. It checks the minimum invariant we need before trying live Telegram replay: after cold-history compaction, the model-visible context still contains current facts/obligations, marks stale facts as superseded, preserves the hot tail byte-for-byte, and does not leak raw fake secrets from cold turns. """ from __future__ import annotations import argparse import json import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Callable REPO_ROOT = Path(__file__).resolve().parents[1] if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) from plugins.context_engine.semantic_rle import SemanticRLEEngine # noqa: E402 Message = dict[str, Any] @dataclass(frozen=True) class Scenario: name: str messages: list[Message] hot_tail_messages: int checks: dict[str, Callable[[list[Message]], bool]] def _render(messages: list[Message]) -> str: return "\n".join(str(message.get("content", "")) for message in messages) def _hot_tail_only(messages: list[Message], hot_tail_messages: int) -> list[Message]: system_head = [dict(m) for m in messages if m.get("role") == "system"] non_system = [dict(m) for m in messages if m.get("role") != "system"] return [*system_head, *non_system[-hot_tail_messages:]] def _semantic_rle(messages: list[Message], hot_tail_messages: int) -> list[Message]: return SemanticRLEEngine(hot_tail_messages=hot_tail_messages).compress(messages) def _scenarios() -> list[Scenario]: fake_token = "sk-test1234567890abcdef1234567890" fake_pat = "ghp_abcdef1234567890abcdef1234567890" ip = "203.0.113.42" supersession_messages = [ {"role": "system", "content": "You are Hermes."}, {"role": "user", "content": "server alpha.example is the current deployment target"}, {"role": "assistant", "content": "Noted: server alpha.example."}, {"role": "user", "content": "decision: use postgres for the ledger"}, {"role": "assistant", "content": "I will use postgres."}, {"role": "user", "content": "todo: compare misses against baseline"}, {"role": "assistant", "content": "Added comparison todo."}, {"role": "user", "content": "unresolved question: how often to compact?"}, {"role": "assistant", "content": "We can measure compaction cadence."}, {"role": "user", "content": "server beta.example is the current deployment target now"}, {"role": "assistant", "content": "Switched to beta.example."}, {"role": "user", "content": "hot tail user message"}, {"role": "assistant", "content": "hot tail assistant message"}, ] secret_messages = [ {"role": "system", "content": "You are Hermes."}, {"role": "user", "content": f"api_key={fake_token} server {ip}"}, {"role": "assistant", "content": f"token: {fake_pat}"}, {"role": "user", "content": "todo: rotate the credential reference"}, {"role": "assistant", "content": "Will track credential refs only."}, {"role": "user", "content": "hot tail one"}, {"role": "assistant", "content": "hot tail two"}, ] return [ Scenario( name="supersession_and_obligation", messages=supersession_messages, hot_tail_messages=2, checks={ "current_fact_retained": lambda m: "server: beta.example" in _render(m), "old_fact_marked_superseded": lambda m: "[superseded by beta.example] server: alpha.example" in _render(m), "decision_retained": lambda m: "use postgres" in _render(m), "obligation_retained": lambda m: "compare misses against baseline" in _render(m), "question_retained": lambda m: "how often to compact" in _render(m), "hot_tail_preserved": lambda m: m[-2:] == supersession_messages[-2:], }, ), Scenario( name="cold_secret_redaction", messages=secret_messages, hot_tail_messages=2, checks={ "raw_fake_token_absent": lambda m: fake_token not in _render(m) and fake_pat not in _render(m), "raw_ip_absent": lambda m: ip not in _render(m), "credential_ref_present": lambda m: "credential_ref:credential:" in _render(m), "ip_redacted_marker_present": lambda m: "[REDACTED_IP]" in _render(m), "obligation_retained": lambda m: "rotate the credential reference" in _render(m), "hot_tail_preserved": lambda m: m[-2:] == secret_messages[-2:], }, ), ] def run_eval() -> dict[str, Any]: engines: dict[str, Callable[[list[Message], int], list[Message]]] = { "hot_tail_only_baseline": _hot_tail_only, "semantic_rle": _semantic_rle, } results: list[dict[str, Any]] = [] for scenario in _scenarios(): for engine_name, engine_fn in engines.items(): compacted = engine_fn(scenario.messages, scenario.hot_tail_messages) checks = {name: bool(check(compacted)) for name, check in scenario.checks.items()} passed = sum(1 for ok in checks.values() if ok) results.append( { "scenario": scenario.name, "engine": engine_name, "passed": passed, "total": len(checks), "misses": [name for name, ok in checks.items() if not ok], "message_count_before": len(scenario.messages), "message_count_after": len(compacted), "char_count_after": len(_render(compacted)), } ) by_engine: dict[str, dict[str, int]] = {} for row in results: aggregate = by_engine.setdefault(row["engine"], {"passed": 0, "total": 0}) aggregate["passed"] += int(row["passed"]) aggregate["total"] += int(row["total"]) return {"results": results, "summary": by_engine} def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--json", action="store_true", help="print machine-readable JSON") args = parser.parse_args() report = run_eval() if args.json: print(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)) else: print("Semantic RLE deterministic smoke-eval") for engine, row in report["summary"].items(): print(f"- {engine}: {row['passed']}/{row['total']} invariant checks passed") print("Details:") for row in report["results"]: misses = ", ".join(row["misses"]) or "none" print( f"- {row['scenario']} / {row['engine']}: " f"{row['passed']}/{row['total']}, misses={misses}, " f"messages {row['message_count_before']}→{row['message_count_after']}, " f"chars_after={row['char_count_after']}" ) return 0 if __name__ == "__main__": raise SystemExit(main())