#!/usr/bin/env python3
"""Deterministic smoke-eval for the experimental semantic_rle context engine.

This is not an LLM quality benchmark. It checks the minimum invariant we need
before trying live Telegram replay: after cold-history compaction, the
model-visible context still contains current facts/obligations, marks stale
facts as superseded, preserves the hot tail byte-for-byte, and does not leak
raw fake secrets from cold turns.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any, Callable
|
|
|
|
# Directory one level above the folder that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Put that directory at the front of sys.path (once) so the absolute
# ``plugins.*`` import below resolves when this file is run as a script.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
|
|
|
|
from plugins.context_engine.semantic_rle import SemanticRLEEngine # noqa: E402
|
|
|
|
# A chat message as used throughout this file: a plain dict read via
# ``.get("role")`` and ``.get("content")``.
Message = dict[str, Any]
|
|
|
|
|
|
@dataclass(frozen=True)
class Scenario:
    """One deterministic eval case: a transcript plus the invariants to check.

    The dataclass is frozen, so fields cannot be rebound after construction
    (the list and dict they refer to are still ordinary mutable objects).
    """

    # Identifier reported as "scenario" in each result row.
    name: str
    # Full conversation handed to every engine under test.
    messages: list[Message]
    # Passed to each engine as its second argument.
    hot_tail_messages: int
    # Check name -> predicate evaluated on an engine's output messages.
    checks: dict[str, Callable[[list[Message]], bool]]
|
|
|
|
|
|
def _render(messages: list[Message]) -> str:
    """Flatten messages into one newline-separated string of their contents.

    Args:
        messages: Message dicts; only the ``"content"`` key is read.

    Returns:
        Each message's content converted with ``str`` and joined by ``"\\n"``.
        A message whose content is missing or explicitly ``None`` contributes
        an empty string.
    """
    parts: list[str] = []
    for message in messages:
        content = message.get("content")
        # An explicit ``None`` must not be rendered as the literal text
        # "None": that would pollute substring checks and character counts.
        parts.append("" if content is None else str(content))
    return "\n".join(parts)
|
|
|
|
|
|
def _hot_tail_only(messages: list[Message], hot_tail_messages: int) -> list[Message]:
    """Baseline engine: keep system messages plus the last N other messages.

    Args:
        messages: Conversation to trim; it is not modified.
        hot_tail_messages: How many trailing non-system messages to keep.
            Zero or a negative value keeps none of them.

    Returns:
        A new list with shallow copies of every system message (in their
        original relative order) followed by shallow copies of the kept tail.
    """
    system_head: list[Message] = []
    non_system: list[Message] = []
    for message in messages:
        bucket = system_head if message.get("role") == "system" else non_system
        # Copy each dict so the result never aliases the caller's messages.
        bucket.append(dict(message))

    # ``seq[-0:]`` is the *whole* sequence, so a zero-sized tail has to be
    # handled explicitly instead of being expressed as a negative slice.
    hot_tail = non_system[-hot_tail_messages:] if hot_tail_messages > 0 else []
    return [*system_head, *hot_tail]
|
|
|
|
|
|
def _semantic_rle(messages: list[Message], hot_tail_messages: int) -> list[Message]:
    """Engine under test: compress *messages* with a fresh ``SemanticRLEEngine``.

    A new engine is constructed on every call with the given
    ``hot_tail_messages`` and its ``compress`` result is returned as-is.
    """
    engine = SemanticRLEEngine(hot_tail_messages=hot_tail_messages)
    return engine.compress(messages)
|
|
|
|
|
|
def _scenarios() -> list[Scenario]:
    """Build the fixed scenarios evaluated by the smoke-eval.

    Two scenarios are returned, each with a two-message hot tail:

    * ``supersession_and_obligation`` - a server fact is replaced by a newer
      one while a decision, a todo and an open question are stated earlier.
    * ``cold_secret_redaction`` - fake credentials and an IP address appear
      in the turns that precede the hot tail.

    Every check is a predicate over an engine's output message list.
    """
    fake_token = "sk-test1234567890abcdef1234567890"
    fake_pat = "ghp_abcdef1234567890abcdef1234567890"
    ip = "203.0.113.42"

    def turn(role: str, content: str) -> Message:
        # Build one message dict (keys in role, content order).
        return {"role": role, "content": content}

    def contains(needle: str) -> Callable[[list[Message]], bool]:
        # Predicate: the rendered output includes *needle*.
        return lambda output: needle in _render(output)

    def lacks(*needles: str) -> Callable[[list[Message]], bool]:
        # Predicate: none of *needles* occurs in the rendered output.
        def check(output: list[Message]) -> bool:
            rendered = _render(output)
            return not any(needle in rendered for needle in needles)

        return check

    def tail_matches(source: list[Message]) -> Callable[[list[Message]], bool]:
        # Predicate: the last two output messages equal the last two of *source*
        # (the slice of *source* is taken when the check runs, not here).
        return lambda output: output[-2:] == source[-2:]

    supersession_messages = [
        turn("system", "You are Hermes."),
        turn("user", "server alpha.example is the current deployment target"),
        turn("assistant", "Noted: server alpha.example."),
        turn("user", "decision: use postgres for the ledger"),
        turn("assistant", "I will use postgres."),
        turn("user", "todo: compare misses against baseline"),
        turn("assistant", "Added comparison todo."),
        turn("user", "unresolved question: how often to compact?"),
        turn("assistant", "We can measure compaction cadence."),
        turn("user", "server beta.example is the current deployment target now"),
        turn("assistant", "Switched to beta.example."),
        turn("user", "hot tail user message"),
        turn("assistant", "hot tail assistant message"),
    ]

    secret_messages = [
        turn("system", "You are Hermes."),
        turn("user", f"api_key={fake_token} server {ip}"),
        turn("assistant", f"token: {fake_pat}"),
        turn("user", "todo: rotate the credential reference"),
        turn("assistant", "Will track credential refs only."),
        turn("user", "hot tail one"),
        turn("assistant", "hot tail two"),
    ]

    supersession = Scenario(
        name="supersession_and_obligation",
        messages=supersession_messages,
        hot_tail_messages=2,
        checks={
            "current_fact_retained": contains("server: beta.example"),
            "old_fact_marked_superseded": contains(
                "[superseded by beta.example] server: alpha.example"
            ),
            "decision_retained": contains("use postgres"),
            "obligation_retained": contains("compare misses against baseline"),
            "question_retained": contains("how often to compact"),
            "hot_tail_preserved": tail_matches(supersession_messages),
        },
    )

    redaction = Scenario(
        name="cold_secret_redaction",
        messages=secret_messages,
        hot_tail_messages=2,
        checks={
            "raw_fake_token_absent": lacks(fake_token, fake_pat),
            "raw_ip_absent": lacks(ip),
            "credential_ref_present": contains("credential_ref:credential:"),
            "ip_redacted_marker_present": contains("[REDACTED_IP]"),
            "obligation_retained": contains("rotate the credential reference"),
            "hot_tail_preserved": tail_matches(secret_messages),
        },
    )

    return [supersession, redaction]
|
|
|
|
|
|
def run_eval() -> dict[str, Any]:
    """Run every engine against every scenario and tally the invariant checks.

    Returns:
        A dict with two keys:

        * ``"results"`` - one row per (scenario, engine) pair, in scenario
          order then engine order, holding the pass count, the total number
          of checks, the names of missed checks, and message/character
          counts.
        * ``"summary"`` - per-engine ``{"passed": int, "total": int}``
          accumulated over all scenarios.
    """
    import copy  # function-scope import: only this function needs it

    engines: dict[str, Callable[[list[Message], int], list[Message]]] = {
        "hot_tail_only_baseline": _hot_tail_only,
        "semantic_rle": _semantic_rle,
    }
    results: list[dict[str, Any]] = []
    by_engine: dict[str, dict[str, int]] = {}

    for scenario in _scenarios():
        for engine_name, engine_fn in engines.items():
            # Give each engine a private deep copy of the transcript. The
            # scenario's checks compare against the original message list, so
            # an engine that edited its input in place would otherwise change
            # both the next engine's input and the reference the hot-tail
            # check is compared with.
            transcript = copy.deepcopy(scenario.messages)
            compacted = engine_fn(transcript, scenario.hot_tail_messages)

            checks = {name: bool(check(compacted)) for name, check in scenario.checks.items()}
            passed = sum(checks.values())
            total = len(checks)

            results.append(
                {
                    "scenario": scenario.name,
                    "engine": engine_name,
                    "passed": passed,
                    "total": total,
                    "misses": [name for name, ok in checks.items() if not ok],
                    "message_count_before": len(scenario.messages),
                    "message_count_after": len(compacted),
                    "char_count_after": len(_render(compacted)),
                }
            )

            aggregate = by_engine.setdefault(engine_name, {"passed": 0, "total": 0})
            aggregate["passed"] += passed
            aggregate["total"] += total

    return {"results": results, "summary": by_engine}
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: run the eval and print the report.

    With ``--json`` the full report is printed as indented, key-sorted JSON;
    otherwise a per-engine summary followed by one detail line per
    scenario/engine pair is printed.

    Returns:
        Always ``0``; missed checks are reported in the output only.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--json", action="store_true", help="print machine-readable JSON")
    options = cli.parse_args()

    report = run_eval()

    if options.json:
        print(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True))
        return 0

    print("Semantic RLE deterministic smoke-eval")
    for engine, totals in report["summary"].items():
        print(f"- {engine}: {totals['passed']}/{totals['total']} invariant checks passed")

    print("Details:")
    for row in report["results"]:
        misses = ", ".join(row["misses"]) or "none"
        label = f"- {row['scenario']} / {row['engine']}: "
        tally = f"{row['passed']}/{row['total']}, misses={misses}, "
        sizes = (
            f"messages {row['message_count_before']}→{row['message_count_after']}, "
            f"chars_after={row['char_count_after']}"
        )
        print(label + tally + sizes)
    return 0
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|