fix(matrix): warn on clock-skew silent message drops (#12614) (#27330)

The 5-second startup-grace filter in _on_room_message silently drops
events where event_ts < startup_ts - 5. When the host clock is set
ahead of real time, the comparison flips against every live event and
the bot 'connects but never replies' — exactly the symptom in #12614.

Reporter Schnurzel700 chased this for several weeks before tracing it
to their Debian VM's clock being out of sync. The current /1000.0
millisecond->second conversion is correct (mautrix returns ms); the
failure mode is purely environmental.

Add a one-shot WARNING that fires when:
  - we are >30s past startup (initial-sync replay window closed), AND
  - 3 consecutive drops share the same skew within 60s (a constant
    clock offset, not varied-age backfill from an invited room).

State is reset in connect() so reconnects after fixing NTP rearm the
detector. Includes the NTP fix instruction in the warning message
itself and a new Troubleshooting entry in the Matrix docs.

5 new tests cover the happy path, initial-sync backfill, under-
threshold drops, varied-age backfill, and the reconnect rearm path.
This commit is contained in:
kshitij 2026-05-17 00:28:24 -07:00 committed by GitHub
parent 56ad30de17
commit 519657aa98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 280 additions and 0 deletions

View File

@ -348,6 +348,17 @@ class MatrixAdapter(BasePlatformAdapter):
self._sync_task: Optional[asyncio.Task] = None
self._closing = False
self._startup_ts: float = 0.0
# Clock-skew detection: count grace-check drops that happen well
# after startup (i.e. not initial-sync backfill). If the host's
# system clock is set ahead of real time, the startup grace check
# `event_ts < startup_ts - 5` silently drops every live message.
# See #12614 — the symptom is "bot joins rooms but never replies".
# Drops only count when their skew matches the first sampled drop
# (within 60s), so varied-age backfill from freshly-invited rooms
# doesn't trip the heuristic.
self._late_grace_drops: int = 0
self._late_grace_skew: float = 0.0
self._clock_skew_warned: bool = False
# Cache: room_id → bool (is DM)
self._dm_rooms: Dict[str, bool] = {}
@ -842,6 +853,11 @@ class MatrixAdapter(BasePlatformAdapter):
# Initial sync to catch up, then start background sync.
self._startup_ts = time.time()
# Reset clock-skew detector for each connect cycle so a reconnect
# after the user fixes NTP doesn't inherit stale counters.
self._late_grace_drops = 0
self._late_grace_skew = 0.0
self._clock_skew_warned = False
self._closing = False
try:
@ -1542,6 +1558,49 @@ class MatrixAdapter(BasePlatformAdapter):
)
event_ts = raw_ts / 1000.0 if raw_ts else 0.0
if event_ts and event_ts < self._startup_ts - _STARTUP_GRACE_SECONDS:
# If we are well past startup but events are still being dropped
# by the grace check, the host clock is probably set ahead of
# real time — every live event then looks "older than startup".
# Warn once so users can fix NTP instead of chasing a ghost.
# See #12614 (Schnurzel700, April 2026).
#
# Filter out backfill (events legitimately old) by requiring:
# - we are >30s past startup (initial-sync replay window closed)
# - the skew is *consistent* across consecutive drops, which is
# the signature of a constant clock offset rather than a
# variable-age room history. Backfill from a freshly invited
# room can deliver events spanning hours/days — those skews
# will be all over the place and reset the counter.
if not self._clock_skew_warned and (
time.time() - self._startup_ts > 30
):
skew = self._startup_ts - event_ts
# Sanity bound: malformed events with negative or absurd
# timestamps shouldn't count.
if 5 < skew < 86400:
if self._late_grace_drops == 0:
self._late_grace_skew = skew
self._late_grace_drops = 1
elif abs(skew - self._late_grace_skew) < 60:
# Consistent offset → likely real clock skew.
self._late_grace_drops += 1
else:
# Varied skew → likely backfill, restart sampling.
self._late_grace_skew = skew
self._late_grace_drops = 1
if self._late_grace_drops >= 3:
logger.warning(
"Matrix: dropped %d consecutive live events as "
"'too old' more than 30s after startup (skew "
"%.0fs). The host system clock is likely set "
"ahead of real time, which causes the startup "
"grace filter to silently discard every incoming "
"message. Run `timedatectl set-ntp true` (or "
"sync NTP) and restart the bot.",
self._late_grace_drops,
skew,
)
self._clock_skew_warned = True
return
# Extract content from the event.

View File

@ -2257,6 +2257,210 @@ class TestMatrixOnRoomMessageFilter:
ev = self._mk_event(sender="@alice:example.org", body="hello bot")
await self.adapter._on_room_message(ev)
self.adapter._handle_text_message.assert_awaited_once()
class TestMatrixClockSkewWarning:
"""Clock-skew detector for #12614.
Reporter's host clock was set ~2 hours ahead of real time. The grace
filter `event_ts < startup_ts - 5` then drops every live event because
server timestamps look "older than startup". When this happens well
after startup (>30s), the adapter logs a one-shot WARNING pointing the
user at NTP instead of failing silently.
"""
def setup_method(self):
self.adapter = _make_adapter()
self.adapter._user_id = "@bot:example.org"
self.adapter._handle_text_message = AsyncMock()
self.adapter._handle_media_message = AsyncMock()
@staticmethod
def _mk_event(sender, ts_ms, event_id=None):
ev = MagicMock()
ev.room_id = "!room:example.org"
ev.sender = sender
ev.event_id = event_id or f"$evt-{sender}-{ts_ms}"
ev.timestamp = ts_ms
ev.server_timestamp = ts_ms
ev.content = {"msgtype": "m.text", "body": "hi"}
return ev
@pytest.mark.asyncio
async def test_late_drops_emit_one_shot_clock_skew_warning(self, caplog):
import logging
import time as _t
# Simulate the reporter's environment: host clock is ~2 hours ahead
# of server time. Startup happened "in the future" relative to the
# real-world events we're now receiving.
now = _t.time()
self.adapter._startup_ts = now - 60 # bot started 60s ago (wall clock)
# Server events are dated 2h before startup_ts (skewed clock).
skewed_event_ts_ms = int((self.adapter._startup_ts - 7200) * 1000)
with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"):
for i in range(5):
ev = self._mk_event(
sender=f"@alice{i}:example.org", ts_ms=skewed_event_ts_ms
)
await self.adapter._on_room_message(ev)
# Handler should never be invoked — all events failed the grace check.
self.adapter._handle_text_message.assert_not_called()
# Exactly one WARNING from THIS logger should be emitted. Filter by
# logger name so unrelated stdlib/library warnings can't satisfy the
# assertion.
skew_warnings = [
r for r in caplog.records
if r.name == "gateway.platforms.matrix"
and r.levelname == "WARNING"
and "set-ntp" in r.getMessage()
]
assert len(skew_warnings) == 1, (
f"expected exactly 1 clock-skew warning, got {len(skew_warnings)}"
)
msg = skew_warnings[0].getMessage()
assert "7200" in msg, f"skew value missing from message: {msg!r}"
# Pin the counter so a regression in the gating logic (e.g. warning
# at threshold 1 or 5, or not stopping after warn) is caught.
assert self.adapter._late_grace_drops == 3
assert self.adapter._clock_skew_warned is True
@pytest.mark.asyncio
async def test_initial_sync_drops_do_not_warn(self, caplog):
"""During the first 30s after startup, old events are normal backfill."""
import logging
import time as _t
now = _t.time()
# Startup was 1s ago — we're still in the initial-sync window.
self.adapter._startup_ts = now - 1
old_ts_ms = int((self.adapter._startup_ts - 3600) * 1000)
with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"):
for i in range(5):
ev = self._mk_event(
sender=f"@alice{i}:example.org", ts_ms=old_ts_ms
)
await self.adapter._on_room_message(ev)
# Backfill drops are silent — no clock-skew warning fired.
assert self.adapter._clock_skew_warned is False
skew_warnings = [
r for r in caplog.records
if r.name == "gateway.platforms.matrix"
and "set-ntp" in r.getMessage()
]
assert skew_warnings == []
@pytest.mark.asyncio
async def test_fewer_than_three_late_drops_do_not_warn(self, caplog):
"""A single delayed backfill event after 30s shouldn't trigger NTP advice."""
import logging
import time as _t
now = _t.time()
self.adapter._startup_ts = now - 120 # extra slack vs the 30s gate
old_ts_ms = int((self.adapter._startup_ts - 3600) * 1000)
with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"):
for i in range(2): # only 2 late drops — under the threshold
ev = self._mk_event(
sender=f"@alice{i}:example.org", ts_ms=old_ts_ms
)
await self.adapter._on_room_message(ev)
assert self.adapter._late_grace_drops == 2
assert self.adapter._clock_skew_warned is False
@pytest.mark.asyncio
async def test_varied_backfill_skews_do_not_warn(self, caplog):
"""Backfill from a freshly-invited room delivers events of varied age.
A genuine clock-skew bug produces drops with a *constant* offset
(every event is ~X seconds older than wall clock). Joining an old
room post-startup delivers events spanning hours-to-days; those
skews vary wildly and must NOT trigger the NTP warning.
"""
import logging
import time as _t
now = _t.time()
self.adapter._startup_ts = now - 120
# Each event has a different age, ranging from 1h to 30d ago.
ages_in_hours = [1, 24, 168, 720, 4] # 1h, 1d, 1w, 30d, 4h
with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"):
for i, hrs in enumerate(ages_in_hours):
ts_ms = int((self.adapter._startup_ts - hrs * 3600) * 1000)
ev = self._mk_event(
sender=f"@alice{i}:example.org", ts_ms=ts_ms
)
await self.adapter._on_room_message(ev)
# The varied-skew guard should keep the counter from reaching 3.
assert self.adapter._late_grace_drops < 3
assert self.adapter._clock_skew_warned is False
skew_warnings = [
r for r in caplog.records
if r.name == "gateway.platforms.matrix"
and "set-ntp" in r.getMessage()
]
assert skew_warnings == []
@pytest.mark.asyncio
async def test_state_reset_allows_warning_to_fire_again(self, caplog):
"""After the reset block at top of connect() runs, the warning is rearmed.
Reconnect lifecycle: the user fixes NTP, restarts the bot, and the
new connect() call resets _late_grace_drops / _clock_skew_warned at
the top. This test exercises the rearm path by:
1. Tripping the warning once (state: warned=True).
2. Running the same reset block connect() runs.
3. Tripping the warning a second time the second warning should
fire because the state was cleared.
"""
import logging
import time as _t
now = _t.time()
self.adapter._startup_ts = now - 60
skewed_ms = int((self.adapter._startup_ts - 7200) * 1000)
with caplog.at_level(logging.WARNING, logger="gateway.platforms.matrix"):
for i in range(3):
ev = self._mk_event(
sender=f"@alice{i}:example.org", ts_ms=skewed_ms,
event_id=f"$first-{i}",
)
await self.adapter._on_room_message(ev)
assert self.adapter._clock_skew_warned is True
# Mirror the reset block in connect() (matrix.py around line 855).
self.adapter._startup_ts = _t.time() - 60
self.adapter._late_grace_drops = 0
self.adapter._late_grace_skew = 0.0
self.adapter._clock_skew_warned = False
# Same skewed-clock scenario should warn AGAIN after reset.
skewed_ms2 = int((self.adapter._startup_ts - 7200) * 1000)
for i in range(3):
ev = self._mk_event(
sender=f"@bob{i}:example.org", ts_ms=skewed_ms2,
event_id=f"$second-{i}",
)
await self.adapter._on_room_message(ev)
skew_warnings = [
r for r in caplog.records
if r.name == "gateway.platforms.matrix"
and "set-ntp" in r.getMessage()
]
assert len(skew_warnings) == 2, (
f"expected 2 warnings (one per connect cycle), got {len(skew_warnings)}"
)
# ---------------------------------------------------------------------------
# DM auto-thread
# ---------------------------------------------------------------------------

View File

@ -357,6 +357,23 @@ To find a Room ID: in Element, go to the room → **Settings** → **Advanced**
**Fix**: Invite the bot to the room — it auto-joins on invite. Verify your User ID is in `MATRIX_ALLOWED_USERS` (use the full `@user:server` format). Restart the gateway.
### Bot joins rooms but silently drops every message (clock skew)
**Cause**: The host's system clock is set ahead of real time. The Matrix adapter applies a 5-second startup-grace filter (`event_ts < startup_ts - 5`) to ignore events replayed from initial sync. When the wall clock is ahead, every incoming event looks "older than startup" and is dropped before reaching the message handler the bot appears connected but never replies. See [#12614](https://github.com/NousResearch/hermes-agent/issues/12614).
**Symptom**: Gateway log shows `Matrix: dropped N live events as 'too old' more than 30s after startup`.
**Fix**: Sync the host clock with NTP and restart the bot:
```bash
# Debian/Ubuntu
sudo timedatectl set-ntp true
timedatectl status # confirm "System clock synchronized: yes"
# macOS
sudo sntp -sS time.apple.com
```
### "Failed to authenticate" / "whoami failed" on startup
**Cause**: The access token or homeserver URL is incorrect.