fix: address code review findings on pz_parser

- Strip body-prefix severity in normalize_first_line so pattern_id
  is stable across body-prefix vs bracketed-only variants.
- Lookback for inferred attribution now counts raw file lines
  (per spec literal), not body-line budget across entries.
- Document hash truncation (64-bit) and direct-attribution priority.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 15:33:56 +00:00
parent 4fec3a58f6
commit 2e7bebc911
3 changed files with 216 additions and 21 deletions

View File

@@ -162,6 +162,13 @@ DOUBLE_QUOTED_RE = re.compile(r'"[^"]*"')
SINGLE_QUOTED_RE = re.compile(r"'[^']*'") SINGLE_QUOTED_RE = re.compile(r"'[^']*'")
NUMERIC_RUN_RE = re.compile(r"\d{2,}") NUMERIC_RUN_RE = re.compile(r"\d{2,}")
WS_RUN_RE = re.compile(r"\s+") WS_RUN_RE = re.compile(r"\s+")
#: Strips a leading ``ERROR:`` / ``SEVERE:`` / ``WARN:`` / ``FATAL:`` token
#: from a body line so a body that happens to begin with the severity word
#: hashes to the same pattern_id as the bracketed-only variant. Matches the
#: token plus any colon and trailing whitespace; case-insensitive.
SEVERITY_PREFIX_STRIP_RE = re.compile(
r"^\s*(?:ERROR|SEVERE|WARN|FATAL)\s*[:\s]\s*", re.IGNORECASE
)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Dataclasses — match the JSON keys the spec mandates so consumers can # Dataclasses — match the JSON keys the spec mandates so consumers can
@@ -403,12 +410,20 @@ def _entry_text(entry: Entry) -> str:
def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]: def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]:
"""Determine ``(mod_id, mod_name, attribution, confidence, reason)``. """Determine ``(mod_id, mod_name, attribution, confidence, reason)``.
``prior_lookback_lines`` is the up-to-INFERRED_LOOKBACK_LINES raw file ``prior_lookback_lines`` is the body lines from prior entries that fall
lines preceding this entry (used to look up a recent ``Lua((MOD:Y))`` within INFERRED_LOOKBACK_LINES raw-file-line distance from this entry's
marker for inferred attribution). start, in source order. The list is scanned in reverse for the nearest
``Lua((MOD:Y))`` marker when inferred attribution is being attempted.
Direct-attribution priority: Lua marker -> needed-by -> require-failed.
Rationale: ``needed by <mod>`` names the dependent mod (more semantically
targeted) and is preferred over ``require("...") failed`` which only names
the missing module path. ``Lua((MOD:...))`` is unambiguous and wins
outright.
""" """
text = _entry_text(entry) text = _entry_text(entry)
# 1. Direct via Lua((MOD:X)) # 1. Direct via Lua((MOD:X)) — unambiguous; outranks every other signal.
m = LUA_MOD_MARKER_RE.search(text) m = LUA_MOD_MARKER_RE.search(text)
if m: if m:
raw = m.group(1).strip() raw = m.group(1).strip()
@@ -559,12 +574,19 @@ def detect_kind(entry: Entry, attribution: str, body_text: str) -> str:
def normalize_first_line(first: str) -> str: def normalize_first_line(first: str) -> str:
"""Per spec: strip session metadata prefix, flatten quoted strings to """Per spec: strip session metadata prefix, strip any leading severity
``"<S>"`` / ``'<S>'``, flatten ≥2-digit numeric runs to ``<N>``, collapse word (so ``SEVERE: foo`` and ``foo`` produce the same pattern_id when both
whitespace, truncate to 200 chars. are SEVERE-level), flatten quoted strings to ``"<S>"`` / ``'<S>'``, flatten
≥2-digit numeric runs to ``<N>``, collapse whitespace, truncate to 200
chars.
""" """
s = first.strip() s = first.strip()
s = SESSION_META_RE.sub("", s) s = SESSION_META_RE.sub("", s)
# Strip any leading ERROR:/SEVERE:/WARN:/FATAL: that survived in the body
# — the bracketed level already feeds pattern_id separately, so leaving
# the body-prefix in place would fragment signatures across "body has
# SEVERE: prefix" vs "body has no prefix but bracketed level is SEVERE."
s = SEVERITY_PREFIX_STRIP_RE.sub("", s)
s = DOUBLE_QUOTED_RE.sub('"<S>"', s) s = DOUBLE_QUOTED_RE.sub('"<S>"', s)
s = SINGLE_QUOTED_RE.sub("'<S>'", s) s = SINGLE_QUOTED_RE.sub("'<S>'", s)
s = NUMERIC_RUN_RE.sub("<N>", s) s = NUMERIC_RUN_RE.sub("<N>", s)
@@ -573,14 +595,22 @@ def normalize_first_line(first: str) -> str:
def compute_pattern_id(level: str, first_line: str) -> str: def compute_pattern_id(level: str, first_line: str) -> str:
"""``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``.""" """``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``.
16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
trade-off; consumers treat as opaque.
"""
norm = normalize_first_line(first_line) norm = normalize_first_line(first_line)
h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest() h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest()
return f"sha256:{h[:16]}" return f"sha256:{h[:16]}"
def compute_signature(pattern_id: str, mod_id: str) -> str: def compute_signature(pattern_id: str, mod_id: str) -> str:
"""``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``.""" """``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``.
16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
trade-off; consumers treat as opaque.
"""
h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest() h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest()
return f"sha256:{h[:16]}" return f"sha256:{h[:16]}"
@@ -613,23 +643,31 @@ def _build_excerpt(entry: Entry, max_chars: int = 1000) -> str:
def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]: def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]:
"""Flatten body lines from prior entries (most recent first; up to """Collect body lines from prior entries whose ``line_start`` falls within
INFERRED_LOOKBACK_LINES total) for inferred attribution lookback. INFERRED_LOOKBACK_LINES raw-file-line distance from the current entry.
Spec wording is "within the previous 40 lines", measured in raw file lines
(mirrors pzmm's ``(i - last_mod_line) <= 40``, inclusive of 40). Counting
raw lines means a multi-line entry (e.g., a 5-line Java stack trace) does
not shrink the practical window the way a body-line budget would.
We walk backwards through ``entries`` accumulating each entry's body lines
until we've gathered INFERRED_LOOKBACK_LINES lines or run out of entries.
Returned list is in source order (oldest first) so callers can call Returned list is in source order (oldest first) so callers can call
``reversed()`` on it. ``reversed()`` on it.
""" """
if hit_idx <= 0:
return []
threshold = entries[hit_idx].line_start - INFERRED_LOOKBACK_LINES
in_window: list[Entry] = []
for j in range(hit_idx - 1, -1, -1):
prior = entries[j]
if prior.line_start < threshold:
break
in_window.append(prior)
# We accumulated newest-first; reverse so we emit in source order.
in_window.reverse()
collected: list[str] = [] collected: list[str] = []
i = hit_idx - 1 for prior in in_window:
while i >= 0 and len(collected) < INFERRED_LOOKBACK_LINES: collected.extend(prior.body)
for line in reversed(entries[i].body):
collected.append(line)
if len(collected) >= INFERRED_LOOKBACK_LINES:
break
i -= 1
collected.reverse()
return collected return collected

View File

@@ -91,5 +91,130 @@ class NeededByTests(unittest.TestCase):
self.assertEqual(rec.mod_id, "testmodalpha") self.assertEqual(rec.mod_id, "testmodalpha")
def _make_marker_line(idx: int) -> str:
"""Synthesise a single LOG-level entry containing a Lua((MOD:...)) marker."""
# Vary timestamps so the bracketed prefix is unique-ish; not strictly
# required — they only feed Entry.timestamp, not parsing.
return (
f"[16-04-26 00:00:{idx:02d}.000] LOG : General f:0, "
f"t:1776297642{idx:03d}, st:48,648,157,434> "
"Lua((MOD:Test Mod Alpha)) initialised."
)
def _make_filler_line(idx: int) -> str:
"""A plain LOG-level entry with no marker; one raw line."""
return (
f"[16-04-26 00:01:{idx % 60:02d}.000] LOG : General f:0, "
f"t:177629760{idx:04d}, st:48,648,200,178> filler entry {idx}."
)
def _make_error_line() -> str:
"""A Lua-shaped ERROR with no Lua((MOD:...)) marker on the entry itself
— so attribution must come from the lookback window if it comes at all."""
return (
"[16-04-26 00:02:00.000] ERROR: General f:0, "
"t:1776297900000, st:48,648,300,178> "
"LuaManager.GetFunctionObject> no such function: doStuff"
)
class RawLineLookbackTests(unittest.TestCase):
"""Phase 3 — lookback semantics measure raw file lines, not body-line
budgets. Multi-line entries inside the window must not shrink the
practical reach."""
def _write_fixture(self, name: str, lines: list[str]) -> pathlib.Path:
path = FIXTURE_DIR / name
path.write_text("\n".join(lines) + "\n")
return path
def test_marker_exactly_at_lookback_boundary_attributes(self) -> None:
# Marker on line 1, ERROR on line 41 -> raw-line distance = 40
# (inclusive of INFERRED_LOOKBACK_LINES=40 -> still attributed).
lines = [_make_marker_line(0)]
for i in range(1, 40):
lines.append(_make_filler_line(i))
lines.append(_make_error_line()) # line 41 in the fixture
path = self._write_fixture("_rawline_at_boundary.txt", lines)
try:
entries = pz_parser.parse_file(path)
self.assertEqual(entries[0].line_start, 1)
self.assertEqual(entries[-1].line_start, 41)
records = pz_parser.classify_entries(entries, source_file="b1.txt")
self.assertEqual(len(records), 1)
self.assertEqual(records[0].attribution, "inferred")
self.assertEqual(records[0].mod_id, "testmodalpha")
finally:
path.unlink()
def test_marker_one_line_past_boundary_does_not_attribute(self) -> None:
# Marker on line 1, ERROR on line 42 -> raw-line distance = 41
# (just outside INFERRED_LOOKBACK_LINES -> unattributed).
lines = [_make_marker_line(0)]
for i in range(1, 41):
lines.append(_make_filler_line(i))
lines.append(_make_error_line()) # line 42 in the fixture
path = self._write_fixture("_rawline_past_boundary.txt", lines)
try:
entries = pz_parser.parse_file(path)
self.assertEqual(entries[0].line_start, 1)
self.assertEqual(entries[-1].line_start, 42)
records = pz_parser.classify_entries(entries, source_file="b2.txt")
self.assertEqual(len(records), 1)
self.assertEqual(records[0].attribution, "unattributed")
self.assertEqual(records[0].mod_id, "__unattributed__")
finally:
path.unlink()
def test_multiline_entry_does_not_shrink_practical_lookback(self) -> None:
# Layout the file so a multi-line entry sits between marker and ERROR.
# Under the OLD body-line-budget semantics the multi-line entry's 5
# continuation lines would consume the budget and push the marker
# outside the window. Under raw-line semantics the marker on line 1 is
# still within 40 raw lines of the ERROR even though the file has a
# 6-line multi-line entry in between.
lines = [_make_marker_line(0)] # raw line 1: marker entry
# Single-line fillers on raw lines 2..30 (29 entries).
for i in range(1, 30):
lines.append(_make_filler_line(i))
# Multi-line entry: header on raw line 31, 5 continuations on lines
# 32..36 (Java-stack-trace shape).
lines.append(
"[16-04-26 00:01:30.000] LOG : General f:0, "
"t:1776297930000, st:48,648,200,178> stack trace dump"
)
for k in range(5):
lines.append(f"\tat zombie.SomeClass.method{k}(SomeClass.java:{k + 1})")
# Single-line fillers on raw lines 37..40 (4 entries).
for i in range(30, 34):
lines.append(_make_filler_line(i))
# ERROR at raw line 41 -> N - 1 = 40 -> within window.
lines.append(_make_error_line())
path = self._write_fixture("_rawline_multiline.txt", lines)
try:
entries = pz_parser.parse_file(path)
# Sanity-check the layout: first entry at line 1, multi-line entry
# sits at line 31 with 6 body lines (header + 5 continuations),
# ERROR at line 41.
self.assertEqual(entries[0].line_start, 1)
multi = next(
e for e in entries
if e.line_start == 31 and len(e.body) == 6
)
self.assertEqual(multi.line_end, 36)
self.assertEqual(entries[-1].line_start, 41)
records = pz_parser.classify_entries(entries, source_file="ml.txt")
self.assertEqual(len(records), 1)
# Under the OLD body-line-budget rule, the 5 stack-frame lines
# plus the surrounding fillers would have pushed the marker out
# of the budget. Under raw-line semantics it survives.
self.assertEqual(records[0].attribution, "inferred")
self.assertEqual(records[0].mod_id, "testmodalpha")
finally:
path.unlink()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()

View File

@@ -55,5 +55,37 @@ class SignatureUniquenessTests(unittest.TestCase):
self.assertEqual(pat[:7], "sha256:") self.assertEqual(pat[:7], "sha256:")
class SeverityPrefixStripTests(unittest.TestCase):
"""A body line that begins with a literal severity word (``SEVERE:``,
``ERROR:``, ``WARN:``, ``FATAL:``) should not fragment pattern_id away
from the otherwise-identical body that lacks the prefix. The bracketed
level already feeds pattern_id; the prefix is redundant and varies in
practice."""
def test_pattern_id_invariant_under_body_prefix_severe(self) -> None:
# Same logical error: one line carries ``SEVERE: `` body prefix, the
# other doesn't. Both classified as SEVERE by their bracketed level.
with_prefix = pz_parser.compute_pattern_id(
"SEVERE",
"SEVERE: foo at zombie.X(File.java:42)",
)
without_prefix = pz_parser.compute_pattern_id(
"SEVERE",
"foo at zombie.X(File.java:42)",
)
self.assertEqual(with_prefix, without_prefix)
def test_pattern_id_invariant_under_body_prefix_error(self) -> None:
with_prefix = pz_parser.compute_pattern_id(
"ERROR",
"ERROR: doStuff failed in module",
)
without_prefix = pz_parser.compute_pattern_id(
"ERROR",
"doStuff failed in module",
)
self.assertEqual(with_prefix, without_prefix)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()