From 2e7bebc911548ce798d2b5eff06240e0f90ae205 Mon Sep 17 00:00:00 2001 From: indifferentketchup Date: Mon, 4 May 2026 15:33:56 +0000 Subject: [PATCH] fix: address code review findings on pz_parser - Strip body-prefix severity in normalize_first_line so pattern_id is stable across body-prefix vs bracketed-only variants. - Lookback for inferred attribution now counts raw file lines (per spec literal), not body-line budget across entries. - Document hash truncation (64-bit) and direct-attribution priority. Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/pz-analyzer/pz_parser.py | 80 +++++++++---- tools/pz-analyzer/tests/test_attribution.py | 125 ++++++++++++++++++++ tools/pz-analyzer/tests/test_signatures.py | 32 +++++ 3 files changed, 216 insertions(+), 21 deletions(-) diff --git a/tools/pz-analyzer/pz_parser.py b/tools/pz-analyzer/pz_parser.py index ecb460c..b4463fa 100644 --- a/tools/pz-analyzer/pz_parser.py +++ b/tools/pz-analyzer/pz_parser.py @@ -162,6 +162,13 @@ DOUBLE_QUOTED_RE = re.compile(r'"[^"]*"') SINGLE_QUOTED_RE = re.compile(r"'[^']*'") NUMERIC_RUN_RE = re.compile(r"\d{2,}") WS_RUN_RE = re.compile(r"\s+") +#: Strips a leading ``ERROR:`` / ``SEVERE:`` / ``WARN:`` / ``FATAL:`` token +#: from a body line so a body that happens to begin with the severity word +#: hashes to the same pattern_id as the bracketed-only variant. Matches the +#: token plus any colon and trailing whitespace; case-insensitive. +SEVERITY_PREFIX_STRIP_RE = re.compile( + r"^\s*(?:ERROR|SEVERE|WARN|FATAL)\s*[:\s]\s*", re.IGNORECASE +) # --------------------------------------------------------------------------- # Dataclasses — match the JSON keys the spec mandates so consumers can @@ -403,12 +410,20 @@ def _entry_text(entry: Entry) -> str: def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]: """Determine ``(mod_id, mod_name, attribution, confidence, reason)``. - ``prior_lookback_lines`` is the up-to-INFERRED_LOOKBACK_LINES raw file - lines preceding this entry (used to look up a recent ``Lua((MOD:Y))`` - marker for inferred attribution). + ``prior_lookback_lines`` is the body lines from prior entries that fall + within INFERRED_LOOKBACK_LINES raw-file-line distance from this entry's + start, in source order. The list is scanned in reverse for the nearest + ``Lua((MOD:Y))`` marker when inferred attribution is being attempted. + + Direct-attribution priority: Lua marker -> needed-by -> require-failed. + + Rationale: ``needed by `` names the dependent mod (more semantically + targeted) and is preferred over ``require("...") failed`` which only names + the missing module path. ``Lua((MOD:...))`` is unambiguous and wins + outright. """ text = _entry_text(entry) - # 1. Direct via Lua((MOD:X)) + # 1. Direct via Lua((MOD:X)) — unambiguous; outranks every other signal. m = LUA_MOD_MARKER_RE.search(text) if m: raw = m.group(1).strip() @@ -559,12 +574,19 @@ def detect_kind(entry: Entry, attribution: str, body_text: str) -> str: def normalize_first_line(first: str) -> str: - """Per spec: strip session metadata prefix, flatten quoted strings to - ``""`` / ``''``, flatten ≥2-digit numeric runs to ````, collapse - whitespace, truncate to 200 chars. + """Per spec: strip session metadata prefix, strip any leading severity + word (so ``SEVERE: foo`` and ``foo`` produce the same pattern_id when both + are SEVERE-level), flatten quoted strings to ``""`` / ``''``, flatten + ≥2-digit numeric runs to ````, collapse whitespace, truncate to 200 + chars. """ s = first.strip() s = SESSION_META_RE.sub("", s) + # Strip any leading ERROR:/SEVERE:/WARN:/FATAL: that survived in the body + # — the bracketed level already feeds pattern_id separately, so leaving + # the body-prefix in place would fragment signatures across "body has + # SEVERE: prefix" vs "body has no prefix but bracketed level is SEVERE." + s = SEVERITY_PREFIX_STRIP_RE.sub("", s) s = DOUBLE_QUOTED_RE.sub('""', s) s = SINGLE_QUOTED_RE.sub("''", s) s = NUMERIC_RUN_RE.sub("", s) @@ -573,14 +595,22 @@ def normalize_first_line(first: str) -> str: def compute_pattern_id(level: str, first_line: str) -> str: - """``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``.""" + """``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``. + + 16 hex chars (64 bits) chosen for JSON readability vs collision-resistance + trade-off; consumers treat as opaque. + """ norm = normalize_first_line(first_line) h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest() return f"sha256:{h[:16]}" def compute_signature(pattern_id: str, mod_id: str) -> str: - """``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``.""" + """``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``. + + 16 hex chars (64 bits) chosen for JSON readability vs collision-resistance + trade-off; consumers treat as opaque. + """ h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest() return f"sha256:{h[:16]}" @@ -613,23 +643,31 @@ def _build_excerpt(entry: Entry, max_chars: int = 1000) -> str: def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]: - """Flatten body lines from prior entries (most recent first; up to - INFERRED_LOOKBACK_LINES total) for inferred attribution lookback. + """Collect body lines from prior entries whose ``line_start`` falls within + INFERRED_LOOKBACK_LINES raw-file-line distance from the current entry. + + Spec wording is "within the previous 40 lines", measured in raw file lines + (mirrors pzmm's ``(i - last_mod_line) <= 40``, inclusive of 40). Counting + raw lines means a multi-line entry (e.g., a 5-line Java stack trace) does + not shrink the practical window the way a body-line budget would. - We walk backwards through ``entries`` accumulating each entry's body lines - until we've gathered INFERRED_LOOKBACK_LINES lines or run out of entries. Returned list is in source order (oldest first) so callers can call ``reversed()`` on it. """ + if hit_idx <= 0: + return [] + threshold = entries[hit_idx].line_start - INFERRED_LOOKBACK_LINES + in_window: list[Entry] = [] + for j in range(hit_idx - 1, -1, -1): + prior = entries[j] + if prior.line_start < threshold: + break + in_window.append(prior) + # We accumulated newest-first; reverse so we emit in source order. + in_window.reverse() collected: list[str] = [] - i = hit_idx - 1 - while i >= 0 and len(collected) < INFERRED_LOOKBACK_LINES: - for line in reversed(entries[i].body): - collected.append(line) - if len(collected) >= INFERRED_LOOKBACK_LINES: - break - i -= 1 - collected.reverse() + for prior in in_window: + collected.extend(prior.body) return collected diff --git a/tools/pz-analyzer/tests/test_attribution.py b/tools/pz-analyzer/tests/test_attribution.py index 178f687..89e4732 100644 --- a/tools/pz-analyzer/tests/test_attribution.py +++ b/tools/pz-analyzer/tests/test_attribution.py @@ -91,5 +91,130 @@ class NeededByTests(unittest.TestCase): self.assertEqual(rec.mod_id, "testmodalpha") +def _make_marker_line(idx: int) -> str: + """Synthesise a single LOG-level entry containing a Lua((MOD:...)) marker.""" + # Vary timestamps so the bracketed prefix is unique-ish; not strictly + # required — they only feed Entry.timestamp, not parsing. + return ( + f"[16-04-26 00:00:{idx:02d}.000] LOG : General f:0, " + f"t:1776297642{idx:03d}, st:48,648,157,434> " + "Lua((MOD:Test Mod Alpha)) initialised." + ) + + +def _make_filler_line(idx: int) -> str: + """A plain LOG-level entry with no marker; one raw line.""" + return ( + f"[16-04-26 00:01:{idx % 60:02d}.000] LOG : General f:0, " + f"t:177629760{idx:04d}, st:48,648,200,178> filler entry {idx}." + ) + + +def _make_error_line() -> str: + """A Lua-shaped ERROR with no Lua((MOD:...)) marker on the entry itself + — so attribution must come from the lookback window if it comes at all.""" + return ( + "[16-04-26 00:02:00.000] ERROR: General f:0, " + "t:1776297900000, st:48,648,300,178> " + "LuaManager.GetFunctionObject> no such function: doStuff" + ) + + +class RawLineLookbackTests(unittest.TestCase): + """Phase 3 — lookback semantics measure raw file lines, not body-line + budgets. Multi-line entries inside the window must not shrink the + practical reach.""" + + def _write_fixture(self, name: str, lines: list[str]) -> pathlib.Path: + path = FIXTURE_DIR / name + path.write_text("\n".join(lines) + "\n") + return path + + def test_marker_exactly_at_lookback_boundary_attributes(self) -> None: + # Marker on line 1, ERROR on line 41 -> raw-line distance = 40 + # (inclusive of INFERRED_LOOKBACK_LINES=40 -> still attributed). + lines = [_make_marker_line(0)] + for i in range(1, 40): + lines.append(_make_filler_line(i)) + lines.append(_make_error_line()) # line 41 in the fixture + path = self._write_fixture("_rawline_at_boundary.txt", lines) + try: + entries = pz_parser.parse_file(path) + self.assertEqual(entries[0].line_start, 1) + self.assertEqual(entries[-1].line_start, 41) + records = pz_parser.classify_entries(entries, source_file="b1.txt") + self.assertEqual(len(records), 1) + self.assertEqual(records[0].attribution, "inferred") + self.assertEqual(records[0].mod_id, "testmodalpha") + finally: + path.unlink() + + def test_marker_one_line_past_boundary_does_not_attribute(self) -> None: + # Marker on line 1, ERROR on line 42 -> raw-line distance = 41 + # (just outside INFERRED_LOOKBACK_LINES -> unattributed). + lines = [_make_marker_line(0)] + for i in range(1, 41): + lines.append(_make_filler_line(i)) + lines.append(_make_error_line()) # line 42 in the fixture + path = self._write_fixture("_rawline_past_boundary.txt", lines) + try: + entries = pz_parser.parse_file(path) + self.assertEqual(entries[0].line_start, 1) + self.assertEqual(entries[-1].line_start, 42) + records = pz_parser.classify_entries(entries, source_file="b2.txt") + self.assertEqual(len(records), 1) + self.assertEqual(records[0].attribution, "unattributed") + self.assertEqual(records[0].mod_id, "__unattributed__") + finally: + path.unlink() + + def test_multiline_entry_does_not_shrink_practical_lookback(self) -> None: + # Layout the file so a multi-line entry sits between marker and ERROR. + # Under the OLD body-line-budget semantics the multi-line entry's 5 + # continuation lines would consume the budget and push the marker + # outside the window. Under raw-line semantics the marker on line 1 is + # still within 40 raw lines of the ERROR even though the file has a + # 6-line multi-line entry in between. + lines = [_make_marker_line(0)] # raw line 1: marker entry + # Single-line fillers on raw lines 2..30 (29 entries). + for i in range(1, 30): + lines.append(_make_filler_line(i)) + # Multi-line entry: header on raw line 31, 5 continuations on lines + # 32..36 (Java-stack-trace shape). + lines.append( + "[16-04-26 00:01:30.000] LOG : General f:0, " + "t:1776297930000, st:48,648,200,178> stack trace dump" + ) + for k in range(5): + lines.append(f"\tat zombie.SomeClass.method{k}(SomeClass.java:{k + 1})") + # Single-line fillers on raw lines 37..40 (4 entries). + for i in range(30, 34): + lines.append(_make_filler_line(i)) + # ERROR at raw line 41 -> N - 1 = 40 -> within window. + lines.append(_make_error_line()) + path = self._write_fixture("_rawline_multiline.txt", lines) + try: + entries = pz_parser.parse_file(path) + # Sanity-check the layout: first entry at line 1, multi-line entry + # sits at line 31 with 6 body lines (header + 5 continuations), + # ERROR at line 41. + self.assertEqual(entries[0].line_start, 1) + multi = next( + e for e in entries + if e.line_start == 31 and len(e.body) == 6 + ) + self.assertEqual(multi.line_end, 36) + self.assertEqual(entries[-1].line_start, 41) + records = pz_parser.classify_entries(entries, source_file="ml.txt") + self.assertEqual(len(records), 1) + # Under the OLD body-line-budget rule, the 5 stack-frame lines + # plus the surrounding fillers would have pushed the marker out + # of the budget. Under raw-line semantics it survives. + self.assertEqual(records[0].attribution, "inferred") + self.assertEqual(records[0].mod_id, "testmodalpha") + finally: + path.unlink() + + if __name__ == "__main__": unittest.main() diff --git a/tools/pz-analyzer/tests/test_signatures.py b/tools/pz-analyzer/tests/test_signatures.py index fbd0479..8cdd3ed 100644 --- a/tools/pz-analyzer/tests/test_signatures.py +++ b/tools/pz-analyzer/tests/test_signatures.py @@ -55,5 +55,37 @@ class SignatureUniquenessTests(unittest.TestCase): self.assertEqual(pat[:7], "sha256:") +class SeverityPrefixStripTests(unittest.TestCase): + """A body line that begins with a literal severity word (``SEVERE:``, + ``ERROR:``, ``WARN:``, ``FATAL:``) should not fragment pattern_id away + from the otherwise-identical body that lacks the prefix. The bracketed + level already feeds pattern_id; the prefix is redundant and varies in + practice.""" + + def test_pattern_id_invariant_under_body_prefix_severe(self) -> None: + # Same logical error: one line carries ``SEVERE: `` body prefix, the + # other doesn't. Both classified as SEVERE by their bracketed level. + with_prefix = pz_parser.compute_pattern_id( + "SEVERE", + "SEVERE: foo at zombie.X(File.java:42)", + ) + without_prefix = pz_parser.compute_pattern_id( + "SEVERE", + "foo at zombie.X(File.java:42)", + ) + self.assertEqual(with_prefix, without_prefix) + + def test_pattern_id_invariant_under_body_prefix_error(self) -> None: + with_prefix = pz_parser.compute_pattern_id( + "ERROR", + "ERROR: doStuff failed in module", + ) + without_prefix = pz_parser.compute_pattern_id( + "ERROR", + "doStuff failed in module", + ) + self.assertEqual(with_prefix, without_prefix) + + if __name__ == "__main__": unittest.main()