fix: address code review findings on pz_parser

- Strip body-prefix severity in normalize_first_line so pattern_id is stable
  across body-prefix vs bracketed-only variants.
- Lookback for inferred attribution now counts raw file lines (per spec
  literal), not body-line budget across entries.
- Document hash truncation (64-bit) and direct-attribution priority.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 15:33:56 +00:00
parent 4fec3a58f6
commit 2e7bebc911
3 changed files with 216 additions and 21 deletions
--- a/tools/pz-analyzer/tests/test_attribution.py
+++ b/tools/pz-analyzer/tests/test_attribution.py
@@ -91,5 +91,130 @@ class NeededByTests(unittest.TestCase):
        self.assertEqual(rec.mod_id, "testmodalpha")


+def _make_marker_line(idx: int) -> str:
+    """Synthesise a single LOG-level entry containing a Lua((MOD:...)) marker."""
+    # Vary timestamps so the bracketed prefix is unique-ish; not strictly
+    # required — they only feed Entry.timestamp, not parsing.
+    return (
+        f"[16-04-26 00:00:{idx:02d}.000] LOG  : General      f:0, "
+        f"t:1776297642{idx:03d}, st:48,648,157,434> "
+        "Lua((MOD:Test Mod Alpha)) initialised."
+    )
+
+
+def _make_filler_line(idx: int) -> str:
+    """A plain LOG-level entry with no marker; one raw line."""
+    return (
+        f"[16-04-26 00:01:{idx % 60:02d}.000] LOG  : General      f:0, "
+        f"t:177629760{idx:04d}, st:48,648,200,178> filler entry {idx}."
+    )
+
+
+def _make_error_line() -> str:
+    """A Lua-shaped ERROR with no Lua((MOD:...)) marker on the entry itself
+    — so attribution must come from the lookback window if it comes at all."""
+    return (
+        "[16-04-26 00:02:00.000] ERROR: General      f:0, "
+        "t:1776297900000, st:48,648,300,178> "
+        "LuaManager.GetFunctionObject> no such function: doStuff"
+    )
+
+
+class RawLineLookbackTests(unittest.TestCase):
+    """Phase 3 — lookback semantics measure raw file lines, not body-line
+    budgets. Multi-line entries inside the window must not shrink the
+    practical reach."""
+
+    def _write_fixture(self, name: str, lines: list[str]) -> pathlib.Path:
+        path = FIXTURE_DIR / name
+        path.write_text("\n".join(lines) + "\n")
+        return path
+
+    def test_marker_exactly_at_lookback_boundary_attributes(self) -> None:
+        # Marker on line 1, ERROR on line 41 -> raw-line distance = 40,
+        # exactly INFERRED_LOOKBACK_LINES (40); boundary is inclusive -> attributed.
+        lines = [_make_marker_line(0)]
+        for i in range(1, 40):
+            lines.append(_make_filler_line(i))
+        lines.append(_make_error_line())  # line 41 in the fixture
+        path = self._write_fixture("_rawline_at_boundary.txt", lines)
+        try:
+            entries = pz_parser.parse_file(path)
+            self.assertEqual(entries[0].line_start, 1)
+            self.assertEqual(entries[-1].line_start, 41)
+            records = pz_parser.classify_entries(entries, source_file="b1.txt")
+            self.assertEqual(len(records), 1)
+            self.assertEqual(records[0].attribution, "inferred")
+            self.assertEqual(records[0].mod_id, "testmodalpha")
+        finally:
+            path.unlink()
+
+    def test_marker_one_line_past_boundary_does_not_attribute(self) -> None:
+        # Marker on line 1, ERROR on line 42 -> raw-line distance = 41
+        # (just outside INFERRED_LOOKBACK_LINES -> unattributed).
+        lines = [_make_marker_line(0)]
+        for i in range(1, 41):
+            lines.append(_make_filler_line(i))
+        lines.append(_make_error_line())  # line 42 in the fixture
+        path = self._write_fixture("_rawline_past_boundary.txt", lines)
+        try:
+            entries = pz_parser.parse_file(path)
+            self.assertEqual(entries[0].line_start, 1)
+            self.assertEqual(entries[-1].line_start, 42)
+            records = pz_parser.classify_entries(entries, source_file="b2.txt")
+            self.assertEqual(len(records), 1)
+            self.assertEqual(records[0].attribution, "unattributed")
+            self.assertEqual(records[0].mod_id, "__unattributed__")
+        finally:
+            path.unlink()
+
+    def test_multiline_entry_does_not_shrink_practical_lookback(self) -> None:
+        # Lay out the file so a multi-line entry sits between marker and ERROR.
+        # Under the OLD body-line-budget semantics the multi-line entry's 5
+        # continuation lines would consume the budget and push the marker
+        # outside the window. Under raw-line semantics the marker on line 1 is
+        # still within 40 raw lines of the ERROR even though the file has a
+        # 6-line multi-line entry in between.
+        lines = [_make_marker_line(0)]            # raw line 1: marker entry
+        # Single-line fillers on raw lines 2..30 (29 entries).
+        for i in range(1, 30):
+            lines.append(_make_filler_line(i))
+        # Multi-line entry: header on raw line 31, 5 continuations on lines
+        # 32..36 (Java-stack-trace shape).
+        lines.append(
+            "[16-04-26 00:01:30.000] LOG  : General      f:0, "
+            "t:1776297930000, st:48,648,200,178> stack trace dump"
+        )
+        for k in range(5):
+            lines.append(f"\tat zombie.SomeClass.method{k}(SomeClass.java:{k + 1})")
+        # Single-line fillers on raw lines 37..40 (4 entries).
+        for i in range(30, 34):
+            lines.append(_make_filler_line(i))
+        # ERROR at raw line 41 -> distance from marker = 41 - 1 = 40 -> within window.
+        lines.append(_make_error_line())
+        path = self._write_fixture("_rawline_multiline.txt", lines)
+        try:
+            entries = pz_parser.parse_file(path)
+            # Sanity-check the layout: first entry at line 1, multi-line entry
+            # sits at line 31 with 6 body lines (header + 5 continuations),
+            # ERROR at line 41.
+            self.assertEqual(entries[0].line_start, 1)
+            multi = next(
+                e for e in entries
+                if e.line_start == 31 and len(e.body) == 6
+            )
+            self.assertEqual(multi.line_end, 36)
+            self.assertEqual(entries[-1].line_start, 41)
+            records = pz_parser.classify_entries(entries, source_file="ml.txt")
+            self.assertEqual(len(records), 1)
+            # Under the OLD body-line-budget rule, the 5 stack-frame lines
+            # plus the surrounding fillers would have pushed the marker out
+            # of the budget. Under raw-line semantics it survives.
+            self.assertEqual(records[0].attribution, "inferred")
+            self.assertEqual(records[0].mod_id, "testmodalpha")
+        finally:
+            path.unlink()
+
+
 if __name__ == "__main__":
    unittest.main()