fix: address code review findings on pz_parser
- Strip body-prefix severity in normalize_first_line so pattern_id is stable across body-prefix vs bracketed-only variants. - Lookback for inferred attribution now counts raw file lines (per spec literal), not body-line budget across entries. - Document hash truncation (64-bit) and direct-attribution priority. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -162,6 +162,13 @@ DOUBLE_QUOTED_RE = re.compile(r'"[^"]*"')
|
|||||||
SINGLE_QUOTED_RE = re.compile(r"'[^']*'")
|
SINGLE_QUOTED_RE = re.compile(r"'[^']*'")
|
||||||
NUMERIC_RUN_RE = re.compile(r"\d{2,}")
|
NUMERIC_RUN_RE = re.compile(r"\d{2,}")
|
||||||
WS_RUN_RE = re.compile(r"\s+")
|
WS_RUN_RE = re.compile(r"\s+")
|
||||||
|
#: Strips a leading ``ERROR:`` / ``SEVERE:`` / ``WARN:`` / ``FATAL:`` token
|
||||||
|
#: from a body line so a body that happens to begin with the severity word
|
||||||
|
#: hashes to the same pattern_id as the bracketed-only variant. Matches the
|
||||||
|
#: token plus any colon and trailing whitespace; case-insensitive.
|
||||||
|
SEVERITY_PREFIX_STRIP_RE = re.compile(
|
||||||
|
r"^\s*(?:ERROR|SEVERE|WARN|FATAL)\s*[:\s]\s*", re.IGNORECASE
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Dataclasses — match the JSON keys the spec mandates so consumers can
|
# Dataclasses — match the JSON keys the spec mandates so consumers can
|
||||||
@@ -403,12 +410,20 @@ def _entry_text(entry: Entry) -> str:
|
|||||||
def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]:
|
def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]:
|
||||||
"""Determine ``(mod_id, mod_name, attribution, confidence, reason)``.
|
"""Determine ``(mod_id, mod_name, attribution, confidence, reason)``.
|
||||||
|
|
||||||
``prior_lookback_lines`` is the up-to-INFERRED_LOOKBACK_LINES raw file
|
``prior_lookback_lines`` is the body lines from prior entries that fall
|
||||||
lines preceding this entry (used to look up a recent ``Lua((MOD:Y))``
|
within INFERRED_LOOKBACK_LINES raw-file-line distance from this entry's
|
||||||
marker for inferred attribution).
|
start, in source order. The list is scanned in reverse for the nearest
|
||||||
|
``Lua((MOD:Y))`` marker when inferred attribution is being attempted.
|
||||||
|
|
||||||
|
Direct-attribution priority: Lua marker -> needed-by -> require-failed.
|
||||||
|
|
||||||
|
Rationale: ``needed by <mod>`` names the dependent mod (more semantically
|
||||||
|
targeted) and is preferred over ``require("...") failed`` which only names
|
||||||
|
the missing module path. ``Lua((MOD:...))`` is unambiguous and wins
|
||||||
|
outright.
|
||||||
"""
|
"""
|
||||||
text = _entry_text(entry)
|
text = _entry_text(entry)
|
||||||
# 1. Direct via Lua((MOD:X))
|
# 1. Direct via Lua((MOD:X)) — unambiguous; outranks every other signal.
|
||||||
m = LUA_MOD_MARKER_RE.search(text)
|
m = LUA_MOD_MARKER_RE.search(text)
|
||||||
if m:
|
if m:
|
||||||
raw = m.group(1).strip()
|
raw = m.group(1).strip()
|
||||||
@@ -559,12 +574,19 @@ def detect_kind(entry: Entry, attribution: str, body_text: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def normalize_first_line(first: str) -> str:
|
def normalize_first_line(first: str) -> str:
|
||||||
"""Per spec: strip session metadata prefix, flatten quoted strings to
|
"""Per spec: strip session metadata prefix, strip any leading severity
|
||||||
``"<S>"`` / ``'<S>'``, flatten ≥2-digit numeric runs to ``<N>``, collapse
|
word (so ``SEVERE: foo`` and ``foo`` produce the same pattern_id when both
|
||||||
whitespace, truncate to 200 chars.
|
are SEVERE-level), flatten quoted strings to ``"<S>"`` / ``'<S>'``, flatten
|
||||||
|
≥2-digit numeric runs to ``<N>``, collapse whitespace, truncate to 200
|
||||||
|
chars.
|
||||||
"""
|
"""
|
||||||
s = first.strip()
|
s = first.strip()
|
||||||
s = SESSION_META_RE.sub("", s)
|
s = SESSION_META_RE.sub("", s)
|
||||||
|
# Strip any leading ERROR:/SEVERE:/WARN:/FATAL: that survived in the body
|
||||||
|
# — the bracketed level already feeds pattern_id separately, so leaving
|
||||||
|
# the body-prefix in place would fragment signatures across "body has
|
||||||
|
# SEVERE: prefix" vs "body has no prefix but bracketed level is SEVERE."
|
||||||
|
s = SEVERITY_PREFIX_STRIP_RE.sub("", s)
|
||||||
s = DOUBLE_QUOTED_RE.sub('"<S>"', s)
|
s = DOUBLE_QUOTED_RE.sub('"<S>"', s)
|
||||||
s = SINGLE_QUOTED_RE.sub("'<S>'", s)
|
s = SINGLE_QUOTED_RE.sub("'<S>'", s)
|
||||||
s = NUMERIC_RUN_RE.sub("<N>", s)
|
s = NUMERIC_RUN_RE.sub("<N>", s)
|
||||||
@@ -573,14 +595,22 @@ def normalize_first_line(first: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def compute_pattern_id(level: str, first_line: str) -> str:
|
def compute_pattern_id(level: str, first_line: str) -> str:
|
||||||
"""``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``."""
|
"""``sha256(level + normalized_first_line)[:16]``, prefixed ``sha256:``.
|
||||||
|
|
||||||
|
16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
|
||||||
|
trade-off; consumers treat as opaque.
|
||||||
|
"""
|
||||||
norm = normalize_first_line(first_line)
|
norm = normalize_first_line(first_line)
|
||||||
h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest()
|
h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest()
|
||||||
return f"sha256:{h[:16]}"
|
return f"sha256:{h[:16]}"
|
||||||
|
|
||||||
|
|
||||||
def compute_signature(pattern_id: str, mod_id: str) -> str:
|
def compute_signature(pattern_id: str, mod_id: str) -> str:
|
||||||
"""``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``."""
|
"""``sha256(pattern_id + mod_id)[:16]``, prefixed ``sha256:``.
|
||||||
|
|
||||||
|
16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
|
||||||
|
trade-off; consumers treat as opaque.
|
||||||
|
"""
|
||||||
h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest()
|
h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest()
|
||||||
return f"sha256:{h[:16]}"
|
return f"sha256:{h[:16]}"
|
||||||
|
|
||||||
@@ -613,23 +643,31 @@ def _build_excerpt(entry: Entry, max_chars: int = 1000) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]:
|
def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]:
|
||||||
"""Flatten body lines from prior entries (most recent first; up to
|
"""Collect body lines from prior entries whose ``line_start`` falls within
|
||||||
INFERRED_LOOKBACK_LINES total) for inferred attribution lookback.
|
INFERRED_LOOKBACK_LINES raw-file-line distance from the current entry.
|
||||||
|
|
||||||
|
Spec wording is "within the previous 40 lines", measured in raw file lines
|
||||||
|
(mirrors pzmm's ``(i - last_mod_line) <= 40``, inclusive of 40). Counting
|
||||||
|
raw lines means a multi-line entry (e.g., a 5-line Java stack trace) does
|
||||||
|
not shrink the practical window the way a body-line budget would.
|
||||||
|
|
||||||
We walk backwards through ``entries`` accumulating each entry's body lines
|
|
||||||
until we've gathered INFERRED_LOOKBACK_LINES lines or run out of entries.
|
|
||||||
Returned list is in source order (oldest first) so callers can call
|
Returned list is in source order (oldest first) so callers can call
|
||||||
``reversed()`` on it.
|
``reversed()`` on it.
|
||||||
"""
|
"""
|
||||||
|
if hit_idx <= 0:
|
||||||
|
return []
|
||||||
|
threshold = entries[hit_idx].line_start - INFERRED_LOOKBACK_LINES
|
||||||
|
in_window: list[Entry] = []
|
||||||
|
for j in range(hit_idx - 1, -1, -1):
|
||||||
|
prior = entries[j]
|
||||||
|
if prior.line_start < threshold:
|
||||||
|
break
|
||||||
|
in_window.append(prior)
|
||||||
|
# We accumulated newest-first; reverse so we emit in source order.
|
||||||
|
in_window.reverse()
|
||||||
collected: list[str] = []
|
collected: list[str] = []
|
||||||
i = hit_idx - 1
|
for prior in in_window:
|
||||||
while i >= 0 and len(collected) < INFERRED_LOOKBACK_LINES:
|
collected.extend(prior.body)
|
||||||
for line in reversed(entries[i].body):
|
|
||||||
collected.append(line)
|
|
||||||
if len(collected) >= INFERRED_LOOKBACK_LINES:
|
|
||||||
break
|
|
||||||
i -= 1
|
|
||||||
collected.reverse()
|
|
||||||
return collected
|
return collected
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -91,5 +91,130 @@ class NeededByTests(unittest.TestCase):
|
|||||||
self.assertEqual(rec.mod_id, "testmodalpha")
|
self.assertEqual(rec.mod_id, "testmodalpha")
|
||||||
|
|
||||||
|
|
||||||
|
def _make_marker_line(idx: int) -> str:
|
||||||
|
"""Synthesise a single LOG-level entry containing a Lua((MOD:...)) marker."""
|
||||||
|
# Vary timestamps so the bracketed prefix is unique-ish; not strictly
|
||||||
|
# required — they only feed Entry.timestamp, not parsing.
|
||||||
|
return (
|
||||||
|
f"[16-04-26 00:00:{idx:02d}.000] LOG : General f:0, "
|
||||||
|
f"t:1776297642{idx:03d}, st:48,648,157,434> "
|
||||||
|
"Lua((MOD:Test Mod Alpha)) initialised."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_filler_line(idx: int) -> str:
|
||||||
|
"""A plain LOG-level entry with no marker; one raw line."""
|
||||||
|
return (
|
||||||
|
f"[16-04-26 00:01:{idx % 60:02d}.000] LOG : General f:0, "
|
||||||
|
f"t:177629760{idx:04d}, st:48,648,200,178> filler entry {idx}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_error_line() -> str:
|
||||||
|
"""A Lua-shaped ERROR with no Lua((MOD:...)) marker on the entry itself
|
||||||
|
— so attribution must come from the lookback window if it comes at all."""
|
||||||
|
return (
|
||||||
|
"[16-04-26 00:02:00.000] ERROR: General f:0, "
|
||||||
|
"t:1776297900000, st:48,648,300,178> "
|
||||||
|
"LuaManager.GetFunctionObject> no such function: doStuff"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class RawLineLookbackTests(unittest.TestCase):
|
||||||
|
"""Phase 3 — lookback semantics measure raw file lines, not body-line
|
||||||
|
budgets. Multi-line entries inside the window must not shrink the
|
||||||
|
practical reach."""
|
||||||
|
|
||||||
|
def _write_fixture(self, name: str, lines: list[str]) -> pathlib.Path:
|
||||||
|
path = FIXTURE_DIR / name
|
||||||
|
path.write_text("\n".join(lines) + "\n")
|
||||||
|
return path
|
||||||
|
|
||||||
|
def test_marker_exactly_at_lookback_boundary_attributes(self) -> None:
|
||||||
|
# Marker on line 1, ERROR on line 41 -> raw-line distance = 40
|
||||||
|
# (inclusive of INFERRED_LOOKBACK_LINES=40 -> still attributed).
|
||||||
|
lines = [_make_marker_line(0)]
|
||||||
|
for i in range(1, 40):
|
||||||
|
lines.append(_make_filler_line(i))
|
||||||
|
lines.append(_make_error_line()) # line 41 in the fixture
|
||||||
|
path = self._write_fixture("_rawline_at_boundary.txt", lines)
|
||||||
|
try:
|
||||||
|
entries = pz_parser.parse_file(path)
|
||||||
|
self.assertEqual(entries[0].line_start, 1)
|
||||||
|
self.assertEqual(entries[-1].line_start, 41)
|
||||||
|
records = pz_parser.classify_entries(entries, source_file="b1.txt")
|
||||||
|
self.assertEqual(len(records), 1)
|
||||||
|
self.assertEqual(records[0].attribution, "inferred")
|
||||||
|
self.assertEqual(records[0].mod_id, "testmodalpha")
|
||||||
|
finally:
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_marker_one_line_past_boundary_does_not_attribute(self) -> None:
|
||||||
|
# Marker on line 1, ERROR on line 42 -> raw-line distance = 41
|
||||||
|
# (just outside INFERRED_LOOKBACK_LINES -> unattributed).
|
||||||
|
lines = [_make_marker_line(0)]
|
||||||
|
for i in range(1, 41):
|
||||||
|
lines.append(_make_filler_line(i))
|
||||||
|
lines.append(_make_error_line()) # line 42 in the fixture
|
||||||
|
path = self._write_fixture("_rawline_past_boundary.txt", lines)
|
||||||
|
try:
|
||||||
|
entries = pz_parser.parse_file(path)
|
||||||
|
self.assertEqual(entries[0].line_start, 1)
|
||||||
|
self.assertEqual(entries[-1].line_start, 42)
|
||||||
|
records = pz_parser.classify_entries(entries, source_file="b2.txt")
|
||||||
|
self.assertEqual(len(records), 1)
|
||||||
|
self.assertEqual(records[0].attribution, "unattributed")
|
||||||
|
self.assertEqual(records[0].mod_id, "__unattributed__")
|
||||||
|
finally:
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
def test_multiline_entry_does_not_shrink_practical_lookback(self) -> None:
|
||||||
|
# Layout the file so a multi-line entry sits between marker and ERROR.
|
||||||
|
# Under the OLD body-line-budget semantics the multi-line entry's 5
|
||||||
|
# continuation lines would consume the budget and push the marker
|
||||||
|
# outside the window. Under raw-line semantics the marker on line 1 is
|
||||||
|
# still within 40 raw lines of the ERROR even though the file has a
|
||||||
|
# 6-line multi-line entry in between.
|
||||||
|
lines = [_make_marker_line(0)] # raw line 1: marker entry
|
||||||
|
# Single-line fillers on raw lines 2..30 (29 entries).
|
||||||
|
for i in range(1, 30):
|
||||||
|
lines.append(_make_filler_line(i))
|
||||||
|
# Multi-line entry: header on raw line 31, 5 continuations on lines
|
||||||
|
# 32..36 (Java-stack-trace shape).
|
||||||
|
lines.append(
|
||||||
|
"[16-04-26 00:01:30.000] LOG : General f:0, "
|
||||||
|
"t:1776297930000, st:48,648,200,178> stack trace dump"
|
||||||
|
)
|
||||||
|
for k in range(5):
|
||||||
|
lines.append(f"\tat zombie.SomeClass.method{k}(SomeClass.java:{k + 1})")
|
||||||
|
# Single-line fillers on raw lines 37..40 (4 entries).
|
||||||
|
for i in range(30, 34):
|
||||||
|
lines.append(_make_filler_line(i))
|
||||||
|
# ERROR at raw line 41 -> N - 1 = 40 -> within window.
|
||||||
|
lines.append(_make_error_line())
|
||||||
|
path = self._write_fixture("_rawline_multiline.txt", lines)
|
||||||
|
try:
|
||||||
|
entries = pz_parser.parse_file(path)
|
||||||
|
# Sanity-check the layout: first entry at line 1, multi-line entry
|
||||||
|
# sits at line 31 with 6 body lines (header + 5 continuations),
|
||||||
|
# ERROR at line 41.
|
||||||
|
self.assertEqual(entries[0].line_start, 1)
|
||||||
|
multi = next(
|
||||||
|
e for e in entries
|
||||||
|
if e.line_start == 31 and len(e.body) == 6
|
||||||
|
)
|
||||||
|
self.assertEqual(multi.line_end, 36)
|
||||||
|
self.assertEqual(entries[-1].line_start, 41)
|
||||||
|
records = pz_parser.classify_entries(entries, source_file="ml.txt")
|
||||||
|
self.assertEqual(len(records), 1)
|
||||||
|
# Under the OLD body-line-budget rule, the 5 stack-frame lines
|
||||||
|
# plus the surrounding fillers would have pushed the marker out
|
||||||
|
# of the budget. Under raw-line semantics it survives.
|
||||||
|
self.assertEqual(records[0].attribution, "inferred")
|
||||||
|
self.assertEqual(records[0].mod_id, "testmodalpha")
|
||||||
|
finally:
|
||||||
|
path.unlink()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -55,5 +55,37 @@ class SignatureUniquenessTests(unittest.TestCase):
|
|||||||
self.assertEqual(pat[:7], "sha256:")
|
self.assertEqual(pat[:7], "sha256:")
|
||||||
|
|
||||||
|
|
||||||
|
class SeverityPrefixStripTests(unittest.TestCase):
|
||||||
|
"""A body line that begins with a literal severity word (``SEVERE:``,
|
||||||
|
``ERROR:``, ``WARN:``, ``FATAL:``) should not fragment pattern_id away
|
||||||
|
from the otherwise-identical body that lacks the prefix. The bracketed
|
||||||
|
level already feeds pattern_id; the prefix is redundant and varies in
|
||||||
|
practice."""
|
||||||
|
|
||||||
|
def test_pattern_id_invariant_under_body_prefix_severe(self) -> None:
|
||||||
|
# Same logical error: one line carries ``SEVERE: `` body prefix, the
|
||||||
|
# other doesn't. Both classified as SEVERE by their bracketed level.
|
||||||
|
with_prefix = pz_parser.compute_pattern_id(
|
||||||
|
"SEVERE",
|
||||||
|
"SEVERE: foo at zombie.X(File.java:42)",
|
||||||
|
)
|
||||||
|
without_prefix = pz_parser.compute_pattern_id(
|
||||||
|
"SEVERE",
|
||||||
|
"foo at zombie.X(File.java:42)",
|
||||||
|
)
|
||||||
|
self.assertEqual(with_prefix, without_prefix)
|
||||||
|
|
||||||
|
def test_pattern_id_invariant_under_body_prefix_error(self) -> None:
|
||||||
|
with_prefix = pz_parser.compute_pattern_id(
|
||||||
|
"ERROR",
|
||||||
|
"ERROR: doStuff failed in module",
|
||||||
|
)
|
||||||
|
without_prefix = pz_parser.compute_pattern_id(
|
||||||
|
"ERROR",
|
||||||
|
"doStuff failed in module",
|
||||||
|
)
|
||||||
|
self.assertEqual(with_prefix, without_prefix)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user