fix: address code review findings on pz_parser

- Strip body-prefix severity in normalize_first_line so pattern_id
  is stable across body-prefix vs bracketed-only variants.
- Lookback for inferred attribution now measures distance in raw file
  lines (the literal reading of the spec), not a body-line budget across entries.
- Document hash truncation (64-bit) and direct-attribution priority.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-04 15:33:56 +00:00
parent 4fec3a58f6
commit 2e7bebc911
3 changed files with 216 additions and 21 deletions

View File

@@ -162,6 +162,13 @@ DOUBLE_QUOTED_RE = re.compile(r'"[^"]*"')
#: Matches a single-quoted run that contains no embedded single quote.
SINGLE_QUOTED_RE = re.compile(r"'[^']*'")
#: Matches a run of two or more consecutive digits.
NUMERIC_RUN_RE = re.compile(r"\d{2,}")
#: Matches a run of one or more whitespace characters.
WS_RUN_RE = re.compile(r"\s+")
#: Strips one leading severity token (``ERROR`` / ``SEVERE`` / ``WARN`` /
#: ``FATAL``) from a body line so a body that happens to begin with the
#: severity word hashes to the same pattern_id as the bracketed-only variant.
#: Matches optional leading whitespace, the token, and a mandatory separator
#: that is either a colon or whitespace, plus any whitespace around it. The
#: colon itself is optional, so ``Error loading x`` is stripped as well as
#: ``ERROR: loading x``; a longer word such as ``WARNING:`` is left alone
#: because no separator follows ``WARN``. Case-insensitive; anchored at the
#: start of the string, so at most one prefix is removed.
SEVERITY_PREFIX_STRIP_RE = re.compile(
r"^\s*(?:ERROR|SEVERE|WARN|FATAL)\s*[:\s]\s*", re.IGNORECASE
)
# ---------------------------------------------------------------------------
# Dataclasses — match the JSON keys the spec mandates so consumers can
@@ -403,12 +410,20 @@ def _entry_text(entry: Entry) -> str:
def attribute_entry(entry: Entry, prior_lookback_lines: list[str]) -> tuple[str, str, str, str, str]:
"""Determine ``(mod_id, mod_name, attribution, confidence, reason)``.
``prior_lookback_lines`` is the up-to-INFERRED_LOOKBACK_LINES raw file
lines preceding this entry (used to look up a recent ``Lua((MOD:Y))``
marker for inferred attribution).
``prior_lookback_lines`` is the body lines from prior entries that fall
within INFERRED_LOOKBACK_LINES raw-file-line distance from this entry's
start, in source order. The list is scanned in reverse for the nearest
``Lua((MOD:Y))`` marker when inferred attribution is being attempted.
Direct-attribution priority: Lua marker -> needed-by -> require-failed.
Rationale: ``needed by <mod>`` names the dependent mod (more semantically
targeted) and is preferred over ``require("...") failed`` which only names
the missing module path. ``Lua((MOD:...))`` is unambiguous and wins
outright.
"""
text = _entry_text(entry)
# 1. Direct via Lua((MOD:X))
# 1. Direct via Lua((MOD:X)) — unambiguous; outranks every other signal.
m = LUA_MOD_MARKER_RE.search(text)
if m:
raw = m.group(1).strip()
@@ -559,12 +574,19 @@ def detect_kind(entry: Entry, attribution: str, body_text: str) -> str:
def normalize_first_line(first: str) -> str:
"""Per spec: strip session metadata prefix, flatten quoted strings to
``"<S>"`` / ``'<S>'``, flatten ≥2-digit numeric runs to ``<N>``, collapse
whitespace, truncate to 200 chars.
"""Per spec: strip session metadata prefix, strip any leading severity
word (so ``SEVERE: foo`` and ``foo`` produce the same pattern_id when both
are SEVERE-level), flatten quoted strings to ``"<S>"`` / ``'<S>'``, flatten
≥2-digit numeric runs to ``<N>``, collapse whitespace, truncate to 200
chars.
"""
s = first.strip()
s = SESSION_META_RE.sub("", s)
# Strip any leading ERROR:/SEVERE:/WARN:/FATAL: that survived in the body
# — the bracketed level already feeds pattern_id separately, so leaving
# the body-prefix in place would fragment signatures across "body has
# SEVERE: prefix" vs "body has no prefix but bracketed level is SEVERE."
s = SEVERITY_PREFIX_STRIP_RE.sub("", s)
s = DOUBLE_QUOTED_RE.sub('"<S>"', s)
s = SINGLE_QUOTED_RE.sub("'<S>'", s)
s = NUMERIC_RUN_RE.sub("<N>", s)
@@ -573,14 +595,22 @@ def normalize_first_line(first: str) -> str:
def compute_pattern_id(level: str, first_line: str) -> str:
    """Return the pattern identifier for an entry's level and first line.

    ``first_line`` is passed through ``normalize_first_line``; the level and
    the normalized line are then joined with a single newline, UTF-8 encoded
    and hashed with SHA-256. The result is the first 16 hex characters of the
    digest, prefixed with ``sha256:``.

    16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
    trade-off; consumers treat as opaque.

    Args:
        level: Severity level of the entry; hashed verbatim.
        first_line: Raw first line of the entry body; normalized before
            hashing.

    Returns:
        A string of the form ``sha256:<16 hex chars>``.
    """
    norm = normalize_first_line(first_line)
    # The newline keeps the two fields separated inside the hashed payload.
    h = hashlib.sha256(f"{level}\n{norm}".encode("utf-8")).hexdigest()
    return f"sha256:{h[:16]}"
def compute_signature(pattern_id: str, mod_id: str) -> str:
    """Return the signature that combines a pattern id with a mod id.

    ``pattern_id`` and ``mod_id`` are joined with a single newline, UTF-8
    encoded and hashed with SHA-256. The result is the first 16 hex
    characters of the digest, prefixed with ``sha256:``.

    16 hex chars (64 bits) chosen for JSON readability vs collision-resistance
    trade-off; consumers treat as opaque.

    Args:
        pattern_id: Pattern identifier; hashed verbatim.
        mod_id: Mod identifier; hashed verbatim (may be empty).

    Returns:
        A string of the form ``sha256:<16 hex chars>``.
    """
    # The newline separator means ("a", "b") and ("ab", "") hash differently.
    h = hashlib.sha256(f"{pattern_id}\n{mod_id}".encode("utf-8")).hexdigest()
    return f"sha256:{h[:16]}"
@@ -613,23 +643,31 @@ def _build_excerpt(entry: Entry, max_chars: int = 1000) -> str:
def _build_lookback_window(entries: list[Entry], hit_idx: int) -> list[str]:
    """Collect body lines from prior entries whose ``line_start`` falls within
    INFERRED_LOOKBACK_LINES raw-file-line distance from the current entry.

    Spec wording is "within the previous 40 lines", measured in raw file lines
    (mirrors pzmm's ``(i - last_mod_line) <= 40``, inclusive of 40). Counting
    raw lines means a multi-line entry (e.g., a 5-line Java stack trace) does
    not shrink the practical window the way a body-line budget would.

    The scan walks backwards from the entry just before ``hit_idx`` and stops
    at the first prior entry that starts before the threshold; entries earlier
    than that one are not examined.
    NOTE(review): the early stop presumes ``entries`` is ordered by ascending
    ``line_start`` — confirm against the caller.

    Args:
        entries: Parsed entries; each exposes ``line_start`` and ``body``.
        hit_idx: Index in ``entries`` of the entry being attributed.

    Returns:
        Every body line of each in-window prior entry, in source order
        (oldest first) so callers can call ``reversed()`` on it. Empty when
        ``hit_idx`` is 0 or negative.
    """
    if hit_idx <= 0:
        return []
    # An entry is in the window when it starts at or after this raw line.
    threshold = entries[hit_idx].line_start - INFERRED_LOOKBACK_LINES
    in_window: list[Entry] = []
    for j in range(hit_idx - 1, -1, -1):
        prior = entries[j]
        if prior.line_start < threshold:
            break
        in_window.append(prior)
    # We accumulated newest-first; reverse so we emit in source order.
    in_window.reverse()
    collected: list[str] = []
    for prior in in_window:
        collected.extend(prior.body)
    return collected