feat: pzmm conflict detection + content-type categorization

- mod_files manifest table populated at parse time
- POST /api/conflicts endpoint
- mod_types fingerprinting feeds derive_category
- DD filelist regex broadened to cover conflict-eligible exts
- media/maps/<*>/* excluded from manifest (per-mod namespaced, no conflict value, can be tens of MB per mod)

Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md
2026-05-04 15:22:35 +00:00
parent a15d35214e
commit b73325882e
9 changed files with 936 additions and 18 deletions
--- a/api/diagnostics.py
+++ b/api/diagnostics.py
@@ -0,0 +1,93 @@
+"""File-level conflict detection from cached manifests.
+
+Port of pzmm core/scanner.py:scan_file_conflicts adapted to read from the
+mod_files table (populated by worker.build_manifest_and_types) instead of
+walking on-disk media trees. See docs/plans/2026-05-04-pzmm-conflict-and-typing.md.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+from mlos_sort import ModInfo
+
+
+@dataclass
+class FileConflict:  # one rel_path that ≥2 mods provide with differing sha1
+    rel_path: str          # path exactly as read from mod_files.rel_path
+    providers: List[str]   # de-duplicated mod_ids, in input load order
+    winner: str            # mod_id loaded last, i.e. providers[-1]
+
+
+_FETCH_MANIFEST = """
+WITH inputs AS (
+    SELECT unnest($1::text[]) AS workshop_id,
+           unnest($2::text[]) AS mod_id
+)
+SELECT mf.workshop_id, mf.mod_id, mf.rel_path, mf.sha1
+  FROM mod_files mf
+  JOIN inputs i
+    ON mf.workshop_id = i.workshop_id
+   AND mf.mod_id      = i.mod_id
+"""  # $1/$2: parallel arrays of workshop_id / mod_id, one pair per mod
+
+
+async def scan_file_conflicts(conn, mods: List[ModInfo]) -> List[FileConflict]:
+    """Find rel_paths that two or more of *mods* ship with differing sha1.
+
+    A mod's position in *mods* is taken as its load order; for each
+    conflicting path the provider loaded last is recorded as the winner.
+    The returned list is sorted by rel_path.
+
+    Mods without manifest rows (`files_manifest_built=false`) add nothing
+    to the scan and are not reported here; the caller is responsible for
+    surfacing them as `missing_manifests` in any user-facing payload.
+    """
+    if len(mods) < 2:
+        return []
+
+    # Single query for the manifest rows of every (workshop_id, mod_id)
+    # pair. A falsy workshop_id is sent as the empty string.
+    rows = await conn.fetch(
+        _FETCH_MANIFEST,
+        [m.workshop_id or "" for m in mods],
+        [m.id for m in mods],
+    )
+
+    # mod_id → position in the input list (input order = load order,
+    # mirroring pzmm). A repeated mod_id keeps its last position.
+    position: Dict[str, int] = {}
+    for pos, mod in enumerate(mods):
+        position[mod.id] = pos
+
+    # rel_path → every (load position, mod_id, sha1) row claiming it
+    claims: Dict[str, List[Tuple[int, str, str]]] = defaultdict(list)
+    for row in rows:
+        owner = row["mod_id"]
+        pos = position.get(owner)
+        if pos is not None:
+            claims[row["rel_path"]].append((pos, owner, row["sha1"]))
+
+    found: List[FileConflict] = []
+    for rel_path, entries in claims.items():
+        owners = {entry[1] for entry in entries}
+        digests = {entry[2] for entry in entries}
+        # A conflict needs ≥2 distinct providers AND ≥2 distinct sha1s.
+        # Providers that all ship byte-identical content (same sha1) are
+        # duplicates, not a conflict — pzmm scanner.py:55–66.
+        if len(owners) < 2 or len(digests) < 2:
+            continue
+        # Arrange entries by load position (stable sort), then collapse
+        # repeated mod_ids keeping the first occurrence: a mod could ship
+        # the same rel_path under both B41 and B42 layouts → seen twice.
+        by_load = sorted(entries, key=lambda entry: entry[0])
+        providers = list(dict.fromkeys(entry[1] for entry in by_load))
+        found.append(FileConflict(
+            rel_path=rel_path,
+            providers=providers,
+            winner=providers[-1],  # last loaded provider wins
+        ))
+
+    return sorted(found, key=lambda conflict: conflict.rel_path)