feat: pzmm conflict detection + content-type categorization

- mod_files manifest table populated at parse time
- POST /api/conflicts endpoint
- mod_types fingerprinting feeds derive_category
- DD filelist regex broadened to cover conflict-eligible exts
- media/maps/<*>/* excluded from manifest (per-mod namespaced,
  no conflict value, can be tens of MB per mod)

Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md
This commit is contained in:
2026-05-04 15:22:35 +00:00
parent a15d35214e
commit b73325882e
9 changed files with 936 additions and 18 deletions

View File

@@ -130,6 +130,11 @@ class ModInfo:
# signal for build / multiplayer / category detection. Distinct from
# `tags` which is mod.info-side (freeform).
workshop_tags: List[str] = field(default_factory=list)
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
# populated by worker.build_manifest_and_types at parse time. Empty when
# files_manifest_built=false (older cached rows); derive_category falls
# through to the existing cascade in that case.
mod_types: List[str] = field(default_factory=list)
warnings: Dict[str, List[str]] = field(default_factory=dict)
@@ -347,8 +352,15 @@ def load_mods_from_dir(root: Path) -> List[ModInfo]:
_PATCH_NAME_RE = re.compile(r"\b(patch|compat|compatibility)\b", re.IGNORECASE)
# Substring-based category hints (kept in sync with api/mlos_sort.py)
_LIB_NAME_HINTS = ["library", "libraries", "framework"]
# Substring lists used for derive_category name heuristics. Plain substring
# matching (vs. \b regex) survives PZ's mishmash of camelCase + underscore
# + version-suffix mod names (TrueActions_1.09, TrueMusic, TMMumble, …)
# that strict word boundaries fail on. False positives are accepted in
# exchange — names containing "music" without being music-related are rare
# in PZ.
_LIB_NAME_HINTS = [
"library", "libraries", "framework",
]
_LIB_NAME_RE = re.compile(
r'(?<![A-Za-z])(?:lib|api|core)(?![A-Za-z])'
r'|(?<=[a-z])(?:Lib|API|Core)(?![A-Za-z])',
@@ -381,6 +393,43 @@ def _name_has(name: str, hints: List[str]) -> bool:
return any(h in n for h in hints)
# pzmm content-type → sortof CATEGORY_ORDER mapping. Types that are absent
# from this table (Items/Animations/Lua/Unknown) are the "skip" types: they are
# too generic to pick a bucket, so _types_to_category ignores them and the
# caller falls through to the existing derive_category cascade.
# Maps/Sounds/Patch/Vehicles/Clothing duplicate signals already captured by
# the cascade but stay here as fallbacks for poorly-tagged mods.
_TYPE_TO_CAT: Dict[str, str] = {
"Maps": "map",
"Vehicles": "vehicle",
"Weapons": "weapon",
"Clothing": "wearable",
"Traits": "code",
"Professions": "profession",
"Recipes": "crafting",
"Tiles": "tile",
"Textures": "texture",
"Sounds": "sound",
"UI": "ui",
"Translations": "translation",
"Patch": "patch",
"Dependency": "tweaks",
"Framework": "tweaks",
}
def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """Translate a mod_types list into a single sortof category.

    The list is scanned in order and the first entry that has a mapping in
    _TYPE_TO_CAT decides the result. When no entry maps (only skip-types such
    as Items/Animations/Lua/Unknown are present) None is returned so the
    caller can fall through to the existing cascade.
    """
    bucket = next(
        (hit for hit in map(_TYPE_TO_CAT.get, mod_types) if hit),
        None,
    )
    # vehicle_spawn refinement matches the downstream ws_tag check.
    is_spawn_zone = bool(name) and "spawn zone" in name.lower()
    if bucket == "vehicle" and is_spawn_zone:
        return "vehicle_spawn"
    return bucket
def derive_category(mod: ModInfo) -> str:
"""Best-effort category from mod.info + workshop_meta.tags + name.
Mirrors api/mlos_sort.py; keep both copies in sync.
@@ -389,6 +438,15 @@ def derive_category(mod: ModInfo) -> str:
return mod.category
name = mod.name or ""
# pzmm-style content fingerprint takes precedence over name regex when
# available. Empty mod_types means files_manifest_built=false (older
# cached row); fall through to existing cascade.
if mod.mod_types:
cat = _types_to_category(mod.mod_types, name)
if cat:
return cat
if name and _PATCH_NAME_RE.search(name):
return "patch"
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):

View File

@@ -19,6 +19,7 @@ from __future__ import annotations
import argparse
import asyncio
import hashlib
import json
import os
import re
@@ -27,7 +28,7 @@ import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Tuple
import asyncpg
import httpx
@@ -208,14 +209,21 @@ def run_depot_downloader(
workshop_id: str,
output_dir: Path,
dd_path: Path,
filelist_regex: str = r"regex:.*\.info$",
# mod.info / map.info for the existing parse path, plus _CONFLICT_EXTS
# (lua/txt/xml/json/ini) for build_manifest_and_types — both the conflict
# manifest and pzmm-style content sniffing live on these. Binary assets
# (.png/.dds/.bank/.ogg/.wav/.X) are intentionally excluded; their type
# signals degrade gracefully via workshop_meta.tags fallback in
# mlos_sort.derive_category.
filelist_regex: str = r"regex:.*\.(info|lua|txt|xml|json|ini)$",
timeout: int = 300,
max_attempts: int = 3,
backoff_s: float = 2.0,
) -> bool:
"""
Fetch workshop item using DepotDownloader, filtered to .info files only.
Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths).
Fetch workshop item using DepotDownloader, filtered to mod.info plus
conflict-eligible asset files (lua/txt/xml/json/ini). Writes
<output_dir>/mods/<mod_id>/mod.info (and the asset tree under media/).
Returns True on success.
Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
@@ -308,6 +316,178 @@ def discover_map_folders(mip_parent: Path) -> List[str]:
return out
# -----------------------------------------------------------------------------
# Single-pass manifest + content-type detection (Plan: 2026-05-04 pzmm port)
#
# Both Integration A (file-conflict manifest) and Integration B (mod_types
# fingerprinting) read the same files from the temp DD extraction. We walk
# the mod_id root once, hashing conflict-eligible files into a manifest map
# AND collecting pzmm-style content signals into a tag set in the same loop.
# No two-pass implementations.
# -----------------------------------------------------------------------------
# Lowercased file suffixes that build_manifest_and_types hashes into the
# conflict manifest.
_CONFLICT_EXTS = {".lua", ".txt", ".xml", ".json", ".ini"}
# map subtrees are namespaced per-mod by directory; conflict surface is ~zero,
# and worldmap.xml can be tens of MB. Anything under maps/<MapName>/ is
# skipped for manifest insertion AND for content-blob sniffing; path-based
# mod_type signals (e.g. the "Maps" tag) still fire for those files.
# The pattern is matched against the lowercased posix path relative to the
# media/ directory. map.info itself still drives `mod.maps` (via
# discover_map_folders) which feeds the existing
# `mod.maps non-empty → map` rule in derive_category.
_RE_MAP_SUBTREE = re.compile(r"^maps/[^/]+/")
# Ordered by pzmm preference: build_manifest_and_types emits the detected tags
# in this order, and the first match wins when _types_to_category maps the
# list down to a single sortof CATEGORY_ORDER bucket.
_TYPE_PREFERRED = [
"Maps", "Vehicles", "Weapons", "Items", "Clothing", "Traits",
"Professions", "Recipes", "Tiles", "Textures", "Sounds",
"Animations", "UI", "Translations", "Lua", "Patch", "Dependency",
"Framework",
]
def _sha1(path: Path) -> str:
    """Return the hexadecimal SHA-1 digest of the file at *path*.

    The file is streamed in 128 KiB pieces so large files are never held in
    memory in full.
    """
    piece_size = 1024 * 128
    digest = hashlib.sha1()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
def _read_small_text(path: Path, limit: int = 256_000) -> str:
    """Best-effort read of the first *limit* bytes of *path* as lowercase text.

    Bytes that are not valid UTF-8 are dropped rather than raising. Any
    filesystem failure (missing file, permission error, path is a directory,
    read error) yields an empty string so content sniffing can carry on.

    Args:
        path: file to read.
        limit: maximum number of bytes to read from the start of the file.

    Returns:
        The decoded, lowercased text, or "" when the file cannot be read.
    """
    # Only I/O failures are absorbed; decoding with errors="ignore" cannot
    # raise, and programmer errors (e.g. a bad *limit* type) should surface.
    try:
        with path.open("rb") as f:
            data = f.read(limit)
    except OSError:
        return ""
    return data.decode("utf-8", errors="ignore").lower()
def build_manifest_and_types(
    mip_parent: Path, mod_id: str, raw: str
) -> Tuple[List[Tuple[str, str, int]], List[str]]:
    """Single-pass walk under the mod_id root.

    Args:
        mip_parent: directory that holds the parsed mod.info. When its parent
            directory is named "mods" it is used as the mod_id root;
            otherwise its parent directory is.
        mod_id: not read by this function; kept so the call signature stays
            stable for existing callers.
        raw: raw mod.info text. Used for the Patch/Dependency keyword hints
            and the `require=` check. None/empty is tolerated.

    Returns:
        manifest_rows: list of (rel_path, sha1, size_bytes) for conflict-eligible
                       files (lua/txt/xml/json/ini). rel_path is lowercased,
                       posix-style, prefixed with "media/" so it's comparable
                       across mods regardless of B41/B42/split branch layout.
                       Last-wins on duplicate rel_paths from multi-branch mods;
                       media/ directories are visited in sorted path order so
                       the winner is the same on every run. Rows are sorted
                       by rel_path.
        mod_types:     pzmm-ordered content tag list (Maps, Vehicles, …).
                       Falls back to ["Dependency"] or ["Unknown"] when no
                       media/ subtree exists.
    """
    if mip_parent.parent.name == "mods":
        modid_root = mip_parent
    else:
        modid_root = mip_parent.parent

    tags: set = set()
    name_blob = (raw or "").lower()
    if any(w in name_blob for w in ("compat", "compatibility", "patch", "fix")):
        tags.add("Patch")
    if any(w in name_blob for w in ("api", "core", "dependency", "framework", "library", "required")):
        tags.add("Dependency")

    manifest_map: Dict[str, Tuple[str, int]] = {}
    script_text_parts: List[str] = []
    lua_text_parts: List[str] = []
    has_media = False

    # rglob yields paths in no guaranteed order. Sorting both walks keeps the
    # last-wins rule and the capped lua/script sampling below reproducible.
    for media_dir in sorted(modid_root.rglob("media")):
        if not media_dir.is_dir():
            continue
        # A media/ directory nested inside another media/ tree is already
        # covered by the walk of its ancestor. Walking it again would record
        # its files a second time under a wrong, shortened rel_path.
        if "media" in media_dir.relative_to(modid_root).parts[:-1]:
            continue
        has_media = True
        for path in sorted(media_dir.rglob("*")):
            if not path.is_file():
                continue
            try:
                rel_below = path.relative_to(media_dir).as_posix().lower()
            except ValueError:
                continue
            suffix = path.suffix.lower()
            in_map_subtree = bool(_RE_MAP_SUBTREE.match(rel_below))

            # Manifest: skip per-map subtree (see _RE_MAP_SUBTREE comment).
            if suffix in _CONFLICT_EXTS and not in_map_subtree:
                rel = "media/" + rel_below
                try:
                    size = path.stat().st_size
                    sha = _sha1(path)
                    manifest_map[rel] = (sha, size)
                except OSError:
                    # Best effort: an unreadable file is left out of the
                    # manifest instead of failing the whole parse.
                    pass

            # Path-based mod_type signals — always fire, even for files inside
            # the per-map subtree. The "Maps" tag is the canonical signal that
            # this mod ships a map and shouldn't be lost when we exclude the
            # heavy worldmap.xml / objects.lua content from the manifest.
            if rel_below.startswith("maps/"):
                tags.add("Maps")
            if rel_below.startswith("texturepacks/") or "tiledefinitions" in rel_below:
                tags.add("Tiles")
            if (rel_below.startswith(("textures/", "models_x/", "models/"))
                    or suffix in {".png", ".dds"}):
                if path.name.lower() != "poster.png":
                    tags.add("Textures")
            # "scripts/vehicle" also covers "scripts/vehicles/".
            if rel_below.startswith("scripts/vehicle"):
                tags.add("Vehicles")
            if rel_below.startswith(("clothing/", "scripts/clothing/")):
                tags.add("Clothing")
            if (rel_below.startswith(("sound/", "sounds/", "fmod/"))
                    or suffix in {".bank", ".ogg", ".wav"}):
                tags.add("Sounds")
            if rel_below.startswith(("ui/", "lua/client/ui/")):
                tags.add("UI")
            if rel_below.startswith(("anims/", "animsets/", "actiongroups/")):
                tags.add("Animations")
            # The substring test also covers "lua/shared/translate/".
            if "/translate/" in rel_below:
                tags.add("Translations")

            # Content-blob accumulation: skip per-map subtree. Map-internal lua
            # / scripts (rare but possible) wouldn't normally collide with
            # other mods anyway, and reading them just costs memory for no
            # detection upside.
            if not in_map_subtree:
                if rel_below.startswith("lua/"):
                    tags.add("Lua")
                    # Sample at most 60 lua files, 64 KB each.
                    if suffix == ".lua" and len(lua_text_parts) < 60:
                        lua_text_parts.append(_read_small_text(path, 64_000))
                # Sample at most 80 script files, 96 KB each.
                if (rel_below.startswith("scripts/") and suffix in {".txt", ".xml"}
                        and len(script_text_parts) < 80):
                    script_text_parts.append(_read_small_text(path, 96_000))

    if not has_media:
        return [], (["Dependency"] if "Dependency" in tags else ["Unknown"])

    # Both blobs are already lowercased by _read_small_text, so the needles
    # below are lowercase too.
    script_blob = "\n".join(script_text_parts)
    lua_blob = "\n".join(lua_text_parts)
    if " type = weapon" in script_blob or "displaycategory = weapon" in script_blob:
        tags.add("Weapons")
    if " vehicle " in script_blob or "module vehicles" in script_blob:
        tags.add("Vehicles")
    if "item " in script_blob:
        tags.add("Items")
    if "recipe " in script_blob or " evolvedrecipe " in script_blob:
        tags.add("Recipes")
    if "bodylocation" in script_blob or "clothingitem" in script_blob:
        tags.add("Clothing")
    if "traitfactory.addtrait" in lua_blob:
        tags.add("Traits")
    if "professionfactory.addprofession" in lua_blob:
        tags.add("Professions")

    # Framework is a last resort: nothing else was detected, mod.info has a
    # require= line, and no script text was collected.
    has_require = bool(re.search(r"^\s*require\s*=", raw or "", re.IGNORECASE | re.MULTILINE))
    if not tags and has_require and not script_blob:
        tags.add("Framework")

    ordered = [t for t in _TYPE_PREFERRED if t in tags] or ["Unknown"]
    manifest_rows = [(rel, sha, size) for rel, (sha, size) in sorted(manifest_map.items())]
    return manifest_rows, ordered
# -----------------------------------------------------------------------------
# DB upserts
# -----------------------------------------------------------------------------
@@ -356,8 +536,9 @@ INSERT INTO mod_parsed (
requirements, load_after, load_before, incompatible_mods,
load_first, load_last, tags, maps,
raw_mod_info, version_min, is_addon,
mod_types, files_manifest_built,
parsed_at_time_updated, parsed_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now())
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18, now())
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
name = EXCLUDED.name,
category = EXCLUDED.category,
@@ -372,10 +553,24 @@ ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
raw_mod_info = EXCLUDED.raw_mod_info,
version_min = EXCLUDED.version_min,
is_addon = EXCLUDED.is_addon,
mod_types = EXCLUDED.mod_types,
files_manifest_built = EXCLUDED.files_manifest_built,
parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
parsed_at = now();
"""
# Removes every manifest row belonging to one (workshop_id, mod_id) pair.
# Parameters: $1 = workshop_id, $2 = mod_id. process_one runs this before
# re-inserting so a re-parse cannot leave stale rows behind.
DELETE_MOD_FILES = """
DELETE FROM mod_files WHERE workshop_id = $1 AND mod_id = $2;
"""
# Upserts a single manifest row keyed on (workshop_id, mod_id, rel_path).
# Parameters: $1 = workshop_id, $2 = mod_id, $3 = rel_path, $4 = sha1,
# $5 = size_bytes. On a key collision the stored sha1 and size_bytes are
# overwritten with the new values.
INSERT_MOD_FILE = """
INSERT INTO mod_files (workshop_id, mod_id, rel_path, sha1, size_bytes)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (workshop_id, mod_id, rel_path) DO UPDATE SET
sha1 = EXCLUDED.sha1,
size_bytes = EXCLUDED.size_bytes;
"""
# Description-text heuristic for "this mod is an optional add-on to the
# primary mod published by the same wsid". Matches:
# "Optional add-on: removes ..." (TMMumble)
@@ -487,6 +682,10 @@ async def process_one(
if mod is None:
continue
maps = discover_map_folders(mip.parent)
# Single-pass walk under the mod_id root: produces both the
# conflict manifest and the pzmm-style mod_types list. See
# build_manifest_and_types.
manifest_rows, mod_types = build_manifest_and_types(mip.parent, mod.id, raw)
# Evict any other wsid's claim on this mod_id before we install
# ours. Cache invariant: at most one wsid per mod_id, with the
# most-recent pull winning.
@@ -508,8 +707,19 @@ async def process_one(
raw,
extract_version_min(raw),
detect_is_addon(raw),
mod_types,
True, # files_manifest_built
time_updated,
)
# Replace any stale manifest rows for this (workshop_id, mod_id)
# so a re-parse can't leave behind orphans from a prior layout.
await conn.execute(DELETE_MOD_FILES, workshop_id, mod.id)
if manifest_rows:
await conn.executemany(
INSERT_MOD_FILE,
[(workshop_id, mod.id, rel, sha, size)
for rel, sha, size in manifest_rows],
)
seen_mod_ids.append(mod.id)
# Drop rows for mods that no longer exist in this workshop item