feat: pzmm conflict detection + content-type categorization
- mod_files manifest table populated at parse time - POST /api/conflicts endpoint - mod_types fingerprinting feeds derive_category - DD filelist regex broadened to cover conflict-eligible exts - media/maps/<*>/* excluded from manifest (per-mod namespaced, no conflict value, can be tens of MB per mod) Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md
This commit is contained in:
@@ -130,6 +130,11 @@ class ModInfo:
|
||||
# signal for build / multiplayer / category detection. Distinct from
|
||||
# `tags` which is mod.info-side (freeform).
|
||||
workshop_tags: List[str] = field(default_factory=list)
|
||||
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
|
||||
# populated by worker.build_manifest_and_types at parse time. Empty when
|
||||
# files_manifest_built=false (older cached rows); derive_category falls
|
||||
# through to the existing cascade in that case.
|
||||
mod_types: List[str] = field(default_factory=list)
|
||||
warnings: Dict[str, List[str]] = field(default_factory=dict)
|
||||
|
||||
|
||||
@@ -347,8 +352,15 @@ def load_mods_from_dir(root: Path) -> List[ModInfo]:
|
||||
_PATCH_NAME_RE = re.compile(r"\b(patch|compat|compatibility)\b", re.IGNORECASE)
|
||||
|
||||
|
||||
# Substring-based category hints (kept in sync with api/mlos_sort.py)
|
||||
_LIB_NAME_HINTS = ["library", "libraries", "framework"]
|
||||
# Substring lists used for derive_category name heuristics. Plain substring
|
||||
# matching (vs. \b regex) survives PZ's mishmash of camelCase + underscore
|
||||
# + version-suffix mod names (TrueActions_1.09, TrueMusic, TMMumble, …)
|
||||
# that strict word boundaries fail on. False positives are accepted in
|
||||
# exchange — names containing "music" without being music-related are rare
|
||||
# in PZ.
|
||||
_LIB_NAME_HINTS = [
|
||||
"library", "libraries", "framework",
|
||||
]
|
||||
_LIB_NAME_RE = re.compile(
|
||||
r'(?<![A-Za-z])(?:lib|api|core)(?![A-Za-z])'
|
||||
r'|(?<=[a-z])(?:Lib|API|Core)(?![A-Za-z])',
|
||||
@@ -381,6 +393,43 @@ def _name_has(name: str, hints: List[str]) -> bool:
|
||||
return any(h in n for h in hints)
|
||||
|
||||
|
||||
# pzmm content-type → sortof CATEGORY_ORDER mapping. Types absent here fall
|
||||
# through to the existing derive_category cascade. Items/Animations/Lua/Unknown
|
||||
# are too generic; Maps/Sounds/Patch/Vehicles/Clothing duplicate signals already
|
||||
# captured by the cascade but stay here as fallbacks for poorly-tagged mods.
|
||||
# Lookup table: pzmm content-type name -> sortof category name. Content types
# with no entry (for example Items, Animations, Lua, Unknown) yield no
# category from this table.
_TYPE_TO_CAT: Dict[str, str] = dict(
    Maps="map",
    Vehicles="vehicle",
    Weapons="weapon",
    Clothing="wearable",
    Traits="code",
    Professions="profession",
    Recipes="crafting",
    Tiles="tile",
    Textures="texture",
    Sounds="sound",
    UI="ui",
    Translations="translation",
    Patch="patch",
    # Dependency and Framework both land in the same bucket.
    Dependency="tweaks",
    Framework="tweaks",
)
|
||||
|
||||
|
||||
def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """Collapse a pzmm content-type list into a single sortof category.

    ``mod_types`` is scanned in order; the first entry with a truthy mapping
    in ``_TYPE_TO_CAT`` decides the result. A "vehicle" result becomes
    "vehicle_spawn" when ``name`` contains "spawn zone" (case-insensitive).

    Returns None when no entry maps to a category, so the caller can fall
    back to its other heuristics.
    """
    candidates = (_TYPE_TO_CAT.get(entry) for entry in mod_types)
    category = next((c for c in candidates if c), None)
    if category is None:
        return None
    # Same "spawn zone" refinement the workshop-tag path applies to vehicles.
    if category == "vehicle" and name and "spawn zone" in name.lower():
        return "vehicle_spawn"
    return category
|
||||
|
||||
|
||||
def derive_category(mod: ModInfo) -> str:
|
||||
"""Best-effort category from mod.info + workshop_meta.tags + name.
|
||||
Mirrors api/mlos_sort.py; keep both copies in sync.
|
||||
@@ -389,6 +438,15 @@ def derive_category(mod: ModInfo) -> str:
|
||||
return mod.category
|
||||
|
||||
name = mod.name or ""
|
||||
|
||||
# pzmm-style content fingerprint takes precedence over name regex when
|
||||
# available. Empty mod_types means files_manifest_built=false (older
|
||||
# cached row); fall through to existing cascade.
|
||||
if mod.mod_types:
|
||||
cat = _types_to_category(mod.mod_types, name)
|
||||
if cat:
|
||||
return cat
|
||||
|
||||
if name and _PATCH_NAME_RE.search(name):
|
||||
return "patch"
|
||||
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):
|
||||
|
||||
220
worker/worker.py
220
worker/worker.py
@@ -19,6 +19,7 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -27,7 +28,7 @@ import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
@@ -208,14 +209,21 @@ def run_depot_downloader(
|
||||
workshop_id: str,
|
||||
output_dir: Path,
|
||||
dd_path: Path,
|
||||
filelist_regex: str = r"regex:.*\.info$",
|
||||
# mod.info / map.info for the existing parse path, plus _CONFLICT_EXTS
|
||||
# (lua/txt/xml/json/ini) for build_manifest_and_types — both the conflict
|
||||
# manifest and pzmm-style content sniffing live on these. Binary assets
|
||||
# (.png/.dds/.bank/.ogg/.wav/.X) are intentionally excluded; their type
|
||||
# signals degrade gracefully via workshop_meta.tags fallback in
|
||||
# mlos_sort.derive_category.
|
||||
filelist_regex: str = r"regex:.*\.(info|lua|txt|xml|json|ini)$",
|
||||
timeout: int = 300,
|
||||
max_attempts: int = 3,
|
||||
backoff_s: float = 2.0,
|
||||
) -> bool:
|
||||
"""
|
||||
Fetch workshop item using DepotDownloader, filtered to .info files only.
|
||||
Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths).
|
||||
Fetch workshop item using DepotDownloader, filtered to mod.info plus
|
||||
conflict-eligible asset files (lua/txt/xml/json/ini). Writes
|
||||
<output_dir>/mods/<mod_id>/mod.info (and the asset tree under media/).
|
||||
Returns True on success.
|
||||
|
||||
Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
|
||||
@@ -308,6 +316,178 @@ def discover_map_folders(mip_parent: Path) -> List[str]:
|
||||
return out
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Single-pass manifest + content-type detection (Plan: 2026-05-04 pzmm port)
|
||||
#
|
||||
# Both Integration A (file-conflict manifest) and Integration B (mod_types
|
||||
# fingerprinting) read the same files from the temp DD extraction. We walk
|
||||
# the mod_id root once, hashing conflict-eligible files into a manifest map
|
||||
# AND collecting pzmm-style content signals into a tag set in the same loop.
|
||||
# No two-pass implementations.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# File extensions (lowercase, with leading dot) that are hashed into the
# conflict manifest.
_CONFLICT_EXTS = {
    ".lua",
    ".txt",
    ".xml",
    ".json",
    ".ini",
}

# Matches media-relative paths that sit inside a per-map directory
# (maps/<MapName>/...). Those subtrees are namespaced per mod, so their
# conflict surface is ~zero, and worldmap.xml can be tens of MB. Anything
# matching is kept out of the manifest and out of content sniffing. map.info
# itself still drives `mod.maps` (via discover_map_folders), which feeds the
# existing `mod.maps non-empty → map` rule in derive_category.
_RE_MAP_SUBTREE = re.compile(r"^maps/[^/]+/")

# pzmm preference order for content types. The detected tag set is emitted in
# this order, so the first entry that maps to a sortof CATEGORY_ORDER bucket
# is the one that wins when the list is reduced to a single category.
_TYPE_PREFERRED = [
    "Maps",
    "Vehicles",
    "Weapons",
    "Items",
    "Clothing",
    "Traits",
    "Professions",
    "Recipes",
    "Tiles",
    "Textures",
    "Sounds",
    "Animations",
    "UI",
    "Translations",
    "Lua",
    "Patch",
    "Dependency",
    "Framework",
]
|
||||
|
||||
|
||||
def _sha1(path: Path) -> str:
    """Return the hex SHA-1 digest of the file at ``path``.

    The file is consumed in 128 KiB pieces so large files are never held in
    memory whole. Errors from opening or reading the file propagate to the
    caller.
    """
    piece_size = 1024 * 128
    digest = hashlib.sha1()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _read_small_text(path: Path, limit: int = 256_000) -> str:
    """Read at most ``limit`` bytes of ``path`` as lowercased text.

    Bytes are decoded as UTF-8 with undecodable sequences dropped, so the
    result is only suitable for substring sniffing, not for round-tripping.

    Returns "" when the file cannot be opened or read. Only I/O-level
    failures are swallowed; a programming error (such as a wrong argument
    type) is allowed to surface instead of silently looking like an empty
    file.
    """
    try:
        with path.open("rb") as f:
            data = f.read(limit)
    except (OSError, ValueError):
        # Best effort: an unreadable file contributes no text.
        return ""
    # errors="ignore" means decoding itself cannot raise.
    return data.decode("utf-8", errors="ignore").lower()
|
||||
|
||||
|
||||
def _path_type_tags(rel_below: str, suffix: str, filename: str) -> List[str]:
    """Return content tags implied by a file's media-relative path alone.

    All three arguments are expected to be lowercased already.
    """
    found: List[str] = []
    if rel_below.startswith("maps/"):
        found.append("Maps")
    if rel_below.startswith("texturepacks/") or "tiledefinitions" in rel_below:
        found.append("Tiles")
    if (rel_below.startswith(("textures/", "models_x/", "models/"))
            or suffix in {".png", ".dds"}):
        # poster.png alone must not mark a mod as a texture mod.
        if filename != "poster.png":
            found.append("Textures")
    # "scripts/vehicle" also covers "scripts/vehicles/".
    if rel_below.startswith("scripts/vehicle"):
        found.append("Vehicles")
    if rel_below.startswith(("clothing/", "scripts/clothing/")):
        found.append("Clothing")
    if (rel_below.startswith(("sound/", "sounds/", "fmod/"))
            or suffix in {".bank", ".ogg", ".wav"}):
        found.append("Sounds")
    if rel_below.startswith(("ui/", "lua/client/ui/")):
        found.append("UI")
    if rel_below.startswith(("anims/", "animsets/", "actiongroups/")):
        found.append("Animations")
    if rel_below.startswith("lua/shared/translate/") or "/translate/" in rel_below:
        found.append("Translations")
    return found


def _content_type_tags(script_blob: str, lua_blob: str) -> List[str]:
    """Return content tags implied by substrings of the sniffed file text.

    Both blobs are expected to be lowercased already.
    """
    found: List[str] = []
    if " type = weapon" in script_blob or "displaycategory = weapon" in script_blob:
        found.append("Weapons")
    if " vehicle " in script_blob or "module vehicles" in script_blob:
        found.append("Vehicles")
    if "item " in script_blob:
        found.append("Items")
    if "recipe " in script_blob or " evolvedrecipe " in script_blob:
        found.append("Recipes")
    if "bodylocation" in script_blob or "clothingitem" in script_blob:
        found.append("Clothing")
    if "traitfactory.addtrait" in lua_blob:
        found.append("Traits")
    if "professionfactory.addprofession" in lua_blob:
        found.append("Professions")
    return found


def build_manifest_and_types(
    mip_parent: Path, mod_id: str, raw: str
) -> Tuple[List[Tuple[str, str, int]], List[str]]:
    """Single-pass walk under the mod_id root.

    Args:
        mip_parent: directory that contains the mod.info being parsed. When
            its parent directory is named "mods" it is itself the mod_id
            root; otherwise its parent is used as the root.
        mod_id: accepted for interface compatibility; currently unused here.
        raw: raw mod.info text, used for the Patch / Dependency / Framework
            keyword heuristics. May be empty or None.

    Returns:
        manifest_rows: list of (rel_path, sha1, size_bytes) for
            conflict-eligible files (lua/txt/xml/json/ini), sorted by
            rel_path. rel_path is lowercased, posix-style and prefixed with
            "media/" so it is comparable across mods regardless of branch
            layout. When several media/ roots supply the same rel_path the
            last one wins; roots are visited in sorted path order so the
            winner is deterministic.
        mod_types: pzmm-ordered content tag list (Maps, Vehicles, ...).
            Falls back to ["Dependency"] or ["Unknown"] when no media/
            subtree exists, and to ["Unknown"] when nothing was detected.

    A ``media`` directory nested inside another ``media`` tree is not treated
    as a root of its own: its files are already visited through the outer
    tree, and walking it again would add phantom short rel_paths (and would
    let files under maps/<MapName>/media/ bypass the map-subtree exclusion).
    """
    if mip_parent.parent.name == "mods":
        modid_root = mip_parent
    else:
        modid_root = mip_parent.parent

    tags: set = set()

    # Keyword heuristics over the whole mod.info text (plain substrings).
    name_blob = (raw or "").lower()
    if any(w in name_blob for w in ("compat", "compatibility", "patch", "fix")):
        tags.add("Patch")
    if any(w in name_blob for w in ("api", "core", "dependency", "framework", "library", "required")):
        tags.add("Dependency")

    manifest_map: Dict[str, Tuple[str, int]] = {}
    script_text_parts: List[str] = []
    lua_text_parts: List[str] = []

    # rglob order depends on the filesystem; sorting keeps last-wins and the
    # sniffing caps below reproducible between runs.
    media_dirs = sorted(p for p in modid_root.rglob("media") if p.is_dir())
    has_media = bool(media_dirs)
    known_roots = set(media_dirs)
    top_level_roots = [
        p for p in media_dirs
        if not any(ancestor in known_roots for ancestor in p.parents)
    ]

    for media_dir in top_level_roots:
        for path in sorted(media_dir.rglob("*")):
            if not path.is_file():
                continue
            rel_below = path.relative_to(media_dir).as_posix().lower()
            suffix = path.suffix.lower()
            in_map_subtree = bool(_RE_MAP_SUBTREE.match(rel_below))

            # Manifest: skip the per-map subtree (see _RE_MAP_SUBTREE).
            if suffix in _CONFLICT_EXTS and not in_map_subtree:
                rel = "media/" + rel_below
                try:
                    size = path.stat().st_size
                    sha = _sha1(path)
                    manifest_map[rel] = (sha, size)
                except OSError:
                    # Best effort: an unreadable file is left out of the
                    # manifest rather than failing the whole parse.
                    pass

            # Path-based signals always fire, even inside the per-map
            # subtree, so the "Maps" tag survives the manifest exclusion.
            tags.update(_path_type_tags(rel_below, suffix, path.name.lower()))

            # Content sniffing: skipped inside the per-map subtree, and
            # capped at 60 lua files / 80 script files to bound memory.
            if not in_map_subtree:
                if rel_below.startswith("lua/"):
                    tags.add("Lua")
                # NOTE(review): assumed to be independent of the lua/ prefix
                # check above (any non-map .lua file is sniffed) — confirm.
                if suffix == ".lua" and len(lua_text_parts) < 60:
                    lua_text_parts.append(_read_small_text(path, 64_000))
                if (rel_below.startswith("scripts/") and suffix in {".txt", ".xml"}
                        and len(script_text_parts) < 80):
                    script_text_parts.append(_read_small_text(path, 96_000))

    if not has_media:
        return [], (["Dependency"] if "Dependency" in tags else ["Unknown"])

    script_blob = "\n".join(script_text_parts)
    lua_blob = "\n".join(lua_text_parts)
    tags.update(_content_type_tags(script_blob, lua_blob))

    # A mod that declares require= but showed no other signal at all is
    # classified as a framework.
    has_require = bool(re.search(r"^\s*require\s*=", raw or "", re.IGNORECASE | re.MULTILINE))
    if not tags and has_require and not script_blob:
        tags.add("Framework")

    ordered = [t for t in _TYPE_PREFERRED if t in tags] or ["Unknown"]
    manifest_rows = [(rel, sha, size) for rel, (sha, size) in sorted(manifest_map.items())]
    return manifest_rows, ordered
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DB upserts
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -356,8 +536,9 @@ INSERT INTO mod_parsed (
|
||||
requirements, load_after, load_before, incompatible_mods,
|
||||
load_first, load_last, tags, maps,
|
||||
raw_mod_info, version_min, is_addon,
|
||||
mod_types, files_manifest_built,
|
||||
parsed_at_time_updated, parsed_at
|
||||
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now())
|
||||
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18, now())
|
||||
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
category = EXCLUDED.category,
|
||||
@@ -372,10 +553,24 @@ ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
|
||||
raw_mod_info = EXCLUDED.raw_mod_info,
|
||||
version_min = EXCLUDED.version_min,
|
||||
is_addon = EXCLUDED.is_addon,
|
||||
mod_types = EXCLUDED.mod_types,
|
||||
files_manifest_built = EXCLUDED.files_manifest_built,
|
||||
parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
|
||||
parsed_at = now();
|
||||
"""
|
||||
|
||||
DELETE_MOD_FILES = """
|
||||
DELETE FROM mod_files WHERE workshop_id = $1 AND mod_id = $2;
|
||||
"""
|
||||
|
||||
INSERT_MOD_FILE = """
|
||||
INSERT INTO mod_files (workshop_id, mod_id, rel_path, sha1, size_bytes)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
ON CONFLICT (workshop_id, mod_id, rel_path) DO UPDATE SET
|
||||
sha1 = EXCLUDED.sha1,
|
||||
size_bytes = EXCLUDED.size_bytes;
|
||||
"""
|
||||
|
||||
# Description-text heuristic for "this mod is an optional add-on to the
|
||||
# primary mod published by the same wsid". Matches:
|
||||
# "Optional add-on: removes ..." (TMMumble)
|
||||
@@ -487,6 +682,10 @@ async def process_one(
|
||||
if mod is None:
|
||||
continue
|
||||
maps = discover_map_folders(mip.parent)
|
||||
# Single-pass walk under the mod_id root: produces both the
|
||||
# conflict manifest and the pzmm-style mod_types list. See
|
||||
# build_manifest_and_types.
|
||||
manifest_rows, mod_types = build_manifest_and_types(mip.parent, mod.id, raw)
|
||||
# Evict any other wsid's claim on this mod_id before we install
|
||||
# ours. Cache invariant: at most one wsid per mod_id, with the
|
||||
# most-recent pull winning.
|
||||
@@ -508,8 +707,19 @@ async def process_one(
|
||||
raw,
|
||||
extract_version_min(raw),
|
||||
detect_is_addon(raw),
|
||||
mod_types,
|
||||
True, # files_manifest_built
|
||||
time_updated,
|
||||
)
|
||||
# Replace any stale manifest rows for this (workshop_id, mod_id)
|
||||
# so a re-parse can't leave behind orphans from a prior layout.
|
||||
await conn.execute(DELETE_MOD_FILES, workshop_id, mod.id)
|
||||
if manifest_rows:
|
||||
await conn.executemany(
|
||||
INSERT_MOD_FILE,
|
||||
[(workshop_id, mod.id, rel, sha, size)
|
||||
for rel, sha, size in manifest_rows],
|
||||
)
|
||||
seen_mod_ids.append(mod.id)
|
||||
|
||||
# Drop rows for mods that no longer exist in this workshop item
|
||||
|
||||
Reference in New Issue
Block a user