feat: pzmm conflict detection + content-type categorization

- mod_files manifest table populated at parse time
- POST /api/conflicts endpoint
- mod_types fingerprinting feeds derive_category
- DD filelist regex broadened to cover conflict-eligible exts
- media/maps/<*>/* excluded from manifest (per-mod namespaced,
  no conflict value, can be tens of MB per mod)

Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md
This commit is contained in:
2026-05-04 15:22:35 +00:00
parent a15d35214e
commit b73325882e
9 changed files with 936 additions and 18 deletions

3
.gitignore vendored
View File

@@ -20,6 +20,9 @@ __pycache__/
*.bak *.bak
*.bak-* *.bak-*
# Operational DB dumps (pg_dump output kept locally for rollback)
backups/
# Editor / IDE / OS artifacts # Editor / IDE / OS artifacts
*.swp *.swp
*.swo *.swo

View File

@@ -26,6 +26,7 @@ from pydantic import BaseModel, Field
import adapters import adapters
import db import db
import diagnostics
import expansion import expansion
import jobs import jobs
import steam import steam
@@ -191,6 +192,7 @@ def _row_to_modinfo(r) -> ModInfo:
maps=list(r["maps"] or []), maps=list(r["maps"] or []),
is_addon=bool(r["is_addon"]) if "is_addon" in r else False, is_addon=bool(r["is_addon"]) if "is_addon" in r else False,
workshop_tags=list(r["workshop_tags"] or []) if "workshop_tags" in r else [], workshop_tags=list(r["workshop_tags"] or []) if "workshop_tags" in r else [],
mod_types=list(r["mod_types"] or []) if "mod_types" in r else [],
) )
@@ -684,7 +686,7 @@ async def _build_result_for_job(
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category, SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
mp.requirements, mp.load_after, mp.load_before, mp.requirements, mp.load_after, mp.load_before,
mp.incompatible_mods, mp.load_first, mp.load_last, mp.incompatible_mods, mp.load_first, mp.load_last,
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
FROM mod_parsed mp FROM mod_parsed mp
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
WHERE mp.workshop_id = ANY($1::text[]) WHERE mp.workshop_id = ANY($1::text[])
@@ -1001,7 +1003,7 @@ async def sort_endpoint(req: SortRequest, request: Request) -> Dict[str, Any]:
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category, SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
mp.requirements, mp.load_after, mp.load_before, mp.requirements, mp.load_after, mp.load_before,
mp.incompatible_mods, mp.load_first, mp.load_last, mp.incompatible_mods, mp.load_first, mp.load_last,
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
FROM mod_parsed mp FROM mod_parsed mp
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
WHERE mp.workshop_id = ANY($1::text[]) WHERE mp.workshop_id = ANY($1::text[])
@@ -1215,7 +1217,7 @@ async def resort_endpoint(req: ResortRequest, request: Request) -> Dict[str, Any
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category, SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
mp.requirements, mp.load_after, mp.load_before, mp.requirements, mp.load_after, mp.load_before,
mp.incompatible_mods, mp.load_first, mp.load_last, mp.incompatible_mods, mp.load_first, mp.load_last,
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
FROM mod_parsed mp FROM mod_parsed mp
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
WHERE mp.workshop_id IN (SELECT workshop_id FROM selected_wsids) WHERE mp.workshop_id IN (SELECT workshop_id FROM selected_wsids)
@@ -1469,6 +1471,71 @@ async def vote_broken_mod(
return {"upvotes": int(row["upvotes"]), "downvotes": int(row["downvotes"])} return {"upvotes": int(row["upvotes"]), "downvotes": int(row["downvotes"])}
@app.post("/api/conflicts")
async def conflicts_endpoint(req: SortRequest, request: Request) -> Dict[str, Any]:
    """Detect rel_paths claimed by ≥2 input mods with non-equal sha1.

    v1: bare wsids only. Collection input returns 400 so the caller can
    resolve via /api/sort first (where the async-job + drain-progress
    plumbing already lives). Mods whose `files_manifest_built` is false
    cannot be analyzed and are reported in `missing_manifests` instead of
    being silently ignored.

    Mods are handed to the scanner in *request input order* (first
    occurrence of each wsid; mod_ids of one wsid stay in mod_id order),
    because the scanner treats list order as load order when it picks the
    `winner` and orders `providers`.

    Returns:
        {"conflicts": [{"rel_path", "providers", "winner"}, ...],
         "missing_manifests": [wsid, ...]}

    Raises:
        HTTPException(400): collection input, or no workshop ids in input.
        HTTPException(413): more than MAX_IDS workshop ids.
    """
    bare_wsids, collection_ids = parse_with_collections(req.input or "")
    if collection_ids:
        raise HTTPException(
            status_code=400,
            detail="conflict scan does not support collection input; resolve via /api/sort first",
        )
    if not bare_wsids:
        raise HTTPException(status_code=400, detail="no workshop ids found in input")
    if len(bare_wsids) > MAX_IDS:
        raise HTTPException(
            status_code=413,
            detail=f"too many workshop ids ({len(bare_wsids)} > {MAX_IDS})",
        )

    # wsid -> rank of its first occurrence in the request. Doubles as an
    # ordered, de-duplicated view of the input wsids.
    input_pos: Dict[str, int] = {}
    for w in bare_wsids:
        input_pos.setdefault(w, len(input_pos))

    pool = request.app.state.db
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
                   mp.requirements, mp.load_after, mp.load_before,
                   mp.incompatible_mods, mp.load_first, mp.load_last,
                   mp.tags, mp.maps, mp.is_addon, mp.mod_types,
                   mp.files_manifest_built, wm.tags AS workshop_tags
            FROM mod_parsed mp
            JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
            WHERE mp.workshop_id = ANY($1::text[])
              AND mp.parsed_at_time_updated = wm.time_updated
            ORDER BY mp.workshop_id, mp.mod_id
            """,
            bare_wsids,
        )

        # The SQL ordering is lexicographic by wsid, which is NOT the load
        # order the caller supplied. Re-order by input position; sorted() is
        # stable, so the per-wsid mod_id ordering from SQL is preserved.
        rows_in_input_order = sorted(
            rows,
            key=lambda r: input_pos.get(r["workshop_id"], len(input_pos)),
        )
        mods: List[ModInfo] = [_row_to_modinfo(r) for r in rows_in_input_order]
        conflicts = await diagnostics.scan_file_conflicts(conn, mods)

    # Missing-manifest wsids: input wsids that have no (fresh) mod_parsed rows
    # OR whose rows all have files_manifest_built=false. Any single built row
    # in a multi-mod wsid counts as "manifest available".
    wsids_with_manifest = {
        r["workshop_id"] for r in rows if bool(r["files_manifest_built"])
    }
    missing_manifests = [w for w in input_pos if w not in wsids_with_manifest]

    return {
        "conflicts": [
            {"rel_path": c.rel_path, "providers": c.providers, "winner": c.winner}
            for c in conflicts
        ],
        "missing_manifests": missing_manifests,
    }
# ── static frontend ──────────────────────────────────────────────────────── # ── static frontend ────────────────────────────────────────────────────────
# Mount LAST so all API routes win path resolution. # Mount LAST so all API routes win path resolution.
_FRONTEND_DIR = Path(__file__).resolve().parent.parent / "frontend" _FRONTEND_DIR = Path(__file__).resolve().parent.parent / "frontend"

58
api/categorize.py Normal file
View File

@@ -0,0 +1,58 @@
"""Public helper for mapping pzmm content-type tags to sortof CATEGORY_ORDER.
The same mapping is also inlined in `mlos_sort.py` (both api/ and worker/
copies, deliberately — worker uses a separate venv with no FastAPI deps,
so it cannot import from api/). This module exposes the helper for
non-mlos consumers (e.g. /api/conflicts diagnostics output) without
forcing them to drag in the whole sorter module.
Source: pzmm core/mods.py:detect_mod_types ordering, mapped to sortof's
CATEGORY_ORDER buckets per docs/plans/2026-05-04-pzmm-conflict-and-typing.md
§3.4.
"""
from __future__ import annotations
from typing import Dict, List, Optional
# Skip-types (Items, Animations, Lua, Unknown) are deliberately left out of
# this table: they are too generic to pick a category, so a lookup miss tells
# the caller to fall back on its other heuristics.
_TYPE_TO_CAT: Dict[str, str] = {
    "Maps": "map",
    "Vehicles": "vehicle",
    "Weapons": "weapon",
    "Clothing": "wearable",
    "Traits": "code",
    "Professions": "profession",
    "Recipes": "crafting",
    "Tiles": "tile",
    "Textures": "texture",
    "Sounds": "sound",
    "UI": "ui",
    "Translations": "translation",
    "Patch": "patch",
    "Dependency": "tweaks",
    "Framework": "tweaks",
}


def types_to_category(mod_types: List[str], name: str = "") -> Optional[str]:
    """Map an ordered pzmm content-type list to a sortof category bucket.

    The list is scanned front to back and the first type present in
    `_TYPE_TO_CAT` decides the result.

    Args:
        mod_types: ordered content-type tags (e.g. ["Items", "Weapons"]).
        name: mod display name; only consulted to refine "vehicle" into
            "vehicle_spawn" when it contains "spawn zone" (case-insensitive).

    Returns:
        The bucket name (e.g. "weapon", "vehicle"), or None when `mod_types`
        is empty or holds nothing but unmapped skip-types.
    """
    mapped = (_TYPE_TO_CAT.get(tag) for tag in mod_types or ())
    bucket = next((hit for hit in mapped if hit), None)
    # Vehicles-tagged mods named like "spawn zone X" get the narrower bucket.
    if bucket == "vehicle" and name and "spawn zone" in name.lower():
        return "vehicle_spawn"
    return bucket

93
api/diagnostics.py Normal file
View File

@@ -0,0 +1,93 @@
"""File-level conflict detection from cached manifests.
Port of pzmm core/scanner.py:scan_file_conflicts adapted to read from the
mod_files table (populated by worker.build_manifest_and_types) instead of
walking on-disk media trees. See docs/plans/2026-05-04-pzmm-conflict-and-typing.md.
"""
from __future__ import annotations
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple
from mlos_sort import ModInfo
@dataclass
class FileConflict:
    """One rel_path shipped by ≥2 input mods with differing sha1 content."""

    # Manifest path (as stored in mod_files.rel_path) the providers disagree on.
    rel_path: str
    providers: List[str]  # mod_ids in input load order
    winner: str  # mod_id (last in load order)
# Manifest lookup used by scan_file_conflicts. $1 and $2 are parallel text[]
# arrays (workshop_ids and mod_ids, same length and order); the `inputs` CTE
# unnests both side by side so each (workshop_id, mod_id) pair becomes one row,
# which is then joined against mod_files to pull that pair's rel_path/sha1 rows.
_FETCH_MANIFEST = """
WITH inputs AS (
SELECT unnest($1::text[]) AS workshop_id,
unnest($2::text[]) AS mod_id
)
SELECT mf.workshop_id, mf.mod_id, mf.rel_path, mf.sha1
FROM mod_files mf
JOIN inputs i
ON mf.workshop_id = i.workshop_id
AND mf.mod_id = i.mod_id
"""
async def scan_file_conflicts(conn, mods: List[ModInfo]) -> List[FileConflict]:
    """Report rel_paths that ≥2 of the given mods ship with differing sha1.

    The order of `mods` is taken as the load order: providers are listed in
    that order and the last one is the winner. The result is sorted by
    rel_path.

    Mods that have no manifest rows (`files_manifest_built=false`) simply
    add nothing to the scan; surfacing them (e.g. as `missing_manifests`)
    is the caller's job.
    """
    if len(mods) < 2:
        return []

    rows = await conn.fetch(
        _FETCH_MANIFEST,
        [m.workshop_id or "" for m in mods],
        [m.id for m in mods],
    )

    # mod_id → position in the input list (a repeated mod_id keeps its last
    # position). Input order stands in for load order, as in pzmm.
    position: Dict[str, int] = {}
    for pos, mod in enumerate(mods):
        position[mod.id] = pos

    # rel_path → [(position, mod_id, sha1), ...]
    claims: Dict[str, List[Tuple[int, str, str]]] = defaultdict(list)
    for row in rows:
        provider = row["mod_id"]
        if provider not in position:
            continue
        claims[row["rel_path"]].append((position[provider], provider, row["sha1"]))

    found: List[FileConflict] = []
    for rel_path, claimants in claims.items():
        # A conflict needs at least two different providers *and* at least
        # two different hashes; identical bytes from several mods are just
        # duplicates (cf. pzmm's scanner).
        if len({c[1] for c in claimants}) < 2:
            continue
        if len({c[2] for c in claimants}) < 2:
            continue
        by_load_order = sorted(claimants, key=lambda c: c[0])
        # dict.fromkeys drops repeated mod_ids while keeping first-seen order.
        providers = list(dict.fromkeys(c[1] for c in by_load_order))
        found.append(
            FileConflict(rel_path=rel_path, providers=providers, winner=providers[-1])
        )

    return sorted(found, key=lambda c: c.rel_path)

View File

@@ -130,6 +130,11 @@ class ModInfo:
# signal for build / multiplayer / category detection. Distinct from # signal for build / multiplayer / category detection. Distinct from
# `tags` which is mod.info-side (freeform). # `tags` which is mod.info-side (freeform).
workshop_tags: List[str] = field(default_factory=list) workshop_tags: List[str] = field(default_factory=list)
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
# populated by worker.build_manifest_and_types at parse time. Empty when
# files_manifest_built=false (older cached rows); derive_category falls
# through to the existing cascade in that case.
mod_types: List[str] = field(default_factory=list)
warnings: Dict[str, List[str]] = field(default_factory=dict) warnings: Dict[str, List[str]] = field(default_factory=dict)
@@ -389,30 +394,77 @@ def _name_has(name: str, hints: List[str]) -> bool:
return any(h in n for h in hints) return any(h in n for h in hints)
# pzmm content-type → sortof CATEGORY_ORDER mapping. Types that are absent
# from this table (Items/Animations/Lua/Unknown — too generic) produce no
# category, so derive_category falls through to its existing cascade.
# Maps/Sounds/Patch/Vehicles/Clothing duplicate signals already captured by
# the cascade but stay here as fallbacks for poorly-tagged mods.
_TYPE_TO_CAT: Dict[str, str] = {
    "Maps": "map",
    "Vehicles": "vehicle",
    "Weapons": "weapon",
    "Clothing": "wearable",
    "Traits": "code",
    "Professions": "profession",
    "Recipes": "crafting",
    "Tiles": "tile",
    "Textures": "texture",
    "Sounds": "sound",
    "UI": "ui",
    "Translations": "translation",
    "Patch": "patch",
    "Dependency": "tweaks",
    "Framework": "tweaks",
}
def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """First mod_type that maps to a sortof CATEGORY_ORDER bucket wins.

    Args:
        mod_types: ordered content-type tags; scanned front to back.
        name: mod display name, used only for the vehicle_spawn refinement.

    Returns:
        The mapped bucket, or None if mod_types contains only skip-types
        (Items/Animations/Lua/Unknown), so the caller can fall through to
        the existing cascade.
    """
    for t in mod_types:
        # Unmapped (skip) types give None here and are passed over.
        cat = _TYPE_TO_CAT.get(t)
        if cat:
            # vehicle_spawn refinement matches the downstream ws_tag check.
            if cat == "vehicle" and name and "spawn zone" in name.lower():
                return "vehicle_spawn"
            return cat
    return None
def derive_category(mod: ModInfo) -> str: def derive_category(mod: ModInfo) -> str:
"""Best-effort category from mod.info + workshop_meta.tags + name. """Best-effort category from mod.info + workshop_meta.tags + name.
Detection order (most specific → least): Detection order (most specific → least):
1. mod.info `category=` if explicit and recognized. 1. mod.info `category=` if explicit and recognized.
2. patch / fix name regex (Spec G-patch). 2. pzmm-style mod_types fingerprint (when files_manifest_built=true).
3. library/framework name regex (extends FRAMEWORK_KEYS). 3. patch / fix name regex (Spec G-patch).
4. mod.maps non-empty → map. 4. library/framework name regex (extends FRAMEWORK_KEYS).
5. moodle / profession / movement / specific gameplay axes by name. 5. mod.maps non-empty → map.
6. Workshop tags (canonical Steam controlled vocab): Audio + 'music' 6. moodle / profession / movement / specific gameplay axes by name.
7. Workshop tags (canonical Steam controlled vocab): Audio + 'music'
music; Audio → sound; Weapons → weapon; Vehicles → vehicle; music; Audio → sound; Weapons → weapon; Vehicles → vehicle;
Clothing/Armor + 'armor' → armor, else wearable; Building → Clothing/Armor + 'armor' → armor, else wearable; Building →
building; Farming → farming; Food → food; Skills → profession building; Farming → farming; Food → food; Skills → profession
(or moodle); Interface → ui; Textures → texture; (or moodle); Interface → ui; Textures → texture;
Language/Translation → translation; QOL → qol; Multiplayer alone Language/Translation → translation; QOL → qol; Multiplayer alone
→ multiplayer. → multiplayer.
7. mod.info tags (freeform fallback). 8. mod.info tags (freeform fallback).
8. FRAMEWORK_KEYS substring match → tweaks. 9. FRAMEWORK_KEYS substring match → tweaks.
9. Default → other. 10. Default → other.
""" """
if mod.category in CATEGORY_ORDER and mod.category != "undefined": if mod.category in CATEGORY_ORDER and mod.category != "undefined":
return mod.category return mod.category
name = mod.name or "" name = mod.name or ""
# pzmm-style content fingerprint takes precedence over name regex when
# available. Empty mod_types means files_manifest_built=false (older
# cached row); fall through to existing cascade.
if mod.mod_types:
cat = _types_to_category(mod.mod_types, name)
if cat:
return cat
if name and _PATCH_NAME_RE.search(name): if name and _PATCH_NAME_RE.search(name):
return "patch" return "patch"
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)): if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):

View File

@@ -0,0 +1,343 @@
# Plan: pzmm conflict detection + content-type categorization
**Date:** 2026-05-04
**Branch:** `feat/pzmm-conflict-typing`
**Status:** Approved (Sam, 2026-05-04)
**Sources read:**
- `/tmp/pzmm-src/pzmm-main/core/scanner.py` — `scan_file_conflicts`, `solve_load_order`, `FileConflict`
- `/tmp/pzmm-src/pzmm-main/core/mods.py` — `detect_mod_types`, `ModInfo`
- `/tmp/pzmm-src/pzmm-main/core/bundle.py` — debug bundle (read for context, not integrated)
- `/opt/sortof/init/01_schema.sql` and migrations 02..08
- `/opt/sortof/api/app.py` — `/api/sort`, `_build_result_for_job`, `_row_to_modinfo`
- `/opt/sortof/api/mlos_sort.py` — `CATEGORY_ORDER`, `derive_category`
- `/opt/sortof/api/adapters.py` — `CAT_MAP`
- `/opt/sortof/worker/worker.py` — `process_one`
**Open questions resolved at approval:**
- Manifest scope: walk all `media/` subtrees under the mod_id root, last-wins on duplicate rel_paths, **no per-branch column**.
- `mod_files.size_bytes` column: keep.
- Module split: `api/diagnostics.py` and `api/categorize.py` are **separate files**.
- `/api/conflicts` v1: **bare wsids only**, return HTTP 400 on collection input. Defer async-job/collection-expansion plumbing to a follow-up plan.
---
## 1. Context
pzmm ships two pieces sortof doesn't have today:
1. **File-conflict detection** — when two mods both ship `media/scripts/items_food.txt` with byte-different content, the later one silently overrides the earlier one at runtime. PZ never reports this; the player only sees the symptom (broken food, duplicate item ids, etc.). pzmm walks each mod's `media/` tree, hashes the conflict-prone extensions (`.lua`, `.txt`, `.xml`, `.json`, `.ini`), and reports rel-paths claimed by ≥2 mods with non-equal content. Sortof currently only detects `mod_id` collisions (one mod_id under multiple wsids). File-level overrides are invisible to us.
2. **Content-type detection** — pzmm walks `media/` paths plus the contents of `lua/` and `scripts/*.txt|xml` files to fingerprint what a mod actually ships (Weapons, Vehicles, Maps, Traits, Professions, Recipes, etc.). Sortof's `derive_category` infers category from `workshop_meta.tags` + name regex + `mod.info` tags. Authors who tag poorly (or skip tagging) end up in `other`/`undefined`. Detection from media/ contents is more reliable for those.
Both pzmm functions assume on-disk media trees. Sortof's worker uses `tempfile.TemporaryDirectory` (`worker/worker.py:472`) — the entire DD extraction is destroyed at the end of `process_one`'s `with` block. **Only `mod.info` (as `raw_mod_info`), discovered map folder names, and a few derived columns persist.**
This plan keeps the existing model: parse once, serve from DB. We **persist a manifest at parse time**. Re-fetch on demand was rejected — every conflict check would queue N DD pulls, minutes per request, completely unusable.
We **do not import pzmm's `solve_load_order`**. Sortof's `mlos_sort.py` is strictly more correct (preorder, loadFirst/loadLast tiers, category buckets, patch G-axis, multi-branch picker, addon injection). pzmm's solver is a plain Kahn topo sort with no tie-breakers.
---
## 2. Integration A — File conflict detection
### 2.1 New schema (`init/09_mod_files.sql`)
```sql
CREATE TABLE IF NOT EXISTS mod_files (
workshop_id TEXT NOT NULL,
mod_id TEXT NOT NULL,
rel_path TEXT NOT NULL, -- lowercased, posix-style, relative to mod_id root
sha1 TEXT NOT NULL,
size_bytes INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (workshop_id, mod_id, rel_path),
FOREIGN KEY (workshop_id, mod_id) REFERENCES mod_parsed (workshop_id, mod_id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS mod_files_rel_path_idx ON mod_files (rel_path);
CREATE INDEX IF NOT EXISTS mod_files_mod_idx ON mod_files (workshop_id, mod_id);
```
Plus additions to `mod_parsed`:
```sql
ALTER TABLE mod_parsed
ADD COLUMN IF NOT EXISTS mod_types TEXT[] NOT NULL DEFAULT '{}',
ADD COLUMN IF NOT EXISTS files_manifest_built BOOLEAN NOT NULL DEFAULT FALSE;
```
The flag lets `derive_category` and `/api/conflicts` know whether a mod has a manifest yet (graceful degradation while the cache backfills organically).
### 2.2 Worker changes (`worker/worker.py`)
In `process_one`, **inside the existing `with tempfile.TemporaryDirectory` block** (after `discover_mod_infos`, before the `with` exits):
**Single-pass requirement:** the manifest build (Integration A) and `detect_mod_types` content sniffing (Integration B) **share one pass over the tempdir**. No two-pass implementations. The walk reads each file's bytes once: hash → manifest insert; concurrently inspect path + content for type signals. The output is the `mod_files` rows for that mod_id and the ordered `mod_types` list, both committed in the same transaction as the existing `UPSERT_MOD_PARSED`.
For each `(workshop_id, mod_id)` pair we just upserted:
1. Compute `mod_id_root`: the directory whose name equals `mod.id`. For B41 (`mods/<modId>/mod.info`) that's `mip.parent`; for B42 (`mods/<modId>/<branch>/mod.info`) that's `mip.parent.parent`. Detect via `mip.parent.name == mod.id`.
2. Single recursive walk under `mod_id_root` covering every `media/` subtree (handles B42 `<branch>/media/` + `common/media/` together). For each file:
- If suffix matches `_CONFLICT_EXTS = {".lua", ".txt", ".xml", ".json", ".ini"}` (verbatim from pzmm `scanner.py:21`), compute sha1 (chunked reader, mirrors pzmm `_sha1`) and accumulate `(rel_path, sha1, size_bytes)`. **Last-wins** on duplicate rel_paths across branches.
- Concurrently, in the same loop, accumulate the path-based signals from pzmm `mods.py:detect_mod_types` (lines 88–115): `Maps`, `Tiles`, `Textures`, `Vehicles`, `Clothing`, `Sounds`, `UI`, `Animations`, `Translations`, `Lua`, plus collected `lua_text_parts` and `script_text_parts` blobs (capped at 60 lua × 64 KB and 80 script × 96 KB per pzmm).
3. After the walk, run pzmm's content-blob checks (lines 117–136): weapon/vehicle/item/recipe/clothing/trait/profession signals from concatenated blobs. Resolve to `mod_types` ordered list (lines 138–145).
4. DELETE existing `mod_files` rows for `(workshop_id, mod_id)` then bulk INSERT new rows.
5. UPSERT `mod_parsed.mod_types` and set `files_manifest_built = true` for the row.
The whole step adds disk-walk + hashing of small text files only — typical mod has 20–200 files in scope, hashing is cheap (≤100 KB each, sha1 ≈ 500 MB/s). Estimated cost: <500 ms per mod, well under the DD pull cost we're already paying.
### 2.3 New module: `api/diagnostics.py`
Port of pzmm `scan_file_conflicts` adapted to read from `mod_files` instead of walking disk:
```python
async def scan_file_conflicts(conn, mods: list[ModInfo]) -> list[FileConflict]:
"""For the given (already-loaded) ModInfos, report rel_paths claimed
by ≥2 mods with non-equal sha1. Returns list ordered by rel_path."""
```
Implementation:
1. `SELECT workshop_id, mod_id, rel_path, sha1 FROM mod_files WHERE (workshop_id, mod_id) IN (...)`.
2. Group rows in Python by `rel_path`.
3. For each group with ≥2 distinct mods, count distinct sha1s. If >1, emit a `FileConflict`.
4. Winner = last in input order (mirrors pzmm's "last in load order wins").
Dataclass:
```python
@dataclass
class FileConflict:
rel_path: str
providers: list[str] # mod_ids (not ModInfo, to keep payload small)
winner: str # mod_id
```
`pzmm.scanner._CONFLICT_EXTS` filtering happened at manifest-build time, so this read path doesn't need it.
### 2.4 New endpoint: `POST /api/conflicts`
Same input shape as `/api/sort`, **bare wsids only** (Q4 resolved):
```json
{"input": "wsid1;wsid2;wsid3", "rules": "...", "pz_build": "B42"}
```
If `parse_with_collections` returns any `collection_ids`, return HTTP 400 with `detail="conflict scan does not support collection input; resolve via /api/sort first"`.
Response:
```json
{
"conflicts": [
{"rel_path": "media/scripts/items_food.txt",
"providers": ["FoodModA", "FoodModB"],
"winner": "FoodModB"}
],
"missing_manifests": ["wsid1", "wsid2"]
}
```
`missing_manifests` lists mods we couldn't analyze because `files_manifest_built=false`. The frontend can show a banner ("X mods haven't been re-fetched since this feature shipped — file conflicts unavailable for them"), and re-clicking sort eventually triggers re-parse on workshop updates.
Reuse path: `_build_result_for_job` already loads ModInfos via `_row_to_modinfo` — the conflicts endpoint follows the same load pattern, then calls `scan_file_conflicts(conn, mods)` instead of `sort_mods`.
### 2.5 Frontend (out of scope for this plan)
A follow-up plan can wire a "File conflicts" warnings section. For now `/api/conflicts` is consumable from curl and lays the groundwork.
---
## 3. Integration B — Content-type detection feeding category derivation
### 3.1 Schema additions
Already covered by §2.1's `mod_parsed` ALTER TABLE (`mod_types` + `files_manifest_built`). One migration file (`init/09_mod_files.sql`) ships both A and B because they share the worker walk.
### 3.2 Worker changes
Folded into §2.2's single-pass walk. No additional file I/O.
### 3.3 New module: `api/categorize.py`
```python
def types_to_category(mod_types: list[str], name: str) -> str | None:
"""First mod_type that maps to a sortof CATEGORY_ORDER bucket wins.
Returns None if mod_types is empty or contains only skip-types
(Items / Animations / Lua / Unknown, see §3.4), in which case we
should fall through to the existing derive_category cascade."""
```
### 3.4 Tag→category mapping (explicit)
| pzmm `mod_type` | sortof `CATEGORY_ORDER` | notes |
|---|---|---|
| `Maps` | `map` | already covered by `mod.maps non-empty`; types-derived is a fallback |
| `Vehicles` | `vehicle` | name regex `"spawn zone"` already routes to `vehicle_spawn` upstream |
| `Weapons` | `weapon` | wins over `Items` (pzmm prefers list ordering) |
| `Items` | *skip* | too generic — almost every mod has Items; would mis-trigger |
| `Clothing` | `wearable` | armor name-hint check still runs after, can override to `armor` |
| `Traits` | `code` | no dedicated `trait` bucket; `code` is the gameplay-axis fallback |
| `Professions` | `profession` | |
| `Recipes` | `crafting` | |
| `Tiles` | `tile` | |
| `Textures` | `texture` | |
| `Sounds` | `sound` | already handled by `Audio` ws_tag; types-derived is a fallback |
| `Animations` | *skip* | no bucket; falls through |
| `UI` | `ui` | |
| `Translations` | `translation` | |
| `Lua` | *skip* | too generic; falls through |
| `Patch` | `patch` | already detected by `_PATCH_NAME_RE`; types-derived is a fallback |
| `Dependency` | `tweaks` | maps to existing `lib` pill |
| `Framework` | `tweaks` | same |
| `Unknown` | *skip* | falls through |
"*skip*" means: don't return a category; let `derive_category` continue its cascade.
### 3.5 `derive_category` integration
Insert a single new check in `api/mlos_sort.py:derive_category` after the explicit-category early return at line 412, **before** the patch/lib name regex at lines 416–419:
```python
if mod.mod_types:
cat = types_to_category(mod.mod_types, name)
if cat:
return cat
```
`mod.mod_types` is added to the `ModInfo` dataclass (`mlos_sort.py:113`). `_row_to_modinfo` (`api/app.py:176`) is updated to read the new column. **Both `mlos_sort.py` copies must change in lockstep.**
**Position rationale:** `mod_types` comes from media-content fingerprinting, more reliable than name regex but less reliable than an explicit `category=` field in `mod.info`. So it sits between (1) explicit category and (2) name regex. The patch/lib regexes that come after still win for true patches/libraries (they'd usually return `Patch`/`Dependency` from detect_mod_types anyway, but we want the regex to win for cases where a "patch mod" hasn't shipped enough media to fingerprint).
Empty `mod_types` (e.g. older rows where `files_manifest_built=false`) means the new check returns `None` and the existing cascade runs unchanged. **Graceful degradation is built in.**
---
## 4. Blockers / risks
### 4.1 Schema migration cost
- Current cache: **3,123 `workshop_meta` rows, 3,298 `mod_parsed` rows**.
- New `mod_files` rows estimate: median mod ships ~50 conflict-eligible files (light mods 5–10, heavy framework/map mods 200–500). At 50 avg × 3,298 mods = **~165 k rows**. With sha1 (40 chars) + rel_path (avg 80 chars) + overhead ≈ 200 bytes/row, that's ~33 MB before indexes. Postgres handles this trivially.
- `ALTER TABLE mod_parsed ADD COLUMN mod_types TEXT[]` and `files_manifest_built BOOLEAN` are additive and metadata-only on Postgres 16 (no rewrite). Instant.
### 4.2 Backfill feasibility
- The `/tmp/sortof_steam_throttle` flock + `/tmp/sortof_steam_cooldown` 1h kill-switch (worker.py — `fetch_required_wsids`) protect us from Steam metadata 429s. **DD itself does not hit the metadata API**; it hits Steam content servers, which are not part of the rate-limited path. So mass re-DD does not trip the cooldown.
- Mass re-DD still costs real time: typical DD pull is 20–60 s wall-clock. 3,123 wsids × 30 s avg ÷ 4 drains = **~6.5 hours wall-clock for a full backfill**. Doable but disruptive.
- **Recommendation: do not run a bulk backfill.** Let the cache populate organically — every workshop update bumps `time_updated`, which triggers a re-parse and now also a manifest build. The `missing_manifests` field in `/api/conflicts` and the empty-`mod_types` graceful-degrade path together mean the feature works on day 1 (empty results for old rows) and improves as authors push updates.
- Per-mod manual trigger pattern still works (operator-only):
```sql
DELETE FROM mod_parsed WHERE workshop_id='<wsid>';
INSERT INTO download_jobs (workshop_id, status) VALUES ('<wsid>','queued');
```
### 4.3 Inline detection at sort time
- Rejected. `detect_mod_types` reads up to ~11 MB per mod from disk (lua/script blobs). With the tempdir destroyed (the actual case), we'd need to re-DD inline — minutes per sort.
- **All detection runs at parse time** in `process_one`. `derive_category` and `/api/conflicts` are pure DB reads.
---
## 5. Files touched (summary)
**New:**
- `init/09_mod_files.sql` — `mod_files` table, `mod_parsed.mod_types`, `mod_parsed.files_manifest_built`
- `api/diagnostics.py` — port of `scan_file_conflicts`, `FileConflict` dataclass
- `api/categorize.py` — `types_to_category` helper
**Modified:**
- `worker/worker.py` — extend `process_one`'s `with` block: single-pass walk, manifest + detect_mod_types, upsert rows
- `worker/worker.py` (top-level) — port `detect_mod_types` from pzmm `mods.py:57–145` (sortof-side copy; do not import from pzmm at runtime)
- `api/mlos_sort.py` — add `mod_types: List[str]` to `ModInfo` dataclass; add `mod_types` check at top of `derive_category`
- `worker/mlos_sort.py` — mirror the `ModInfo` and `derive_category` change (worker/api dual-edit rule)
- `api/app.py` — `_row_to_modinfo` reads new `mod_types` column; `_build_result_for_job` SELECT list adds `mp.mod_types`; register `POST /api/conflicts`
**Out of scope (deferred to follow-up plan):**
- Frontend conflicts panel — `/api/conflicts` endpoint only, no UI
- Integration of `pzmm/core/bundle.py` (debug bundle export) — read for context, not ported
- Backfill orchestration — relying on organic backfill
---
## 6. Rollback
Before applying the migration:
```bash
# Backup mod_parsed (the only existing table we ALTER)
sudo docker exec -i sortof_db pg_dump -U sortof -d sortof -t mod_parsed \
> /opt/sortof/backups/mod_parsed-pre-09.sql.$(date +%Y%m%d-%H%M)
ls -la /opt/sortof/backups/ | tail -3
```
Down SQL (paste into psql to revert the schema half of this plan):
```sql
DROP TABLE IF EXISTS mod_files;
ALTER TABLE mod_parsed
DROP COLUMN IF EXISTS mod_types,
DROP COLUMN IF EXISTS files_manifest_built;
```
To revert code, `git checkout main` and restart services:
```bash
sudo systemctl restart sortof-api sortof-drain@1 sortof-drain@2 sortof-drain@3 sortof-drain@4
```
The migration is additive only (new table + new columns with safe defaults), so the rollback is a clean drop. No data is destroyed in `mod_parsed`'s existing columns.
---
## 7. Verification
1. **Migration applies cleanly:**
```bash
sudo docker exec -i sortof_db psql -U sortof -d sortof < /opt/sortof/init/09_mod_files.sql
sudo docker exec -i sortof_db psql -U sortof -d sortof -c "\d mod_files"
sudo docker exec -i sortof_db psql -U sortof -d sortof -c "\d mod_parsed" | grep -E "mod_types|files_manifest_built"
```
2. **Compile checks** (after every Python edit):
```bash
/opt/sortof/api/.venv/bin/python -m py_compile /opt/sortof/api/app.py /opt/sortof/api/mlos_sort.py /opt/sortof/api/diagnostics.py /opt/sortof/api/categorize.py
/opt/sortof/worker/.venv/bin/python -m py_compile /opt/sortof/worker/worker.py /opt/sortof/worker/mlos_sort.py
cd /opt/sortof/api && .venv/bin/python -c "import app" && echo OK
cd /opt/sortof/worker && .venv/bin/python -c "import drain" && echo OK
```
3. **Dual-edit consistency check** (worker/api `mlos_sort.py` lockstep rule):
```bash
diff /opt/sortof/api/mlos_sort.py /opt/sortof/worker/mlos_sort.py | grep -E "^[<>]" | head -20
```
Logic must match; only comments / docstrings may differ. If any logic line shows up in the diff, fix the lockstep before continuing.
4. **Restart services:**
```bash
sudo systemctl restart sortof-api sortof-drain@1 sortof-drain@2 sortof-drain@3 sortof-drain@4
sudo systemctl is-active sortof-api sortof-drain@{1..4}
```
5. **Force a fresh parse on a known multi-file mod and verify manifest:**
```bash
sudo docker exec -i sortof_db psql -U sortof -d sortof -c \
"DELETE FROM mod_parsed WHERE workshop_id='2169435993';
INSERT INTO download_jobs (workshop_id, status) VALUES ('2169435993','queued');"
sleep 60
sudo docker exec -i sortof_db psql -U sortof -d sortof -c \
"SELECT mod_id, mod_types, files_manifest_built FROM mod_parsed WHERE workshop_id='2169435993';
SELECT count(*) AS file_count FROM mod_files WHERE workshop_id='2169435993';"
```
Expected: `files_manifest_built=t`, `mod_types` populated, `file_count > 0`.
6. **Conflict endpoint smoke:**
```bash
curl -sS -X POST http://100.114.205.53:8801/api/conflicts \
-H 'Content-Type: application/json' \
-d '{"input":"2169435993;2392709985;2487022075"}' | jq .
```
Expected: `{"conflicts": [], "missing_manifests": [<wsids without manifests yet>]}`.
7. **Collection-input rejection (Q4):**
```bash
curl -sS -i -X POST http://100.114.205.53:8801/api/conflicts \
-H 'Content-Type: application/json' \
-d '{"input":"https://steamcommunity.com/sharedfiles/filedetails/?id=999999999"}' | head -5
```
Expected: HTTP 400 with the documented `detail` message (when the URL is detected as a collection ref).
8. **Category-from-types smoke:**
- Find a mod whose Steam tags don't reflect content (e.g. weapon mod tagged only `Realistic`); `/api/sort` currently classifies it as `code` / `other` / `undefined`.
- Re-queue it through the new pipeline (delete+insert).
- Re-run `/api/sort`; confirm category is now `weapon`.
9. **Graceful-degradation check:** confirm a mod with `files_manifest_built=false` still sorts correctly through the existing cascade (no exceptions, category falls back to current behavior).

34
init/09_mod_files.sql Normal file
View File

@@ -0,0 +1,34 @@
-- pzmm-conflict-typing migration (Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md)
--
-- Adds:
-- - mod_files: per-mod manifest of conflict-eligible asset files (lua/txt/xml/json/ini)
-- with sha1 fingerprint. Used by /api/conflicts to flag rel_paths claimed
-- by ≥2 mods with non-equal content.
-- - mod_parsed.mod_types: ordered tag list from detect_mod_types content fingerprinting
-- (Maps, Vehicles, Weapons, Traits, …). Consumed by derive_category.
-- - mod_parsed.files_manifest_built: graceful-degradation flag. False until the
-- worker has run the single-pass media/ walk that produces both the
-- manifest rows and mod_types. Old cached rows stay false until they
-- organically re-parse.
--
-- Backfill strategy: organic only — time_updated bumps trigger re-parse, which
-- now also runs the manifest walk. /api/conflicts surfaces missing_manifests so
-- the frontend can communicate gaps without lying.
-- One row per conflict-eligible file shipped by a (workshop_id, mod_id) pair.
CREATE TABLE IF NOT EXISTS mod_files (
workshop_id TEXT NOT NULL,
mod_id TEXT NOT NULL,
-- Path of the file as recorded by the worker's manifest walk.
rel_path TEXT NOT NULL,
-- Content fingerprint; two mods claiming the same rel_path with different
-- sha1 values are what the conflict check is meant to surface.
sha1 TEXT NOT NULL,
-- NOTE(review): INTEGER is 32-bit; fine for text assets, but confirm no
-- manifest-eligible file can exceed ~2 GiB.
size_bytes INTEGER NOT NULL DEFAULT 0,
PRIMARY KEY (workshop_id, mod_id, rel_path),
-- Manifest rows are owned by their mod_parsed row: deleting the parent row
-- removes its manifest via the cascade.
FOREIGN KEY (workshop_id, mod_id)
REFERENCES mod_parsed (workshop_id, mod_id) ON DELETE CASCADE
);
-- Supports lookups keyed by rel_path alone (the primary key leads with
-- workshop_id, so it cannot serve those).
CREATE INDEX IF NOT EXISTS mod_files_rel_path_idx ON mod_files (rel_path);
-- NOTE(review): (workshop_id, mod_id) is already the leading prefix of the
-- primary-key index, so this index looks redundant — confirm before keeping.
CREATE INDEX IF NOT EXISTS mod_files_mod_idx ON mod_files (workshop_id, mod_id);
-- Both columns default to "nothing known yet" so existing rows stay valid
-- without a backfill.
ALTER TABLE mod_parsed
ADD COLUMN IF NOT EXISTS mod_types TEXT[] NOT NULL DEFAULT '{}',
ADD COLUMN IF NOT EXISTS files_manifest_built BOOLEAN NOT NULL DEFAULT FALSE;

View File

@@ -130,6 +130,11 @@ class ModInfo:
# signal for build / multiplayer / category detection. Distinct from # signal for build / multiplayer / category detection. Distinct from
# `tags` which is mod.info-side (freeform). # `tags` which is mod.info-side (freeform).
workshop_tags: List[str] = field(default_factory=list) workshop_tags: List[str] = field(default_factory=list)
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
# populated by worker.build_manifest_and_types at parse time. Empty when
# files_manifest_built=false (older cached rows); derive_category falls
# through to the existing cascade in that case.
mod_types: List[str] = field(default_factory=list)
warnings: Dict[str, List[str]] = field(default_factory=dict) warnings: Dict[str, List[str]] = field(default_factory=dict)
@@ -347,8 +352,15 @@ def load_mods_from_dir(root: Path) -> List[ModInfo]:
_PATCH_NAME_RE = re.compile(r"\b(patch|compat|compatibility)\b", re.IGNORECASE) _PATCH_NAME_RE = re.compile(r"\b(patch|compat|compatibility)\b", re.IGNORECASE)
# Substring-based category hints (kept in sync with api/mlos_sort.py) # Substring lists used for derive_category name heuristics. Plain substring
_LIB_NAME_HINTS = ["library", "libraries", "framework"] # matching (vs. \b regex) survives PZ's mishmash of camelCase + underscore
# + version-suffix mod names (TrueActions_1.09, TrueMusic, TMMumble, …)
# that strict word boundaries fail on. False positives are accepted in
# exchange — names containing "music" without being music-related are rare
# in PZ.
_LIB_NAME_HINTS = [
"library", "libraries", "framework",
]
_LIB_NAME_RE = re.compile( _LIB_NAME_RE = re.compile(
r'(?<![A-Za-z])(?:lib|api|core)(?![A-Za-z])' r'(?<![A-Za-z])(?:lib|api|core)(?![A-Za-z])'
r'|(?<=[a-z])(?:Lib|API|Core)(?![A-Za-z])', r'|(?<=[a-z])(?:Lib|API|Core)(?![A-Za-z])',
@@ -381,6 +393,43 @@ def _name_has(name: str, hints: List[str]) -> bool:
return any(h in n for h in hints) return any(h in n for h in hints)
# pzmm content-type → sortof CATEGORY_ORDER mapping. Only types with a useful
# category are listed; a type that is absent from this dict (Items, Animations,
# Lua, Unknown — too generic to classify on) is skipped by _types_to_category,
# and when every type is skipped the caller falls through to the existing
# derive_category cascade. Maps/Sounds/Patch/Vehicles/Clothing duplicate signals
# already captured by the cascade but stay here as fallbacks for poorly-tagged
# mods. Dependency and Framework both collapse to "tweaks".
_TYPE_TO_CAT: Dict[str, str] = {
"Maps": "map",
"Vehicles": "vehicle",
"Weapons": "weapon",
"Clothing": "wearable",
"Traits": "code",
"Professions": "profession",
"Recipes": "crafting",
"Tiles": "tile",
"Textures": "texture",
"Sounds": "sound",
"UI": "ui",
"Translations": "translation",
"Patch": "patch",
"Dependency": "tweaks",
"Framework": "tweaks",
}
def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """Collapse an ordered mod_types list to one category, or None.

    The first entry of ``mod_types`` that has a mapping in ``_TYPE_TO_CAT``
    decides the result; entries without a mapping are ignored. ``None`` is
    returned when no entry maps, so the caller can continue with its own
    cascade. A "vehicle" result is refined to "vehicle_spawn" when ``name``
    contains "spawn zone" (case-insensitive).
    """
    candidates = (_TYPE_TO_CAT.get(type_tag) for type_tag in mod_types)
    bucket = next((c for c in candidates if c), None)
    # vehicle_spawn refinement matches the downstream ws_tag check.
    if bucket == "vehicle" and name and "spawn zone" in name.lower():
        return "vehicle_spawn"
    return bucket
def derive_category(mod: ModInfo) -> str: def derive_category(mod: ModInfo) -> str:
"""Best-effort category from mod.info + workshop_meta.tags + name. """Best-effort category from mod.info + workshop_meta.tags + name.
Mirrors api/mlos_sort.py; keep both copies in sync. Mirrors api/mlos_sort.py; keep both copies in sync.
@@ -389,6 +438,15 @@ def derive_category(mod: ModInfo) -> str:
return mod.category return mod.category
name = mod.name or "" name = mod.name or ""
# pzmm-style content fingerprint takes precedence over name regex when
# available. Empty mod_types means files_manifest_built=false (older
# cached row); fall through to existing cascade.
if mod.mod_types:
cat = _types_to_category(mod.mod_types, name)
if cat:
return cat
if name and _PATCH_NAME_RE.search(name): if name and _PATCH_NAME_RE.search(name):
return "patch" return "patch"
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)): if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):

View File

@@ -19,6 +19,7 @@ from __future__ import annotations
import argparse import argparse
import asyncio import asyncio
import hashlib
import json import json
import os import os
import re import re
@@ -27,7 +28,7 @@ import subprocess
import sys import sys
import tempfile import tempfile
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional from typing import Dict, List, Optional, Tuple
import asyncpg import asyncpg
import httpx import httpx
@@ -208,14 +209,21 @@ def run_depot_downloader(
workshop_id: str, workshop_id: str,
output_dir: Path, output_dir: Path,
dd_path: Path, dd_path: Path,
filelist_regex: str = r"regex:.*\.info$", # mod.info / map.info for the existing parse path, plus _CONFLICT_EXTS
# (lua/txt/xml/json/ini) for build_manifest_and_types — both the conflict
# manifest and pzmm-style content sniffing live on these. Binary assets
# (.png/.dds/.bank/.ogg/.wav/.X) are intentionally excluded; their type
# signals degrade gracefully via workshop_meta.tags fallback in
# mlos_sort.derive_category.
filelist_regex: str = r"regex:.*\.(info|lua|txt|xml|json|ini)$",
timeout: int = 300, timeout: int = 300,
max_attempts: int = 3, max_attempts: int = 3,
backoff_s: float = 2.0, backoff_s: float = 2.0,
) -> bool: ) -> bool:
""" """
Fetch workshop item using DepotDownloader, filtered to .info files only. Fetch workshop item using DepotDownloader, filtered to mod.info plus
Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths). conflict-eligible asset files (lua/txt/xml/json/ini). Writes
<output_dir>/mods/<mod_id>/mod.info (and the asset tree under media/).
Returns True on success. Returns True on success.
Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
@@ -308,6 +316,178 @@ def discover_map_folders(mip_parent: Path) -> List[str]:
return out return out
# -----------------------------------------------------------------------------
# Single-pass manifest + content-type detection (Plan: 2026-05-04 pzmm port)
#
# Both Integration A (file-conflict manifest) and Integration B (mod_types
# fingerprinting) read the same files from the temp DD extraction. We walk
# the mod_id root once, hashing conflict-eligible files into a manifest map
# AND collecting pzmm-style content signals into a tag set in the same loop.
# No two-pass implementations.
# -----------------------------------------------------------------------------
# Lowercased file suffixes (leading dot included) that are hashed into the
# conflict manifest; compared against path.suffix.lower() during the walk.
_CONFLICT_EXTS = {".lua", ".txt", ".xml", ".json", ".ini"}
# map subtrees are namespaced per-mod by directory; conflict surface is ~zero,
# and worldmap.xml can be tens of MB. Anything under maps/<MapName>/ is skipped
# for manifest insertion AND for content-blob sniffing; path-based tags (e.g.
# "Maps") still fire for those files. The pattern is matched against the
# lowercased path relative to a media/ directory, so a file sitting directly in
# maps/ (no subfolder) does not match. map.info itself
# still drives `mod.maps` (via discover_map_folders) which feeds the existing
# `mod.maps non-empty → map` rule in derive_category.
_RE_MAP_SUBTREE = re.compile(r"^maps/[^/]+/")
# Ordered by pzmm preference: first match wins when _types_to_category maps
# this list down to a single sortof CATEGORY_ORDER bucket. "Unknown" is not
# listed; build_manifest_and_types emits it only as the empty-result fallback.
_TYPE_PREFERRED = [
"Maps", "Vehicles", "Weapons", "Items", "Clothing", "Traits",
"Professions", "Recipes", "Tiles", "Textures", "Sounds",
"Animations", "UI", "Translations", "Lua", "Patch", "Dependency",
"Framework",
]
def _sha1(path: Path) -> str:
h = hashlib.sha1()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(1024 * 128), b""):
h.update(chunk)
return h.hexdigest()
def _read_small_text(path: Path, limit: int = 256_000) -> str:
    """Return up to *limit* bytes of *path* as lowercased text, or "" on failure.

    The bytes are decoded as UTF-8 with undecodable sequences dropped. This
    is a best-effort sniffing helper: any exception yields an empty string
    instead of propagating.
    """
    text = ""
    try:
        with path.open("rb") as stream:
            head = stream.read(limit)
        text = head.decode("utf-8", "ignore").lower()
    except Exception:
        # Deliberately broad: an unreadable file must not abort the caller.
        text = ""
    return text
def build_manifest_and_types(
    mip_parent: Path, mod_id: str, raw: str
) -> Tuple[List[Tuple[str, str, int]], List[str]]:
    """Single-pass walk under the mod_id root.

    Args:
        mip_parent: Directory that contains the mod.info being parsed. When
            its parent directory is named "mods" it is treated as the mod_id
            root; otherwise its parent is (mod.info sitting one level below
            the root, presumably a per-build branch folder).
        mod_id: Not referenced by the walk; accepted for call-site symmetry.
        raw: Raw mod.info text. Used for the Patch / Dependency keyword
            heuristics and the ``require=`` check. May be empty or None.

    Returns:
        manifest_rows: list of (rel_path, sha1, size_bytes) for conflict-eligible
                       files (lua/txt/xml/json/ini), sorted by rel_path.
                       rel_path is lowercased, posix-style, prefixed with
                       "media/" so it's comparable across mods regardless of
                       B41/B42/split branch layout. On duplicate rel_paths
                       from multi-branch mods the media/ directory whose path
                       sorts last wins (deterministic across runs).
        mod_types:     pzmm-ordered content tag list (Maps, Vehicles, …).
                       Falls back to ["Dependency"] or ["Unknown"] when no
                       media/ subtree exists.
    """
    if mip_parent.parent.name == "mods":
        modid_root = mip_parent
    else:
        modid_root = mip_parent.parent

    tags: set = set()
    # NOTE(review): plain substring match over the whole mod.info text, so
    # short needles ("api", "fix", "core") also hit inside longer words.
    name_blob = (raw or "").lower()
    if any(w in name_blob for w in ("compat", "compatibility", "patch", "fix")):
        tags.add("Patch")
    if any(w in name_blob for w in ("api", "core", "dependency", "framework", "library", "required")):
        tags.add("Dependency")

    manifest_map: Dict[str, Tuple[str, int]] = {}
    script_text_parts: List[str] = []
    lua_text_parts: List[str] = []
    has_media = False

    # rglob yields entries in filesystem order, which is arbitrary. Sorting
    # both walks makes the last-wins rule for duplicate rel_paths and the
    # capped text sampling below reproducible from run to run.
    for media_dir in sorted(modid_root.rglob("media")):
        if not media_dir.is_dir():
            continue
        # A directory named "media" nested inside another media tree is
        # already covered by the outer walk; walking it again would record
        # its files under a truncated, wrong rel_path.
        if "media" in media_dir.relative_to(modid_root).parts[:-1]:
            continue
        has_media = True
        for path in sorted(media_dir.rglob("*")):
            if not path.is_file():
                continue
            try:
                rel_below = path.relative_to(media_dir).as_posix().lower()
            except ValueError:
                continue
            suffix = path.suffix.lower()
            in_map_subtree = bool(_RE_MAP_SUBTREE.match(rel_below))

            # Manifest: skip per-map subtree (see _RE_MAP_SUBTREE comment).
            if suffix in _CONFLICT_EXTS and not in_map_subtree:
                rel = "media/" + rel_below
                try:
                    size = path.stat().st_size
                    sha = _sha1(path)
                    manifest_map[rel] = (sha, size)
                except OSError:
                    # Best-effort: an unreadable file is left out of the
                    # manifest rather than failing the whole parse.
                    pass

            # Path-based mod_type signals — always fire, even for files inside
            # the per-map subtree. The "Maps" tag is the canonical signal that
            # this mod ships a map and shouldn't be lost when we exclude the
            # heavy worldmap.xml / objects.lua content from the manifest.
            if rel_below.startswith("maps/"):
                tags.add("Maps")
            if rel_below.startswith("texturepacks/") or "tiledefinitions" in rel_below:
                tags.add("Tiles")
            if (rel_below.startswith(("textures/", "models_x/", "models/"))
                    or suffix in {".png", ".dds"}):
                # A lone poster image is not a texture signal.
                if path.name.lower() != "poster.png":
                    tags.add("Textures")
            if rel_below.startswith(("scripts/vehicles/", "scripts/vehicle")):
                tags.add("Vehicles")
            if rel_below.startswith(("clothing/", "scripts/clothing/")):
                tags.add("Clothing")
            if (rel_below.startswith(("sound/", "sounds/", "fmod/"))
                    or suffix in {".bank", ".ogg", ".wav"}):
                tags.add("Sounds")
            if rel_below.startswith(("ui/", "lua/client/ui/")):
                tags.add("UI")
            if rel_below.startswith(("anims/", "animsets/", "actiongroups/")):
                tags.add("Animations")
            if rel_below.startswith("lua/shared/translate/") or "/translate/" in rel_below:
                tags.add("Translations")

            # Content-blob accumulation: skip per-map subtree. Map-internal lua
            # / scripts (rare but possible) wouldn't normally collide with
            # other mods anyway, and reading them just costs memory for no
            # detection upside.
            if not in_map_subtree:
                if rel_below.startswith("lua/"):
                    tags.add("Lua")
                    # Sample at most 60 lua files, 64 kB each.
                    if suffix == ".lua" and len(lua_text_parts) < 60:
                        lua_text_parts.append(_read_small_text(path, 64_000))
                # Sample at most 80 script files, 96 kB each.
                if (rel_below.startswith("scripts/") and suffix in {".txt", ".xml"}
                        and len(script_text_parts) < 80):
                    script_text_parts.append(_read_small_text(path, 96_000))

    if not has_media:
        return [], (["Dependency"] if "Dependency" in tags else ["Unknown"])

    # Both blobs are already lowercased by _read_small_text, so the needles
    # below are lowercase too.
    script_blob = "\n".join(script_text_parts)
    lua_blob = "\n".join(lua_text_parts)

    if " type = weapon" in script_blob or "displaycategory = weapon" in script_blob:
        tags.add("Weapons")
    if " vehicle " in script_blob or "module vehicles" in script_blob:
        tags.add("Vehicles")
    if "item " in script_blob:
        tags.add("Items")
    if "recipe " in script_blob or " evolvedrecipe " in script_blob:
        tags.add("Recipes")
    if "bodylocation" in script_blob or "clothingitem" in script_blob:
        tags.add("Clothing")
    if "traitfactory.addtrait" in lua_blob:
        tags.add("Traits")
    if "professionfactory.addprofession" in lua_blob:
        tags.add("Professions")

    # A mod that declares a require= line but produced no other signal and
    # ships no scripts is classified as a Framework.
    has_require = bool(re.search(r"^\s*require\s*=", raw or "", re.IGNORECASE | re.MULTILINE))
    if not tags and has_require and not script_blob:
        tags.add("Framework")

    ordered = [t for t in _TYPE_PREFERRED if t in tags] or ["Unknown"]
    manifest_rows = [(rel, sha, size) for rel, (sha, size) in sorted(manifest_map.items())]
    return manifest_rows, ordered
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# DB upserts # DB upserts
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
@@ -356,8 +536,9 @@ INSERT INTO mod_parsed (
requirements, load_after, load_before, incompatible_mods, requirements, load_after, load_before, incompatible_mods,
load_first, load_last, tags, maps, load_first, load_last, tags, maps,
raw_mod_info, version_min, is_addon, raw_mod_info, version_min, is_addon,
mod_types, files_manifest_built,
parsed_at_time_updated, parsed_at parsed_at_time_updated, parsed_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now()) ) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18, now())
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
name = EXCLUDED.name, name = EXCLUDED.name,
category = EXCLUDED.category, category = EXCLUDED.category,
@@ -372,10 +553,24 @@ ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
raw_mod_info = EXCLUDED.raw_mod_info, raw_mod_info = EXCLUDED.raw_mod_info,
version_min = EXCLUDED.version_min, version_min = EXCLUDED.version_min,
is_addon = EXCLUDED.is_addon, is_addon = EXCLUDED.is_addon,
mod_types = EXCLUDED.mod_types,
files_manifest_built = EXCLUDED.files_manifest_built,
parsed_at_time_updated = EXCLUDED.parsed_at_time_updated, parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
parsed_at = now(); parsed_at = now();
""" """
# Removes every manifest row for one (workshop_id, mod_id) pair.
# Parameters: $1 = workshop_id, $2 = mod_id.
DELETE_MOD_FILES = """
DELETE FROM mod_files WHERE workshop_id = $1 AND mod_id = $2;
"""
# Inserts one manifest row, overwriting sha1 / size_bytes if the primary key
# (workshop_id, mod_id, rel_path) already exists.
# Parameters: $1 = workshop_id, $2 = mod_id, $3 = rel_path, $4 = sha1,
# $5 = size_bytes.
INSERT_MOD_FILE = """
INSERT INTO mod_files (workshop_id, mod_id, rel_path, sha1, size_bytes)
VALUES ($1, $2, $3, $4, $5)
ON CONFLICT (workshop_id, mod_id, rel_path) DO UPDATE SET
sha1 = EXCLUDED.sha1,
size_bytes = EXCLUDED.size_bytes;
"""
# Description-text heuristic for "this mod is an optional add-on to the # Description-text heuristic for "this mod is an optional add-on to the
# primary mod published by the same wsid". Matches: # primary mod published by the same wsid". Matches:
# "Optional add-on: removes ..." (TMMumble) # "Optional add-on: removes ..." (TMMumble)
@@ -487,6 +682,10 @@ async def process_one(
if mod is None: if mod is None:
continue continue
maps = discover_map_folders(mip.parent) maps = discover_map_folders(mip.parent)
# Single-pass walk under the mod_id root: produces both the
# conflict manifest and the pzmm-style mod_types list. See
# build_manifest_and_types.
manifest_rows, mod_types = build_manifest_and_types(mip.parent, mod.id, raw)
# Evict any other wsid's claim on this mod_id before we install # Evict any other wsid's claim on this mod_id before we install
# ours. Cache invariant: at most one wsid per mod_id, with the # ours. Cache invariant: at most one wsid per mod_id, with the
# most-recent pull winning. # most-recent pull winning.
@@ -508,8 +707,19 @@ async def process_one(
raw, raw,
extract_version_min(raw), extract_version_min(raw),
detect_is_addon(raw), detect_is_addon(raw),
mod_types,
True, # files_manifest_built
time_updated, time_updated,
) )
# Replace any stale manifest rows for this (workshop_id, mod_id)
# so a re-parse can't leave behind orphans from a prior layout.
await conn.execute(DELETE_MOD_FILES, workshop_id, mod.id)
if manifest_rows:
await conn.executemany(
INSERT_MOD_FILE,
[(workshop_id, mod.id, rel, sha, size)
for rel, sha, size in manifest_rows],
)
seen_mod_ids.append(mod.id) seen_mod_ids.append(mod.id)
# Drop rows for mods that no longer exist in this workshop item # Drop rows for mods that no longer exist in this workshop item