feat: pzmm conflict detection + content-type categorization
- mod_files manifest table populated at parse time - POST /api/conflicts endpoint - mod_types fingerprinting feeds derive_category - DD filelist regex broadened to cover conflict-eligible exts - media/maps/<*>/* excluded from manifest (per-mod namespaced, no conflict value, can be tens of MB per mod) Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -20,6 +20,9 @@ __pycache__/
|
||||
*.bak
|
||||
*.bak-*
|
||||
|
||||
# Operational DB dumps (pg_dump output kept locally for rollback)
|
||||
backups/
|
||||
|
||||
# Editor / IDE / OS artifacts
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
73
api/app.py
73
api/app.py
@@ -26,6 +26,7 @@ from pydantic import BaseModel, Field
|
||||
|
||||
import adapters
|
||||
import db
|
||||
import diagnostics
|
||||
import expansion
|
||||
import jobs
|
||||
import steam
|
||||
@@ -191,6 +192,7 @@ def _row_to_modinfo(r) -> ModInfo:
|
||||
maps=list(r["maps"] or []),
|
||||
is_addon=bool(r["is_addon"]) if "is_addon" in r else False,
|
||||
workshop_tags=list(r["workshop_tags"] or []) if "workshop_tags" in r else [],
|
||||
mod_types=list(r["mod_types"] or []) if "mod_types" in r else [],
|
||||
)
|
||||
|
||||
|
||||
@@ -684,7 +686,7 @@ async def _build_result_for_job(
|
||||
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
|
||||
mp.requirements, mp.load_after, mp.load_before,
|
||||
mp.incompatible_mods, mp.load_first, mp.load_last,
|
||||
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags
|
||||
mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
|
||||
FROM mod_parsed mp
|
||||
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
|
||||
WHERE mp.workshop_id = ANY($1::text[])
|
||||
@@ -1001,7 +1003,7 @@ async def sort_endpoint(req: SortRequest, request: Request) -> Dict[str, Any]:
|
||||
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
|
||||
mp.requirements, mp.load_after, mp.load_before,
|
||||
mp.incompatible_mods, mp.load_first, mp.load_last,
|
||||
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags
|
||||
mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
|
||||
FROM mod_parsed mp
|
||||
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
|
||||
WHERE mp.workshop_id = ANY($1::text[])
|
||||
@@ -1215,7 +1217,7 @@ async def resort_endpoint(req: ResortRequest, request: Request) -> Dict[str, Any
|
||||
SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
|
||||
mp.requirements, mp.load_after, mp.load_before,
|
||||
mp.incompatible_mods, mp.load_first, mp.load_last,
|
||||
mp.tags, mp.maps, mp.is_addon, wm.tags AS workshop_tags
|
||||
mp.tags, mp.maps, mp.is_addon, mp.mod_types, wm.tags AS workshop_tags
|
||||
FROM mod_parsed mp
|
||||
JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
|
||||
WHERE mp.workshop_id IN (SELECT workshop_id FROM selected_wsids)
|
||||
@@ -1469,6 +1471,71 @@ async def vote_broken_mod(
|
||||
return {"upvotes": int(row["upvotes"]), "downvotes": int(row["downvotes"])}
|
||||
|
||||
|
||||
@app.post("/api/conflicts")
async def conflicts_endpoint(req: SortRequest, request: Request) -> Dict[str, Any]:
    """Detect rel_paths claimed by ≥2 input mods with non-equal sha1.

    v1: bare wsids only. Collection input returns 400 so the caller can
    resolve via /api/sort first (where the async-job + drain-progress
    plumbing already lives). Mods whose `files_manifest_built` is false
    cannot be analyzed and are reported in `missing_manifests` instead of
    silently ignored.

    The mods are handed to the scanner in the order their workshop ids
    appear in the request input, because the scanner treats list order as
    load order when it picks each conflict's `winner`.

    Returns:
        {"conflicts": [{"rel_path", "providers", "winner"}, ...],
         "missing_manifests": [wsid, ...]}

    Raises:
        HTTPException 400: collection ids in the input, or no workshop ids.
        HTTPException 413: more than MAX_IDS workshop ids.
    """
    bare_wsids, collection_ids = parse_with_collections(req.input or "")
    if collection_ids:
        raise HTTPException(
            status_code=400,
            detail="conflict scan does not support collection input; resolve via /api/sort first",
        )
    if not bare_wsids:
        raise HTTPException(status_code=400, detail="no workshop ids found in input")
    if len(bare_wsids) > MAX_IDS:
        raise HTTPException(
            status_code=413,
            detail=f"too many workshop ids ({len(bare_wsids)} > {MAX_IDS})",
        )

    pool = request.app.state.db
    async with pool.acquire() as conn:
        rows = await conn.fetch(
            """
            SELECT mp.workshop_id, mp.mod_id, mp.name, mp.category,
                   mp.requirements, mp.load_after, mp.load_before,
                   mp.incompatible_mods, mp.load_first, mp.load_last,
                   mp.tags, mp.maps, mp.is_addon, mp.mod_types,
                   mp.files_manifest_built, wm.tags AS workshop_tags
            FROM mod_parsed mp
            JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id
            WHERE mp.workshop_id = ANY($1::text[])
              AND mp.parsed_at_time_updated = wm.time_updated
            ORDER BY mp.workshop_id, mp.mod_id
            """,
            bare_wsids,
        )

        # The query returns rows sorted by workshop_id, which is NOT the
        # caller's order. Re-sort by first position in the input so that
        # "last provider wins" reflects the requested order. sorted() is
        # stable, so the mod_id ordering within one wsid is preserved.
        input_pos: Dict[str, int] = {}
        for pos, wsid in enumerate(bare_wsids):
            input_pos.setdefault(wsid, pos)
        rows = sorted(
            rows,
            key=lambda r: input_pos.get(r["workshop_id"], len(input_pos)),
        )

        mods: List[ModInfo] = [_row_to_modinfo(r) for r in rows]

        # Missing-manifest wsids: input wsids that have no mod_parsed rows
        # OR whose rows all have files_manifest_built=false. Any single
        # built row in a multi-mod wsid counts as "manifest available".
        wsid_has_manifest: Dict[str, bool] = {}
        for r in rows:
            w = r["workshop_id"]
            built = bool(r["files_manifest_built"])
            wsid_has_manifest[w] = wsid_has_manifest.get(w, False) or built
        missing_manifests = [w for w in bare_wsids if not wsid_has_manifest.get(w, False)]

        # Must run while the pooled connection is still held.
        conflicts = await diagnostics.scan_file_conflicts(conn, mods)

    return {
        "conflicts": [
            {"rel_path": c.rel_path, "providers": c.providers, "winner": c.winner}
            for c in conflicts
        ],
        "missing_manifests": missing_manifests,
    }
|
||||
|
||||
|
||||
# ── static frontend ────────────────────────────────────────────────────────
|
||||
# Mount LAST so all API routes win path resolution.
|
||||
_FRONTEND_DIR = Path(__file__).resolve().parent.parent / "frontend"
|
||||
|
||||
58
api/categorize.py
Normal file
58
api/categorize.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Public helper for mapping pzmm content-type tags to sortof CATEGORY_ORDER.
|
||||
|
||||
The same mapping is also inlined in `mlos_sort.py` (both api/ and worker/
|
||||
copies, deliberately — worker uses a separate venv with no FastAPI deps,
|
||||
so it cannot import from api/). This module exposes the helper for
|
||||
non-mlos consumers (e.g. /api/conflicts diagnostics output) without
|
||||
forcing them to drag in the whole sorter module.
|
||||
|
||||
Source: pzmm core/mods.py:detect_mod_types ordering, mapped to sortof's
|
||||
CATEGORY_ORDER buckets per docs/plans/2026-05-04-pzmm-conflict-and-typing.md
|
||||
§3.4.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Items / Animations / Lua / Unknown intentionally absent — too generic to
|
||||
# drive a category decision; callers should fall through to other heuristics.
|
||||
_TYPE_TO_CAT: Dict[str, str] = {
    "Maps": "map",
    "Vehicles": "vehicle",
    "Weapons": "weapon",
    "Clothing": "wearable",
    "Traits": "code",
    "Professions": "profession",
    "Recipes": "crafting",
    "Tiles": "tile",
    "Textures": "texture",
    "Sounds": "sound",
    "UI": "ui",
    "Translations": "translation",
    "Patch": "patch",
    "Dependency": "tweaks",
    "Framework": "tweaks",
}


def types_to_category(mod_types: List[str], name: str = "") -> Optional[str]:
    """Map a list of content-type tags to a category bucket.

    The tags are scanned in order and the first one present in
    `_TYPE_TO_CAT` decides the result.

    Args:
        mod_types: content-type tags; may be empty.
        name: mod display name, consulted only to refine `vehicle`.

    Returns:
        The bucket name, `"vehicle_spawn"` when the winning bucket is
        `vehicle` and `name` contains "spawn zone" (case-insensitive), or
        None when `mod_types` is empty or none of its tags are mapped.
    """
    # Lazily translate tags; unmapped tags come back as None and are skipped.
    translated = (_TYPE_TO_CAT.get(tag) for tag in (mod_types or ()))
    bucket = next((hit for hit in translated if hit), None)
    if bucket is None:
        return None

    is_spawn_zone = bool(name) and "spawn zone" in name.lower()
    if bucket == "vehicle" and is_spawn_zone:
        return "vehicle_spawn"
    return bucket
|
||||
93
api/diagnostics.py
Normal file
93
api/diagnostics.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""File-level conflict detection from cached manifests.
|
||||
|
||||
Port of pzmm core/scanner.py:scan_file_conflicts adapted to read from the
|
||||
mod_files table (populated by worker.build_manifest_and_types) instead of
|
||||
walking on-disk media trees. See docs/plans/2026-05-04-pzmm-conflict-and-typing.md.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from mlos_sort import ModInfo
|
||||
|
||||
|
||||
@dataclass
class FileConflict:
    """One rel_path shipped by several mods with differing content."""

    rel_path: str
    providers: List[str]  # mod_ids, ordered by position in the input list
    winner: str           # mod_id of the last provider in that order


# Pairs the two parallel arrays ($1 workshop ids, $2 mod ids) and returns
# every manifest row belonging to one of those pairs.
_FETCH_MANIFEST = """
WITH inputs AS (
    SELECT unnest($1::text[]) AS workshop_id,
           unnest($2::text[]) AS mod_id
)
SELECT mf.workshop_id, mf.mod_id, mf.rel_path, mf.sha1
FROM mod_files mf
JOIN inputs i
  ON mf.workshop_id = i.workshop_id
 AND mf.mod_id = i.mod_id
"""


async def scan_file_conflicts(conn, mods: List[ModInfo]) -> List[FileConflict]:
    """Report rel_paths provided by ≥2 of `mods` with differing sha1.

    Args:
        conn: object exposing an awaitable `fetch(query, *args)` that
            returns rows indexable by column name.
        mods: mods to compare; list order is treated as load order.

    Returns:
        FileConflict entries sorted by rel_path. Mods that have no rows in
        the manifest table simply contribute nothing, so the caller must
        surface them separately.
    """
    if len(mods) < 2:
        return []

    manifest_rows = await conn.fetch(
        _FETCH_MANIFEST,
        [m.workshop_id or "" for m in mods],
        [m.id for m in mods],
    )

    # mod_id -> slot in the input list (a later duplicate overwrites an earlier one).
    slot_of: Dict[str, int] = {}
    for slot, mod in enumerate(mods):
        slot_of[mod.id] = slot

    # rel_path -> [(slot, mod_id, sha1), ...]
    claims: Dict[str, List[Tuple[int, str, str]]] = defaultdict(list)
    for row in manifest_rows:
        owner = row["mod_id"]
        slot = slot_of.get(owner)
        if slot is not None:
            claims[row["rel_path"]].append((slot, owner, row["sha1"]))

    found: List[FileConflict] = []
    for rel_path, entries in claims.items():
        # A conflict needs two different mods AND two different hashes;
        # identical content from several mods is a harmless duplicate.
        if len({owner for _, owner, _ in entries}) < 2:
            continue
        if len({digest for _, _, digest in entries}) < 2:
            continue

        by_slot = sorted(entries, key=lambda entry: entry[0])
        # dict.fromkeys keeps the first occurrence of each mod_id in order,
        # collapsing repeated rows for the same mod_id.
        providers = list(dict.fromkeys(owner for _, owner, _ in by_slot))
        found.append(
            FileConflict(
                rel_path=rel_path,
                providers=providers,
                winner=providers[-1],
            )
        )

    return sorted(found, key=lambda conflict: conflict.rel_path)
|
||||
@@ -130,6 +130,11 @@ class ModInfo:
|
||||
# signal for build / multiplayer / category detection. Distinct from
|
||||
# `tags` which is mod.info-side (freeform).
|
||||
workshop_tags: List[str] = field(default_factory=list)
|
||||
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
|
||||
# populated by worker.build_manifest_and_types at parse time. Empty when
|
||||
# files_manifest_built=false (older cached rows); derive_category falls
|
||||
# through to the existing cascade in that case.
|
||||
mod_types: List[str] = field(default_factory=list)
|
||||
warnings: Dict[str, List[str]] = field(default_factory=dict)
|
||||
|
||||
|
||||
@@ -389,30 +394,77 @@ def _name_has(name: str, hints: List[str]) -> bool:
|
||||
return any(h in n for h in hints)
|
||||
|
||||
|
||||
# pzmm content-type → sortof CATEGORY_ORDER mapping. "skip" entries fall
|
||||
# through to the existing derive_category cascade. Items/Animations/Lua/Unknown
|
||||
# are too generic; Maps/Sounds/Patch/Vehicles/Clothing duplicate signals already
|
||||
# captured by the cascade but stay here as fallbacks for poorly-tagged mods.
|
||||
_TYPE_TO_CAT: Dict[str, str] = {
    "Maps": "map",
    "Vehicles": "vehicle",
    "Weapons": "weapon",
    "Clothing": "wearable",
    "Traits": "code",
    "Professions": "profession",
    "Recipes": "crafting",
    "Tiles": "tile",
    "Textures": "texture",
    "Sounds": "sound",
    "UI": "ui",
    "Translations": "translation",
    "Patch": "patch",
    "Dependency": "tweaks",
    "Framework": "tweaks",
}


def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """Return the bucket of the first tag in `mod_types` that is mapped.

    Tags missing from `_TYPE_TO_CAT` are skipped. When nothing maps the
    result is None, so the caller can continue with its other heuristics.
    A `vehicle` result becomes `vehicle_spawn` when `name` contains
    "spawn zone" (case-insensitive).
    """
    # map() is lazy, so scanning stops at the first mapped tag.
    bucket = next(
        (hit for hit in map(_TYPE_TO_CAT.get, mod_types) if hit),
        None,
    )
    if bucket is None:
        return None
    if bucket != "vehicle":
        return bucket
    # Vehicle mods named like a spawn-zone pack get the narrower bucket.
    if name and "spawn zone" in name.lower():
        return "vehicle_spawn"
    return "vehicle"
|
||||
|
||||
|
||||
def derive_category(mod: ModInfo) -> str:
|
||||
"""Best-effort category from mod.info + workshop_meta.tags + name.
|
||||
|
||||
Detection order (most specific → least):
|
||||
1. mod.info `category=` if explicit and recognized.
|
||||
2. patch / fix name regex (Spec G-patch).
|
||||
3. library/framework name regex (extends FRAMEWORK_KEYS).
|
||||
4. mod.maps non-empty → map.
|
||||
5. moodle / profession / movement / specific gameplay axes by name.
|
||||
6. Workshop tags (canonical Steam controlled vocab): Audio + 'music' →
|
||||
2. pzmm-style mod_types fingerprint (when files_manifest_built=true).
|
||||
3. patch / fix name regex (Spec G-patch).
|
||||
4. library/framework name regex (extends FRAMEWORK_KEYS).
|
||||
5. mod.maps non-empty → map.
|
||||
6. moodle / profession / movement / specific gameplay axes by name.
|
||||
7. Workshop tags (canonical Steam controlled vocab): Audio + 'music' →
|
||||
music; Audio → sound; Weapons → weapon; Vehicles → vehicle;
|
||||
Clothing/Armor + 'armor' → armor, else wearable; Building →
|
||||
building; Farming → farming; Food → food; Skills → profession
|
||||
(or moodle); Interface → ui; Textures → texture;
|
||||
Language/Translation → translation; QOL → qol; Multiplayer alone
|
||||
→ multiplayer.
|
||||
7. mod.info tags (freeform fallback).
|
||||
8. FRAMEWORK_KEYS substring match → tweaks.
|
||||
9. Default → other.
|
||||
8. mod.info tags (freeform fallback).
|
||||
9. FRAMEWORK_KEYS substring match → tweaks.
|
||||
10. Default → other.
|
||||
"""
|
||||
if mod.category in CATEGORY_ORDER and mod.category != "undefined":
|
||||
return mod.category
|
||||
|
||||
name = mod.name or ""
|
||||
|
||||
# pzmm-style content fingerprint takes precedence over name regex when
|
||||
# available. Empty mod_types means files_manifest_built=false (older
|
||||
# cached row); fall through to existing cascade.
|
||||
if mod.mod_types:
|
||||
cat = _types_to_category(mod.mod_types, name)
|
||||
if cat:
|
||||
return cat
|
||||
|
||||
if name and _PATCH_NAME_RE.search(name):
|
||||
return "patch"
|
||||
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):
|
||||
|
||||
343
docs/plans/2026-05-04-pzmm-conflict-and-typing.md
Normal file
343
docs/plans/2026-05-04-pzmm-conflict-and-typing.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# Plan: pzmm conflict detection + content-type categorization
|
||||
|
||||
**Date:** 2026-05-04
|
||||
**Branch:** `feat/pzmm-conflict-typing`
|
||||
**Status:** Approved (Sam, 2026-05-04)
|
||||
|
||||
**Sources read:**
|
||||
- `/tmp/pzmm-src/pzmm-main/core/scanner.py` — `scan_file_conflicts`, `solve_load_order`, `FileConflict`
|
||||
- `/tmp/pzmm-src/pzmm-main/core/mods.py` — `detect_mod_types`, `ModInfo`
|
||||
- `/tmp/pzmm-src/pzmm-main/core/bundle.py` — debug bundle (read for context, not integrated)
|
||||
- `/opt/sortof/init/01_schema.sql` and migrations 02..08
|
||||
- `/opt/sortof/api/app.py` — `/api/sort`, `_build_result_for_job`, `_row_to_modinfo`
|
||||
- `/opt/sortof/api/mlos_sort.py` — `CATEGORY_ORDER`, `derive_category`
|
||||
- `/opt/sortof/api/adapters.py` — `CAT_MAP`
|
||||
- `/opt/sortof/worker/worker.py` — `process_one`
|
||||
|
||||
**Open questions resolved at approval:**
|
||||
- Manifest scope: walk all `media/` subtrees under the mod_id root, last-wins on duplicate rel_paths, **no per-branch column**.
|
||||
- `mod_files.size_bytes` column: keep.
|
||||
- Module split: `api/diagnostics.py` and `api/categorize.py` are **separate files**.
|
||||
- `/api/conflicts` v1: **bare wsids only**, return HTTP 400 on collection input. Defer async-job/collection-expansion plumbing to a follow-up plan.
|
||||
|
||||
---
|
||||
|
||||
## 1. Context
|
||||
|
||||
pzmm ships two pieces sortof doesn't have today:
|
||||
|
||||
1. **File-conflict detection** — when two mods both ship `media/scripts/items_food.txt` with byte-different content, the later one silently overrides the earlier one at runtime. PZ never reports this; the player only sees the symptom (broken food, duplicate item ids, etc.). pzmm walks each mod's `media/` tree, hashes the conflict-prone extensions (`.lua`, `.txt`, `.xml`, `.json`, `.ini`), and reports rel-paths claimed by ≥2 mods with non-equal content. Sortof currently only detects `mod_id` collisions (one mod_id under multiple wsids). File-level overrides are invisible to us.
|
||||
2. **Content-type detection** — pzmm walks `media/` paths plus the contents of `lua/` and `scripts/*.txt|xml` files to fingerprint what a mod actually ships (Weapons, Vehicles, Maps, Traits, Professions, Recipes, etc.). Sortof's `derive_category` infers category from `workshop_meta.tags` + name regex + `mod.info` tags. Authors who tag poorly (or skip tagging) end up in `other`/`undefined`. Detection from media/ contents is more reliable for those.
|
||||
|
||||
Both pzmm functions assume on-disk media trees. Sortof's worker uses `tempfile.TemporaryDirectory` (`worker/worker.py:472`) — the entire DD extraction is destroyed at the end of `process_one`'s `with` block. **Only `mod.info` (as `raw_mod_info`), discovered map folder names, and a few derived columns persist.**
|
||||
|
||||
This plan keeps the existing model: parse once, serve from DB. We **persist a manifest at parse time**. Re-fetch on demand was rejected — every conflict check would queue N DD pulls, minutes per request, completely unusable.
|
||||
|
||||
We **do not import pzmm's `solve_load_order`**. Sortof's `mlos_sort.py` is strictly more correct (preorder, loadFirst/loadLast tiers, category buckets, patch G-axis, multi-branch picker, addon injection). pzmm's solver is a plain Kahn topo sort with no tie-breakers.
|
||||
|
||||
---
|
||||
|
||||
## 2. Integration A — File conflict detection
|
||||
|
||||
### 2.1 New schema (`init/09_mod_files.sql`)
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS mod_files (
|
||||
workshop_id TEXT NOT NULL,
|
||||
mod_id TEXT NOT NULL,
|
||||
rel_path TEXT NOT NULL, -- lowercased, posix-style, relative to mod_id root
|
||||
sha1 TEXT NOT NULL,
|
||||
size_bytes INTEGER NOT NULL DEFAULT 0,
|
||||
PRIMARY KEY (workshop_id, mod_id, rel_path),
|
||||
FOREIGN KEY (workshop_id, mod_id) REFERENCES mod_parsed (workshop_id, mod_id) ON DELETE CASCADE
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS mod_files_rel_path_idx ON mod_files (rel_path);
|
||||
CREATE INDEX IF NOT EXISTS mod_files_mod_idx ON mod_files (workshop_id, mod_id);
|
||||
```
|
||||
|
||||
Plus additions to `mod_parsed`:
|
||||
|
||||
```sql
|
||||
ALTER TABLE mod_parsed
|
||||
ADD COLUMN IF NOT EXISTS mod_types TEXT[] NOT NULL DEFAULT '{}',
|
||||
ADD COLUMN IF NOT EXISTS files_manifest_built BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
```
|
||||
|
||||
The flag lets `derive_category` and `/api/conflicts` know whether a mod has a manifest yet (graceful degradation while the cache backfills organically).
|
||||
|
||||
### 2.2 Worker changes (`worker/worker.py`)
|
||||
|
||||
In `process_one`, **inside the existing `with tempfile.TemporaryDirectory` block** (after `discover_mod_infos`, before the `with` exits):
|
||||
|
||||
**Single-pass requirement:** the manifest build (Integration A) and `detect_mod_types` content sniffing (Integration B) **share one pass over the tempdir**. No two-pass implementations. The walk reads each file's bytes once: hash → manifest insert; concurrently inspect path + content for type signals. The output is the `mod_files` rows for that mod_id and the ordered `mod_types` list, both committed in the same transaction as the existing `UPSERT_MOD_PARSED`.
|
||||
|
||||
For each `(workshop_id, mod_id)` pair we just upserted:
|
||||
|
||||
1. Compute `mod_id_root`: the directory whose name equals `mod.id`. For B41 (`mods/<modId>/mod.info`) that's `mip.parent`; for B42 (`mods/<modId>/<branch>/mod.info`) that's `mip.parent.parent`. Detect via `mip.parent.name == mod.id`.
|
||||
2. Single recursive walk under `mod_id_root` covering every `media/` subtree (handles B42 `<branch>/media/` + `common/media/` together). For each file:
|
||||
- If suffix matches `_CONFLICT_EXTS = {".lua", ".txt", ".xml", ".json", ".ini"}` (verbatim from pzmm `scanner.py:21`), compute sha1 (chunked reader, mirrors pzmm `_sha1`) and accumulate `(rel_path, sha1, size_bytes)`. **Last-wins** on duplicate rel_paths across branches.
|
||||
- Concurrently, in the same loop, accumulate the path-based signals from pzmm `mods.py:detect_mod_types` (lines 88–115): `Maps`, `Tiles`, `Textures`, `Vehicles`, `Clothing`, `Sounds`, `UI`, `Animations`, `Translations`, `Lua`, plus collected `lua_text_parts` and `script_text_parts` blobs (capped at 60 lua × 64 KB and 80 script × 96 KB per pzmm).
|
||||
3. After the walk, run pzmm's content-blob checks (lines 117–136): weapon/vehicle/item/recipe/clothing/trait/profession signals from concatenated blobs. Resolve to `mod_types` ordered list (lines 138–145).
|
||||
4. DELETE existing `mod_files` rows for `(workshop_id, mod_id)` then bulk INSERT new rows.
|
||||
5. UPSERT `mod_parsed.mod_types` and set `files_manifest_built = true` for the row.
|
||||
|
||||
The whole step adds disk-walk + hashing of small text files only — typical mod has 20–200 files in scope, hashing is cheap (≤100 KB each, sha1 ≈ 500 MB/s). Estimated cost: <500 ms per mod, well under the DD pull cost we're already paying.
|
||||
|
||||
### 2.3 New module: `api/diagnostics.py`
|
||||
|
||||
Port of pzmm `scan_file_conflicts` adapted to read from `mod_files` instead of walking disk:
|
||||
|
||||
```python
|
||||
async def scan_file_conflicts(conn, mods: list[ModInfo]) -> list[FileConflict]:
|
||||
"""For the given (already-loaded) ModInfos, report rel_paths claimed
|
||||
by ≥2 mods with non-equal sha1. Returns list ordered by rel_path."""
|
||||
```
|
||||
|
||||
Implementation:
|
||||
1. `SELECT workshop_id, mod_id, rel_path, sha1 FROM mod_files WHERE (workshop_id, mod_id) IN (...)`.
|
||||
2. Group rows in Python by `rel_path`.
|
||||
3. For each group with ≥2 distinct mods, count distinct sha1s. If >1, emit a `FileConflict`.
|
||||
4. Winner = last in input order (mirrors pzmm's "last in load order wins").
|
||||
|
||||
Dataclass:
|
||||
```python
|
||||
@dataclass
|
||||
class FileConflict:
|
||||
rel_path: str
|
||||
providers: list[str] # mod_ids (not ModInfo, to keep payload small)
|
||||
winner: str # mod_id
|
||||
```
|
||||
|
||||
`pzmm.scanner._CONFLICT_EXTS` filtering happened at manifest-build time, so this read path doesn't need it.
|
||||
|
||||
### 2.4 New endpoint: `POST /api/conflicts`
|
||||
|
||||
Same input shape as `/api/sort`, **bare wsids only** (Q4 resolved):
|
||||
```json
|
||||
{"input": "wsid1;wsid2;wsid3", "rules": "...", "pz_build": "B42"}
|
||||
```
|
||||
|
||||
If `parse_with_collections` returns any `collection_ids`, return HTTP 400 with `detail="conflict scan does not support collection input; resolve via /api/sort first"`.
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"conflicts": [
|
||||
{"rel_path": "media/scripts/items_food.txt",
|
||||
"providers": ["FoodModA", "FoodModB"],
|
||||
"winner": "FoodModB"}
|
||||
],
|
||||
"missing_manifests": ["wsid1", "wsid2"]
|
||||
}
|
||||
```
|
||||
|
||||
`missing_manifests` lists mods we couldn't analyze because `files_manifest_built=false`. The frontend can show a banner ("X mods haven't been re-fetched since this feature shipped — file conflicts unavailable for them"), and re-clicking sort eventually triggers re-parse on workshop updates.
|
||||
|
||||
Reuse path: `_build_result_for_job` already loads ModInfos via `_row_to_modinfo` — the conflicts endpoint follows the same load pattern, then calls `scan_file_conflicts(conn, mods)` instead of `sort_mods`.
|
||||
|
||||
### 2.5 Frontend (out of scope for this plan)
|
||||
|
||||
A follow-up plan can wire a "File conflicts" warnings section. For now `/api/conflicts` is consumable from curl and lays the groundwork.
|
||||
|
||||
---
|
||||
|
||||
## 3. Integration B — Content-type detection feeding category derivation
|
||||
|
||||
### 3.1 Schema additions
|
||||
|
||||
Already covered by §2.1's `mod_parsed` ALTER TABLE (`mod_types` + `files_manifest_built`). One migration file (`init/09_mod_files.sql`) ships both A and B because they share the worker walk.
|
||||
|
||||
### 3.2 Worker changes
|
||||
|
||||
Folded into §2.2's single-pass walk. No additional file I/O.
|
||||
|
||||
### 3.3 New module: `api/categorize.py`
|
||||
|
||||
```python
|
||||
def types_to_category(mod_types: list[str], name: str) -> str | None:
|
||||
"""First mod_type that maps to a sortof CATEGORY_ORDER bucket wins.
|
||||
Returns None if mod_types is empty / Unknown / Dependency-only and we
|
||||
should fall through to the existing derive_category cascade."""
|
||||
```
|
||||
|
||||
### 3.4 Tag→category mapping (explicit)
|
||||
|
||||
| pzmm `mod_type` | sortof `CATEGORY_ORDER` | notes |
|
||||
|---|---|---|
|
||||
| `Maps` | `map` | already covered by `mod.maps non-empty`; types-derived is a fallback |
|
||||
| `Vehicles` | `vehicle` | name regex `"spawn zone"` already routes to `vehicle_spawn` upstream |
|
||||
| `Weapons` | `weapon` | wins over `Items` (pzmm prefers list ordering) |
|
||||
| `Items` | *skip* | too generic — almost every mod has Items; would mis-trigger |
|
||||
| `Clothing` | `wearable` | armor name-hint check still runs after, can override to `armor` |
|
||||
| `Traits` | `code` | no dedicated `trait` bucket; `code` is the gameplay-axis fallback |
|
||||
| `Professions` | `profession` | |
|
||||
| `Recipes` | `crafting` | |
|
||||
| `Tiles` | `tile` | |
|
||||
| `Textures` | `texture` | |
|
||||
| `Sounds` | `sound` | already handled by `Audio` ws_tag; types-derived is a fallback |
|
||||
| `Animations` | *skip* | no bucket; falls through |
|
||||
| `UI` | `ui` | |
|
||||
| `Translations` | `translation` | |
|
||||
| `Lua` | *skip* | too generic; falls through |
|
||||
| `Patch` | `patch` | already detected by `_PATCH_NAME_RE`; types-derived is a fallback |
|
||||
| `Dependency` | `tweaks` | maps to existing `lib` pill |
|
||||
| `Framework` | `tweaks` | same |
|
||||
| `Unknown` | *skip* | falls through |
|
||||
|
||||
"*skip*" means: don't return a category; let `derive_category` continue its cascade.
|
||||
|
||||
### 3.5 `derive_category` integration
|
||||
|
||||
Insert a single new check in `api/mlos_sort.py:derive_category` after the explicit-category early return at line 412, **before** the patch/lib name regex at lines 416–419:
|
||||
|
||||
```python
|
||||
if mod.mod_types:
|
||||
cat = types_to_category(mod.mod_types, name)
|
||||
if cat:
|
||||
return cat
|
||||
```
|
||||
|
||||
`mod.mod_types` is added to the `ModInfo` dataclass (`mlos_sort.py:113`). `_row_to_modinfo` (`api/app.py:176`) is updated to read the new column. **Both `mlos_sort.py` copies must change in lockstep.**
|
||||
|
||||
**Position rationale:** `mod_types` comes from media-content fingerprinting, which is more reliable than a name regex but less reliable than an explicit `category=` field in `mod.info`. So it sits between (1) explicit category and (2) name regex. The patch/lib regexes that come after remain the fallback for true patches/libraries: `detect_mod_types` usually tags those `Patch`/`Dependency` anyway, but when a "patch mod" hasn't shipped enough media to fingerprint, the types check yields no category and the name regex still classifies it.
|
||||
|
||||
Empty `mod_types` (e.g. older rows where `files_manifest_built=false`) means the new check is skipped entirely, and a list containing only skip-types makes it return `None`; in both cases the existing cascade runs unchanged. **Graceful degradation is built in.**
|
||||
|
||||
---
|
||||
|
||||
## 4. Blockers / risks
|
||||
|
||||
### 4.1 Schema migration cost
|
||||
- Current cache: **3,123 `workshop_meta` rows, 3,298 `mod_parsed` rows**.
|
||||
- New `mod_files` rows estimate: median mod ships ~50 conflict-eligible files (light mods 5–10, heavy framework/map mods 200–500). At 50 avg × 3,298 mods = **~165 k rows**. With sha1 (40 chars) + rel_path (avg 80 chars) + overhead ≈ 200 bytes/row, that's ~33 MB before indexes. Postgres handles this trivially.
|
||||
- `ALTER TABLE mod_parsed ADD COLUMN mod_types TEXT[]` and `files_manifest_built BOOLEAN` are additive and metadata-only on Postgres 16 (no rewrite). Instant.
|
||||
|
||||
### 4.2 Backfill feasibility
|
||||
- The `/tmp/sortof_steam_throttle` flock + `/tmp/sortof_steam_cooldown` 1h kill-switch (worker.py — `fetch_required_wsids`) protect us from Steam metadata 429s. **DD itself does not hit the metadata API**; it hits Steam content servers, which are not part of the rate-limited path. So mass re-DD does not trip the cooldown.
|
||||
- Mass re-DD still costs real time: typical DD pull is 20–60 s wall-clock. 3,123 wsids × 30 s avg ÷ 4 drains = **~6.5 hours wall-clock for a full backfill**. Doable but disruptive.
|
||||
- **Recommendation: do not run a bulk backfill.** Let the cache populate organically — every workshop update bumps `time_updated`, which triggers a re-parse and now also a manifest build. The `missing_manifests` field in `/api/conflicts` and the empty-`mod_types` graceful-degrade path together mean the feature works on day 1 (empty results for old rows) and improves as authors push updates.
|
||||
- Per-mod manual trigger pattern still works (operator-only):
|
||||
```sql
|
||||
DELETE FROM mod_parsed WHERE workshop_id='<wsid>';
|
||||
INSERT INTO download_jobs (workshop_id, status) VALUES ('<wsid>','queued');
|
||||
```
|
||||
|
||||
### 4.3 Inline detection at sort time
|
||||
- Rejected. `detect_mod_types` reads up to ~11 MB per mod from disk (lua/script blobs). With the tempdir destroyed (the actual case), we'd need to re-DD inline — minutes per sort.
|
||||
- **All detection runs at parse time** in `process_one`. `derive_category` and `/api/conflicts` are pure DB reads.
|
||||
|
||||
---
|
||||
|
||||
## 5. Files touched (summary)
|
||||
|
||||
**New:**
|
||||
- `init/09_mod_files.sql` — `mod_files` table, `mod_parsed.mod_types`, `mod_parsed.files_manifest_built`
|
||||
- `api/diagnostics.py` — port of `scan_file_conflicts`, `FileConflict` dataclass
|
||||
- `api/categorize.py` — `types_to_category` helper
|
||||
|
||||
**Modified:**
|
||||
- `worker/worker.py` — extend `process_one`'s `with` block: single-pass walk, manifest + detect_mod_types, upsert rows
|
||||
- `worker/worker.py` (top-level) — port `detect_mod_types` from pzmm `mods.py:57–145` (sortof-side copy; do not import from pzmm at runtime)
|
||||
- `api/mlos_sort.py` — add `mod_types: List[str]` to `ModInfo` dataclass; add `mod_types` check at top of `derive_category`
|
||||
- `worker/mlos_sort.py` — mirror the `ModInfo` and `derive_category` change (worker/api dual-edit rule)
|
||||
- `api/app.py` — `_row_to_modinfo` reads new `mod_types` column; `_build_result_for_job` SELECT list adds `mp.mod_types`; register `POST /api/conflicts`
|
||||
|
||||
**Out of scope (deferred to follow-up plan):**
|
||||
- Frontend conflicts panel — `/api/conflicts` endpoint only, no UI
|
||||
- Integration of `pzmm/core/bundle.py` (debug bundle export) — read for context, not ported
|
||||
- Backfill orchestration — relying on organic backfill
|
||||
|
||||
---
|
||||
|
||||
## 6. Rollback
|
||||
|
||||
Before applying the migration:
|
||||
|
||||
```bash
|
||||
# Backup mod_parsed (the only existing table we ALTER)
|
||||
sudo docker exec -i sortof_db pg_dump -U sortof -d sortof -t mod_parsed \
|
||||
> /opt/sortof/backups/mod_parsed-pre-09.sql.$(date +%Y%m%d-%H%M)
|
||||
ls -la /opt/sortof/backups/ | tail -3
|
||||
```
|
||||
|
||||
Down SQL (paste into psql to revert the schema half of this plan):
|
||||
|
||||
```sql
|
||||
DROP TABLE IF EXISTS mod_files;
|
||||
ALTER TABLE mod_parsed
|
||||
DROP COLUMN IF EXISTS mod_types,
|
||||
DROP COLUMN IF EXISTS files_manifest_built;
|
||||
```
|
||||
|
||||
To revert code, `git checkout main` and restart services:
|
||||
```bash
|
||||
sudo systemctl restart sortof-api sortof-drain@1 sortof-drain@2 sortof-drain@3 sortof-drain@4
|
||||
```
|
||||
|
||||
The migration is additive only (new table + new columns with safe defaults), so the rollback is a clean drop. No data is destroyed in `mod_parsed`'s existing columns.
|
||||
|
||||
---
|
||||
|
||||
## 7. Verification
|
||||
|
||||
1. **Migration applies cleanly:**
|
||||
```bash
|
||||
sudo docker exec -i sortof_db psql -U sortof -d sortof < /opt/sortof/init/09_mod_files.sql
|
||||
sudo docker exec -i sortof_db psql -U sortof -d sortof -c "\d mod_files"
|
||||
sudo docker exec -i sortof_db psql -U sortof -d sortof -c "\d mod_parsed" | grep -E "mod_types|files_manifest_built"
|
||||
```
|
||||
|
||||
2. **Compile checks** (after every Python edit):
|
||||
```bash
|
||||
/opt/sortof/api/.venv/bin/python -m py_compile /opt/sortof/api/app.py /opt/sortof/api/mlos_sort.py /opt/sortof/api/diagnostics.py /opt/sortof/api/categorize.py
|
||||
/opt/sortof/worker/.venv/bin/python -m py_compile /opt/sortof/worker/worker.py /opt/sortof/worker/mlos_sort.py
|
||||
cd /opt/sortof/api && .venv/bin/python -c "import app" && echo OK
|
||||
cd /opt/sortof/worker && .venv/bin/python -c "import drain" && echo OK
|
||||
```
|
||||
|
||||
3. **Dual-edit consistency check** (worker/api `mlos_sort.py` lockstep rule):
|
||||
```bash
|
||||
diff /opt/sortof/api/mlos_sort.py /opt/sortof/worker/mlos_sort.py | grep -E "^[<>]" | head -20
|
||||
```
|
||||
Logic must match; only comments / docstrings may differ. If any logic line shows up in the diff, fix the lockstep before continuing.
|
||||
|
||||
4. **Restart services:**
|
||||
```bash
|
||||
sudo systemctl restart sortof-api sortof-drain@1 sortof-drain@2 sortof-drain@3 sortof-drain@4
|
||||
sudo systemctl is-active sortof-api sortof-drain@{1..4}
|
||||
```
|
||||
|
||||
5. **Force a fresh parse on a known multi-file mod and verify manifest:**
|
||||
```bash
|
||||
sudo docker exec -i sortof_db psql -U sortof -d sortof -c \
|
||||
"DELETE FROM mod_parsed WHERE workshop_id='2169435993';
|
||||
INSERT INTO download_jobs (workshop_id, status) VALUES ('2169435993','queued');"
|
||||
sleep 60
|
||||
sudo docker exec -i sortof_db psql -U sortof -d sortof -c \
|
||||
"SELECT mod_id, mod_types, files_manifest_built FROM mod_parsed WHERE workshop_id='2169435993';
|
||||
SELECT count(*) AS file_count FROM mod_files WHERE workshop_id='2169435993';"
|
||||
```
|
||||
Expected: `files_manifest_built=t`, `mod_types` populated, `file_count > 0`.
|
||||
|
||||
6. **Conflict endpoint smoke:**
|
||||
```bash
|
||||
curl -sS -X POST http://100.114.205.53:8801/api/conflicts \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"input":"2169435993;2392709985;2487022075"}' | jq .
|
||||
```
|
||||
Expected: `{"conflicts": [], "missing_manifests": [<wsids without manifests yet>]}`.
|
||||
|
||||
7. **Collection-input rejection (Q4):**
|
||||
```bash
|
||||
curl -sS -i -X POST http://100.114.205.53:8801/api/conflicts \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"input":"https://steamcommunity.com/sharedfiles/filedetails/?id=999999999"}' | head -5
|
||||
```
|
||||
Expected: HTTP 400 with the documented `detail` message (when the URL is detected as a collection ref).
|
||||
|
||||
8. **Category-from-types smoke:**
|
||||
- Find a mod whose Steam tags don't reflect content (e.g. weapon mod tagged only `Realistic`); `/api/sort` currently classifies it as `code` / `other` / `undefined`.
|
||||
- Re-queue it through the new pipeline (delete+insert).
|
||||
- Re-run `/api/sort`; confirm category is now `weapon`.
|
||||
|
||||
9. **Graceful-degradation check:** confirm a mod with `files_manifest_built=false` still sorts correctly through the existing cascade (no exceptions, category falls back to current behavior).
|
||||
34
init/09_mod_files.sql
Normal file
34
init/09_mod_files.sql
Normal file
@@ -0,0 +1,34 @@
|
||||
-- pzmm-conflict-typing migration (Plan: docs/plans/2026-05-04-pzmm-conflict-and-typing.md)
|
||||
--
|
||||
-- Adds:
|
||||
-- - mod_files: per-mod manifest of conflict-eligible asset files (lua/txt/xml/json/ini)
|
||||
-- with sha1 fingerprint. Used by /api/conflicts to flag rel_paths claimed
|
||||
-- by ≥2 mods with non-equal content.
|
||||
-- - mod_parsed.mod_types: ordered tag list from detect_mod_types content fingerprinting
|
||||
-- (Maps, Vehicles, Weapons, Traits, …). Consumed by derive_category.
|
||||
-- - mod_parsed.files_manifest_built: graceful-degradation flag. False until the
|
||||
-- worker has run the single-pass media/ walk that produces both the
|
||||
-- manifest rows and mod_types. Old cached rows stay false until they
|
||||
-- organically re-parse.
|
||||
--
|
||||
-- Backfill strategy: organic only — time_updated bumps trigger re-parse, which
|
||||
-- now also runs the manifest walk. /api/conflicts surfaces missing_manifests so
|
||||
-- the frontend can communicate gaps without lying.
|
||||
|
||||
-- Per-mod manifest of conflict-eligible files. One row per
-- (workshop_id, mod_id, rel_path); rows are removed automatically when the
-- owning mod_parsed row is deleted (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS mod_files (
    workshop_id TEXT    NOT NULL,
    mod_id      TEXT    NOT NULL,
    rel_path    TEXT    NOT NULL,
    sha1        TEXT    NOT NULL,
    size_bytes  INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (workshop_id, mod_id, rel_path),
    FOREIGN KEY (workshop_id, mod_id)
        REFERENCES mod_parsed (workshop_id, mod_id) ON DELETE CASCADE
);

-- Supports lookups by rel_path across mods.
CREATE INDEX IF NOT EXISTS mod_files_rel_path_idx ON mod_files (rel_path);

-- No separate (workshop_id, mod_id) index: the primary-key btree on
-- (workshop_id, mod_id, rel_path) already serves lookups on that leading
-- prefix (including the per-mod DELETE and the FK cascade), so a second
-- index would only add write cost. Drop it where an earlier revision of
-- this migration already created it.
DROP INDEX IF EXISTS mod_files_mod_idx;

-- Additive columns with constant defaults; existing rows read as
-- mod_types = '{}' and files_manifest_built = FALSE until re-parsed.
ALTER TABLE mod_parsed
    ADD COLUMN IF NOT EXISTS mod_types            TEXT[]  NOT NULL DEFAULT '{}',
    ADD COLUMN IF NOT EXISTS files_manifest_built BOOLEAN NOT NULL DEFAULT FALSE;
|
||||
@@ -130,6 +130,11 @@ class ModInfo:
|
||||
# signal for build / multiplayer / category detection. Distinct from
|
||||
# `tags` which is mod.info-side (freeform).
|
||||
workshop_tags: List[str] = field(default_factory=list)
|
||||
# pzmm-style content fingerprint (Maps, Vehicles, Weapons, Traits, …)
|
||||
# populated by worker.build_manifest_and_types at parse time. Empty when
|
||||
# files_manifest_built=false (older cached rows); derive_category falls
|
||||
# through to the existing cascade in that case.
|
||||
mod_types: List[str] = field(default_factory=list)
|
||||
warnings: Dict[str, List[str]] = field(default_factory=dict)
|
||||
|
||||
|
||||
@@ -347,8 +352,15 @@ def load_mods_from_dir(root: Path) -> List[ModInfo]:
|
||||
_PATCH_NAME_RE = re.compile(r"\b(patch|compat|compatibility)\b", re.IGNORECASE)
|
||||
|
||||
|
||||
# Substring-based category hints (kept in sync with api/mlos_sort.py)
|
||||
_LIB_NAME_HINTS = ["library", "libraries", "framework"]
|
||||
# Substring lists used for derive_category name heuristics. Plain substring
|
||||
# matching (vs. \b regex) survives PZ's mishmash of camelCase + underscore
|
||||
# + version-suffix mod names (TrueActions_1.09, TrueMusic, TMMumble, …)
|
||||
# that strict word boundaries fail on. False positives are accepted in
|
||||
# exchange — names containing "music" without being music-related are rare
|
||||
# in PZ.
|
||||
_LIB_NAME_HINTS = [
|
||||
"library", "libraries", "framework",
|
||||
]
|
||||
_LIB_NAME_RE = re.compile(
|
||||
r'(?<![A-Za-z])(?:lib|api|core)(?![A-Za-z])'
|
||||
r'|(?<=[a-z])(?:Lib|API|Core)(?![A-Za-z])',
|
||||
@@ -381,6 +393,43 @@ def _name_has(name: str, hints: List[str]) -> bool:
|
||||
return any(h in n for h in hints)
|
||||
|
||||
|
||||
# pzmm content-type → sortof CATEGORY_ORDER mapping. "skip" entries fall
|
||||
# through to the existing derive_category cascade. Items/Animations/Lua/Unknown
|
||||
# are too generic; Maps/Sounds/Patch/Vehicles/Clothing duplicate signals already
|
||||
# captured by the cascade but stay here as fallbacks for poorly-tagged mods.
|
||||
_TYPE_TO_CAT: Dict[str, str] = {
    "Maps": "map",
    "Vehicles": "vehicle",
    "Weapons": "weapon",
    "Clothing": "wearable",
    "Traits": "code",
    "Professions": "profession",
    "Recipes": "crafting",
    "Tiles": "tile",
    "Textures": "texture",
    "Sounds": "sound",
    "UI": "ui",
    "Translations": "translation",
    "Patch": "patch",
    "Dependency": "tweaks",
    "Framework": "tweaks",
}


def _types_to_category(mod_types: List[str], name: str) -> Optional[str]:
    """Map a content-type list to a single category bucket.

    The first entry of ``mod_types`` that has a mapping in ``_TYPE_TO_CAT``
    decides the result, so the order of ``mod_types`` matters. Entries with
    no mapping are ignored.

    Args:
        mod_types: ordered content-type tags for the mod.
        name: display name of the mod; only consulted to refine ``vehicle``
            into ``vehicle_spawn`` when it contains "spawn zone"
            (case-insensitive). May be empty or None.

    Returns:
        The category string, or None when no entry of ``mod_types`` maps to
        a category, so the caller can fall through to its other heuristics.
    """
    bucket = next(
        (_TYPE_TO_CAT[tag] for tag in mod_types if _TYPE_TO_CAT.get(tag)),
        None,
    )
    if bucket is None:
        return None
    # Same "spawn zone" refinement the name-based vehicle check applies.
    is_spawn_pack = bool(name) and "spawn zone" in name.lower()
    if bucket == "vehicle" and is_spawn_pack:
        return "vehicle_spawn"
    return bucket
|
||||
|
||||
|
||||
def derive_category(mod: ModInfo) -> str:
|
||||
"""Best-effort category from mod.info + workshop_meta.tags + name.
|
||||
Mirrors api/mlos_sort.py; keep both copies in sync.
|
||||
@@ -389,6 +438,15 @@ def derive_category(mod: ModInfo) -> str:
|
||||
return mod.category
|
||||
|
||||
name = mod.name or ""
|
||||
|
||||
# pzmm-style content fingerprint takes precedence over name regex when
|
||||
# available. Empty mod_types means files_manifest_built=false (older
|
||||
# cached row); fall through to existing cascade.
|
||||
if mod.mod_types:
|
||||
cat = _types_to_category(mod.mod_types, name)
|
||||
if cat:
|
||||
return cat
|
||||
|
||||
if name and _PATCH_NAME_RE.search(name):
|
||||
return "patch"
|
||||
if _name_has(name, _LIB_NAME_HINTS) or (name and _LIB_NAME_RE.search(name)):
|
||||
|
||||
220
worker/worker.py
220
worker/worker.py
@@ -19,6 +19,7 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -27,7 +28,7 @@ import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
@@ -208,14 +209,21 @@ def run_depot_downloader(
|
||||
workshop_id: str,
|
||||
output_dir: Path,
|
||||
dd_path: Path,
|
||||
filelist_regex: str = r"regex:.*\.info$",
|
||||
# mod.info / map.info for the existing parse path, plus _CONFLICT_EXTS
|
||||
# (lua/txt/xml/json/ini) for build_manifest_and_types — both the conflict
|
||||
# manifest and pzmm-style content sniffing live on these. Binary assets
|
||||
# (.png/.dds/.bank/.ogg/.wav/.X) are intentionally excluded; their type
|
||||
# signals degrade gracefully via workshop_meta.tags fallback in
|
||||
# mlos_sort.derive_category.
|
||||
filelist_regex: str = r"regex:.*\.(info|lua|txt|xml|json|ini)$",
|
||||
timeout: int = 300,
|
||||
max_attempts: int = 3,
|
||||
backoff_s: float = 2.0,
|
||||
) -> bool:
|
||||
"""
|
||||
Fetch workshop item using DepotDownloader, filtered to .info files only.
|
||||
Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths).
|
||||
Fetch workshop item using DepotDownloader, filtered to mod.info plus
|
||||
conflict-eligible asset files (lua/txt/xml/json/ini). Writes
|
||||
<output_dir>/mods/<mod_id>/mod.info (and the asset tree under media/).
|
||||
Returns True on success.
|
||||
|
||||
Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
|
||||
@@ -308,6 +316,178 @@ def discover_map_folders(mip_parent: Path) -> List[str]:
|
||||
return out
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Single-pass manifest + content-type detection (Plan: 2026-05-04 pzmm port)
|
||||
#
|
||||
# Both Integration A (file-conflict manifest) and Integration B (mod_types
|
||||
# fingerprinting) read the same files from the temp DD extraction. We walk
|
||||
# the mod_id root once, hashing conflict-eligible files into a manifest map
|
||||
# AND collecting pzmm-style content signals into a tag set in the same loop.
|
||||
# No two-pass implementations.
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# File extensions that are hashed into the per-mod conflict manifest.
_CONFLICT_EXTS = {".lua", ".txt", ".xml", ".json", ".ini"}

# Matches paths (relative to media/) that sit inside maps/<MapName>/.
# Map subtrees are namespaced per-mod by directory, so their conflict surface
# is ~zero, and worldmap.xml can be tens of MB. Files under this prefix are
# therefore kept out of the manifest and out of the content-blob sniffing.
# map.info itself still drives `mod.maps` (via discover_map_folders), which
# feeds the existing `mod.maps non-empty -> map` rule in derive_category.
_RE_MAP_SUBTREE = re.compile(r"^maps/[^/]+/")

# Output order for detected content tags, following pzmm's preference: when
# the list is later reduced to one category, the earliest mapped tag wins.
_TYPE_PREFERRED = [
    "Maps",
    "Vehicles",
    "Weapons",
    "Items",
    "Clothing",
    "Traits",
    "Professions",
    "Recipes",
    "Tiles",
    "Textures",
    "Sounds",
    "Animations",
    "UI",
    "Translations",
    "Lua",
    "Patch",
    "Dependency",
    "Framework",
]
|
||||
|
||||
|
||||
def _sha1(path: Path) -> str:
    """Return the hex SHA-1 digest of the file at ``path``.

    The file is read in 128 KiB chunks so large files are never held in
    memory whole. Errors from opening or reading the file propagate to the
    caller.
    """
    digest = hashlib.sha1()
    with path.open("rb") as fh:
        while True:
            block = fh.read(1024 * 128)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _read_small_text(path: Path, limit: int = 256_000) -> str:
    """Return up to ``limit`` bytes of ``path`` as lower-cased text.

    The bytes are decoded as UTF-8 with undecodable sequences dropped.
    Best-effort by design: any failure yields an empty string rather than
    raising.
    """
    try:
        with path.open("rb") as fh:
            head = fh.read(limit)
        text = head.decode("utf-8", errors="ignore")
        return text.lower()
    except Exception:
        return ""
|
||||
|
||||
|
||||
def build_manifest_and_types(
    mip_parent: Path, mod_id: str, raw: str
) -> Tuple[List[Tuple[str, str, int]], List[str]]:
    """Walk the mod_id root once, building the conflict manifest and mod_types.

    Args:
        mip_parent: directory that contains the mod.info being parsed. When
            its parent directory is named "mods" it is used as the mod_id
            root; otherwise its parent is used.
        mod_id: currently unused by the walk.
        raw: raw mod.info text. Lower-cased and substring-scanned for
            patch/dependency hints, and searched for a ``require=`` line.

    Returns:
        manifest_rows: list of (rel_path, sha1, size_bytes) for
            conflict-eligible files (see _CONFLICT_EXTS) outside the per-map
            subtree. rel_path is lowercased, posix-style and prefixed with
            "media/" so it is comparable across mods regardless of branch
            layout. When several media/ trees provide the same rel_path, the
            one visited last by the walk wins. Rows are sorted by rel_path.
        mod_types: detected content tags ordered by _TYPE_PREFERRED, or
            ["Unknown"] when nothing was detected. When no media/ directory
            exists the manifest is empty and mod_types is ["Dependency"] if
            the mod.info text hinted at one, else ["Unknown"].
    """
    if mip_parent.parent.name == "mods":
        modid_root = mip_parent
    else:
        modid_root = mip_parent.parent

    tags: set = set()

    # Cheap substring hints taken from the whole mod.info text.
    name_blob = (raw or "").lower()
    if any(w in name_blob for w in ("compat", "compatibility", "patch", "fix")):
        tags.add("Patch")
    if any(
        w in name_blob
        for w in ("api", "core", "dependency", "framework", "library", "required")
    ):
        tags.add("Dependency")

    manifest_map: Dict[str, Tuple[str, int]] = {}
    script_text_parts: List[str] = []
    lua_text_parts: List[str] = []
    has_media = False

    for media_dir in modid_root.rglob("media"):
        if not media_dir.is_dir():
            continue
        # rglob also yields directories named "media" that live inside
        # another media/ tree. The outer walk already visits their files
        # with the correct rel_path; walking them again would record the
        # same files under a truncated rel_path and fabricate conflicts.
        if "media" in media_dir.relative_to(modid_root).parts[:-1]:
            continue
        has_media = True
        for path in media_dir.rglob("*"):
            if not path.is_file():
                continue
            try:
                rel_below = path.relative_to(media_dir).as_posix().lower()
            except ValueError:
                continue
            suffix = path.suffix.lower()
            in_map_subtree = bool(_RE_MAP_SUBTREE.match(rel_below))

            # Manifest: skip per-map subtree (see _RE_MAP_SUBTREE comment).
            if suffix in _CONFLICT_EXTS and not in_map_subtree:
                rel = "media/" + rel_below
                try:
                    size = path.stat().st_size
                    sha = _sha1(path)
                    manifest_map[rel] = (sha, size)
                except OSError:
                    # Best-effort: an unreadable file is left out of the
                    # manifest rather than failing the whole parse.
                    pass

            # Path-based mod_type signals always fire, even for files inside
            # the per-map subtree. The "Maps" tag is the canonical signal
            # that this mod ships a map and must not be lost when the heavy
            # map content is excluded from the manifest.
            if rel_below.startswith("maps/"):
                tags.add("Maps")
            if rel_below.startswith("texturepacks/") or "tiledefinitions" in rel_below:
                tags.add("Tiles")
            if (rel_below.startswith(("textures/", "models_x/", "models/"))
                    or suffix in {".png", ".dds"}):
                # The workshop poster image alone does not make a texture mod.
                if path.name.lower() != "poster.png":
                    tags.add("Textures")
            if rel_below.startswith(("scripts/vehicles/", "scripts/vehicle")):
                tags.add("Vehicles")
            if rel_below.startswith(("clothing/", "scripts/clothing/")):
                tags.add("Clothing")
            if (rel_below.startswith(("sound/", "sounds/", "fmod/"))
                    or suffix in {".bank", ".ogg", ".wav"}):
                tags.add("Sounds")
            if rel_below.startswith(("ui/", "lua/client/ui/")):
                tags.add("UI")
            if rel_below.startswith(("anims/", "animsets/", "actiongroups/")):
                tags.add("Animations")
            if rel_below.startswith("lua/shared/translate/") or "/translate/" in rel_below:
                tags.add("Translations")

            # Content-blob accumulation: skip the per-map subtree. Reading
            # map-internal lua/scripts would only cost memory for no
            # detection upside. The per-kind caps bound total bytes read.
            if not in_map_subtree:
                if rel_below.startswith("lua/"):
                    tags.add("Lua")
                    if suffix == ".lua" and len(lua_text_parts) < 60:
                        lua_text_parts.append(_read_small_text(path, 64_000))
                if (rel_below.startswith("scripts/") and suffix in {".txt", ".xml"}
                        and len(script_text_parts) < 80):
                    script_text_parts.append(_read_small_text(path, 96_000))

    if not has_media:
        return [], (["Dependency"] if "Dependency" in tags else ["Unknown"])

    script_blob = "\n".join(script_text_parts)
    lua_blob = "\n".join(lua_text_parts)

    # Content-based signals; both blobs are already lower-cased.
    if " type = weapon" in script_blob or "displaycategory = weapon" in script_blob:
        tags.add("Weapons")
    if " vehicle " in script_blob or "module vehicles" in script_blob:
        tags.add("Vehicles")
    if "item " in script_blob:
        tags.add("Items")
    if "recipe " in script_blob or " evolvedrecipe " in script_blob:
        tags.add("Recipes")
    if "bodylocation" in script_blob or "clothingitem" in script_blob:
        tags.add("Clothing")
    if "traitfactory.addtrait" in lua_blob:
        tags.add("Traits")
    if "professionfactory.addprofession" in lua_blob:
        tags.add("Professions")

    # Nothing detected at all, but the mod declares requirements and ships
    # no scripts: treat it as a framework.
    has_require = bool(
        re.search(r"^\s*require\s*=", raw or "", re.IGNORECASE | re.MULTILINE)
    )
    if not tags and has_require and not script_blob:
        tags.add("Framework")

    ordered = [t for t in _TYPE_PREFERRED if t in tags] or ["Unknown"]
    manifest_rows = [
        (rel, sha, size) for rel, (sha, size) in sorted(manifest_map.items())
    ]
    return manifest_rows, ordered
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DB upserts
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -356,8 +536,9 @@ INSERT INTO mod_parsed (
|
||||
requirements, load_after, load_before, incompatible_mods,
|
||||
load_first, load_last, tags, maps,
|
||||
raw_mod_info, version_min, is_addon,
|
||||
mod_types, files_manifest_built,
|
||||
parsed_at_time_updated, parsed_at
|
||||
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now())
|
||||
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$17,$18, now())
|
||||
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
|
||||
name = EXCLUDED.name,
|
||||
category = EXCLUDED.category,
|
||||
@@ -372,10 +553,24 @@ ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
|
||||
raw_mod_info = EXCLUDED.raw_mod_info,
|
||||
version_min = EXCLUDED.version_min,
|
||||
is_addon = EXCLUDED.is_addon,
|
||||
mod_types = EXCLUDED.mod_types,
|
||||
files_manifest_built = EXCLUDED.files_manifest_built,
|
||||
parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
|
||||
parsed_at = now();
|
||||
"""
|
||||
|
||||
DELETE_MOD_FILES = """
|
||||
DELETE FROM mod_files WHERE workshop_id = $1 AND mod_id = $2;
|
||||
"""
|
||||
|
||||
INSERT_MOD_FILE = """
|
||||
INSERT INTO mod_files (workshop_id, mod_id, rel_path, sha1, size_bytes)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
ON CONFLICT (workshop_id, mod_id, rel_path) DO UPDATE SET
|
||||
sha1 = EXCLUDED.sha1,
|
||||
size_bytes = EXCLUDED.size_bytes;
|
||||
"""
|
||||
|
||||
# Description-text heuristic for "this mod is an optional add-on to the
|
||||
# primary mod published by the same wsid". Matches:
|
||||
# "Optional add-on: removes ..." (TMMumble)
|
||||
@@ -487,6 +682,10 @@ async def process_one(
|
||||
if mod is None:
|
||||
continue
|
||||
maps = discover_map_folders(mip.parent)
|
||||
# Single-pass walk under the mod_id root: produces both the
|
||||
# conflict manifest and the pzmm-style mod_types list. See
|
||||
# build_manifest_and_types.
|
||||
manifest_rows, mod_types = build_manifest_and_types(mip.parent, mod.id, raw)
|
||||
# Evict any other wsid's claim on this mod_id before we install
|
||||
# ours. Cache invariant: at most one wsid per mod_id, with the
|
||||
# most-recent pull winning.
|
||||
@@ -508,8 +707,19 @@ async def process_one(
|
||||
raw,
|
||||
extract_version_min(raw),
|
||||
detect_is_addon(raw),
|
||||
mod_types,
|
||||
True, # files_manifest_built
|
||||
time_updated,
|
||||
)
|
||||
# Replace any stale manifest rows for this (workshop_id, mod_id)
|
||||
# so a re-parse can't leave behind orphans from a prior layout.
|
||||
await conn.execute(DELETE_MOD_FILES, workshop_id, mod.id)
|
||||
if manifest_rows:
|
||||
await conn.executemany(
|
||||
INSERT_MOD_FILE,
|
||||
[(workshop_id, mod.id, rel, sha, size)
|
||||
for rel, sha, size in manifest_rows],
|
||||
)
|
||||
seen_mod_ids.append(mod.id)
|
||||
|
||||
# Drop rows for mods that no longer exist in this workshop item
|
||||
|
||||
Reference in New Issue
Block a user