# File: sortof/worker/worker.py (587 lines, 21 KiB, Python)

"""
worker.py - pzsort cache filler
Single-shot CLI that takes Steam Workshop IDs on argv, refreshes metadata
from Steam's anonymous API, and only runs DepotDownloader for cache misses
(where workshop_meta.time_updated has changed since last parse).
Usage:
python3 worker.py <workshop_id> [<workshop_id> ...]
python3 worker.py --force <workshop_id> ... # ignore cache, re-download
Env (or .env file):
DATABASE_URL postgresql://pzsort:<pw>@127.0.0.1:5439/pzsort
DD_PATH path to DepotDownloader executable
PZ_APP_ID 108600 (default)
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional
import asyncpg
import httpx
# Reuse the parser from the sorter
sys.path.insert(0, str(Path(__file__).parent))
from mlos_sort import parse_mod_info, ModInfo # noqa: E402
# Steam app id that workshop items must belong to; passed to DepotDownloader
# as `-app` and compared against each item's consumer_app_id.
# Overridable via the PZ_APP_ID environment variable.
PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600"))
# Default location of the DepotDownloader executable (env: DD_PATH); used as
# the default for the --dd-path CLI option.
DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader")
# Endpoint that fetch_workshop_details POSTs the workshop ids to.
STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
# -----------------------------------------------------------------------------
# Steam API
# -----------------------------------------------------------------------------
def fetch_workshop_details(workshop_ids: List[str]) -> Dict[str, dict]:
    """
    Look up workshop items through the legacy GetPublishedFileDetails endpoint.

    The request is an anonymous form POST (no API key). An empty id list
    short-circuits to an empty dict without any network traffic.

    Returns a mapping of ``publishedfileid`` -> the raw detail dict that Steam
    returned for it. Raises ``httpx.HTTPStatusError`` on a non-2xx response.
    """
    if not workshop_ids:
        return {}

    # Form layout: itemcount=N, publishedfileids[0]=..., publishedfileids[1]=...
    form: Dict[str, str] = {
        "itemcount": str(len(workshop_ids)),
        **{f"publishedfileids[{idx}]": wsid for idx, wsid in enumerate(workshop_ids)},
    }

    with httpx.Client(timeout=30.0) as client:
        response = client.post(STEAM_API, data=form)
        response.raise_for_status()
        payload = response.json()

    entries = payload.get("response", {}).get("publishedfiledetails", [])
    return {entry["publishedfileid"]: entry for entry in entries}
def flatten_tags(detail: dict) -> List[str]:
    """Return the non-empty ``tag`` strings from ``detail["tags"]``, in order.

    Entries without a ``tag`` key (or with an empty one) are skipped; a detail
    dict without ``tags`` yields an empty list.
    """
    names: List[str] = []
    for entry in detail.get("tags", []):
        name = entry.get("tag")
        if name:
            names.append(name)
    return names
# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API
# does NOT return `children` for individual mods (only collections), so to
# learn a mod's "Required Items" we have to scrape the public HTML page.
# URL template for an item's public page; `{wsid}` is filled in by
# fetch_required_wsids.
_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}"
# Captures the inner HTML of the <div id="RequiredItems"> container. The match
# is non-greedy and ends at the first `</div>` followed (after optional
# whitespace) by another `</div>`; DOTALL lets it span multiple lines.
_RE_REQUIRED_BLOCK = re.compile(
    r'<div[^>]*id="RequiredItems"[^>]*>(.*?)</div>\s*</div>',
    re.DOTALL,
)
# Pulls the numeric workshop id out of each `filedetails/?id=<digits>` link.
_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)')
# ── rate-limit safety for Steam HTML scraping ─────────────────────────────
# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests;
# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a
# subsequent single curl probe still got 429. Two file-locked, multi-process
# safeguards now sit in front of every scrape:
#
# 1. THROTTLE FILE — records the timestamp of the last attempted scrape.
# Every worker waits via flock until at least
# `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one.
# Serializes 4 concurrent drain processes so they can't burst.
#
# 2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write
# `now() + _COOLDOWN_S` here. While active, every fetch returns None
# instantly without touching Steam, preserving cached values until the
# IP block ages out.
#
# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429
# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN.
import fcntl as _fcntl
# Holds the epoch timestamp of the most recent scrape attempt; read and
# rewritten under flock by _throttle_scrape.
_THROTTLE_FILE = "/tmp/sortof_steam_throttle"
# Holds the epoch timestamp until which scraping is suspended after a hard 429.
_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown"
# Minimum seconds between two scrapes (env: SORTOF_STEAM_MIN_INTERVAL).
_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6"))
# Length in seconds of the cooldown armed after a final 429
# (env: SORTOF_STEAM_COOLDOWN).
_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600"))
def _read_cooldown_until() -> float:
    """Return the epoch second stored in the cooldown file.

    A missing, unreadable, empty, or unparseable file is reported as 0.0,
    i.e. "no cooldown recorded".
    """
    try:
        text = Path(_COOLDOWN_FILE).read_text().strip()
        return float(text) if text else float(0)
    except (OSError, ValueError):
        return 0.0
def _write_cooldown_until(epoch_s: float) -> None:
    """Persist `epoch_s` as the time until which Steam scraping is suspended.

    The value is written to a per-process temp file and moved into place with
    os.replace, so a concurrent _read_cooldown_until never observes a
    truncated or half-written file (which it would interpret as "no
    cooldown"). Still best-effort: a failure is reported on stderr and
    swallowed, because failing to arm the cooldown must not abort the caller.
    """
    tmp_path = f"{_COOLDOWN_FILE}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "w") as f:
            f.write(str(epoch_s))
        os.replace(tmp_path, _COOLDOWN_FILE)
    except OSError as e:
        # Make the failure visible: without the file, other workers will keep
        # hitting Steam while the IP is blocked.
        print(f" ! could not persist Steam cooldown to {_COOLDOWN_FILE}: {e}",
              file=sys.stderr)
        try:
            os.unlink(tmp_path)
        except OSError:
            pass  # temp file was never created or is already gone
def _throttle_scrape() -> None:
    """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the
    last scrape by ANY drain process (multi-process safe via flock).

    The exclusive lock is held while sleeping on purpose: that is what
    serializes concurrent workers. The new timestamp is flushed to the file
    BEFORE the lock is released; otherwise the next lock holder could read the
    just-truncated (empty) file and skip its wait entirely.
    """
    import time as _t
    Path(_THROTTLE_FILE).touch(exist_ok=True)
    with open(_THROTTLE_FILE, "r+") as f:
        _fcntl.flock(f.fileno(), _fcntl.LOCK_EX)
        try:
            f.seek(0)
            raw = f.read().strip()
            try:
                last = float(raw) if raw else 0.0
            except ValueError:
                # Corrupt contents: behave as if no scrape was ever recorded.
                last = 0.0
            now = _t.time()
            # Never wait longer than one interval, even if the stored
            # timestamp lies in the future (clock step, garbage value).
            wait = min(_MIN_SCRAPE_INTERVAL_S - (now - last), _MIN_SCRAPE_INTERVAL_S)
            if wait > 0:
                _t.sleep(wait)
                now = _t.time()
            f.seek(0)
            f.truncate()
            f.write(str(now))
            # Push the buffered write to the file while we still own the lock.
            f.flush()
        finally:
            _fcntl.flock(f.fileno(), _fcntl.LOCK_UN)
def fetch_required_wsids(
    workshop_id: str,
    timeout: int = 15,
    max_attempts: int = 4,
    backoff_429: float = 30.0,
) -> Optional[List[str]]:
    """Scrape the public Workshop page for Required Items wsids.

    Every HTTP attempt, including retries after a 429, first re-checks the
    shared cooldown file and then passes through the shared throttle, so a
    retry can neither burst alongside another worker nor hit Steam after a
    different worker has already armed the cooldown.

    Args:
        workshop_id: the item whose page is scraped.
        timeout: per-request httpx timeout in seconds.
        max_attempts: total number of requests to try when Steam answers 429.
        backoff_429: base sleep in seconds after a 429; multiplied by the
            attempt number.

    Returns
        None — fetch/parse error, persistent 429, or active cooldown.
               Caller MUST NOT overwrite the existing cached value.
        []   — page loaded successfully but has no required items section.
        list — required item wsids in declaration order, deduped.
    """
    import time as _time
    url = _WORKSHOP_PAGE_URL.format(wsid=workshop_id)
    html: Optional[str] = None
    for attempt in range(1, max_attempts + 1):
        cooldown_until = _read_cooldown_until()
        if cooldown_until and _time.time() < cooldown_until:
            return None  # Steam recently 429'd us — back off entirely.
        _throttle_scrape()
        try:
            with httpx.Client(timeout=timeout, follow_redirects=True) as client:
                r = client.get(url)
                if r.status_code == 429:
                    if attempt < max_attempts:
                        # Linear backoff: 1x, 2x, 3x ... the base delay.
                        _time.sleep(backoff_429 * attempt)
                        continue
                    # Final 429 → arm the global cooldown so other workers
                    # (and this one's next call) skip Steam entirely.
                    _write_cooldown_until(_time.time() + _COOLDOWN_S)
                    print(f" ! required_wsids 429 (gave up) for {workshop_id}; "
                          f"cooldown {int(_COOLDOWN_S)}s armed", file=sys.stderr)
                    return None
                r.raise_for_status()
                html = r.text
                break
        except (httpx.HTTPError, httpx.TimeoutException) as e:
            print(f" ! required_wsids fetch failed for {workshop_id}: {e}",
                  file=sys.stderr)
            return None
    if html is None:
        return None
    m = _RE_REQUIRED_BLOCK.search(html)
    if not m:
        return []
    # Keep first occurrence of each id and drop self-references.
    seen: set = set()
    out: List[str] = []
    for w in _RE_REQUIRED_LINK.findall(m.group(1)):
        if w not in seen and w != workshop_id:
            seen.add(w)
            out.append(w)
    return out
# -----------------------------------------------------------------------------
# DepotDownloader
# -----------------------------------------------------------------------------
def run_depot_downloader(
    workshop_id: str,
    output_dir: Path,
    dd_path: Path,
    filelist_regex: str = r"regex:.*\.info$",
    timeout: int = 300,
    max_attempts: int = 3,
    backoff_s: float = 2.0,
) -> bool:
    """
    Fetch workshop item using DepotDownloader, filtered to .info files only.
    Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths).
    Returns True on success, False otherwise (never raises for a failed or
    unlaunchable DepotDownloader).

    Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
    CDN occasionally flakes on the manifest fetch and a fresh DD invocation
    typically succeeds. Caller is also free to retry at a higher level
    (drain.py's MAX_ATTEMPTS), but in-process retry avoids the full re-claim
    cycle for the common transient case.

    A failure to start the executable at all (missing file, no execute
    permission) is not transient, so it is reported once and returns False
    without retrying.
    """
    import time as _time
    output_dir.mkdir(parents=True, exist_ok=True)
    # DepotDownloader reads its include filter from a file, one pattern per line.
    filelist = output_dir / "_filelist.txt"
    filelist.write_text(filelist_regex + "\n", encoding="utf-8")
    cmd = [
        str(dd_path),
        "-app", str(PZ_APP_ID),
        "-pubfile", workshop_id,
        "-filelist", str(filelist),
        "-dir", str(output_dir),
    ]
    last_err = ""
    for attempt in range(1, max_attempts + 1):
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Tool output is only used for diagnostics; never let an
                # undecodable byte abort the download.
                errors="replace",
                timeout=timeout,
                check=False,
            )
        except subprocess.TimeoutExpired:
            last_err = "timeout"
            print(f" ! DepotDownloader timeout for {workshop_id} (attempt {attempt}/{max_attempts})",
                  file=sys.stderr)
        except OSError as e:
            print(f" !! DepotDownloader could not be started for {workshop_id}: {e}",
                  file=sys.stderr)
            return False
        else:
            if proc.returncode == 0:
                if attempt > 1:
                    print(f" ✓ DepotDownloader recovered for {workshop_id} on attempt {attempt}",
                          file=sys.stderr)
                return True
            last_err = f"rc={proc.returncode}"
            print(f" ! DepotDownloader rc={proc.returncode} for {workshop_id} "
                  f"(attempt {attempt}/{max_attempts})", file=sys.stderr)
            # Show the tail of whichever stream carries the diagnostics.
            print(proc.stderr[-500:] if proc.stderr else proc.stdout[-500:], file=sys.stderr)
        if attempt < max_attempts:
            _time.sleep(backoff_s)
    print(f" !! DepotDownloader gave up on {workshop_id} after {max_attempts} attempts (last: {last_err})",
          file=sys.stderr)
    return False
def discover_mod_infos(output_dir: Path) -> List[Path]:
    """Find all mod.info files. Two layouts coexist in the wild:
        B41: mods/<mod_id>/mod.info
        B42: mods/<mod_id>/<gameVersion>/mod.info   e.g. mods/Foo/42/mod.info
    A single mod can ship both. UPSERT on (workshop_id, mod_id) collapses
    duplicates, so the order returned here decides which variant is written
    last and therefore wins.

    Within one mod folder the order is:
        1. numeric version branches, compared number-by-number so that
           42.2 < 42.13 (a plain string sort would put 42.13 first);
        2. any other branch folder, by name;
        3. the root-level (B41) mod.info.
    So the B41 variant wins when present, the highest-numbered B42 variant
    otherwise. Mod folders themselves are ordered by name.
    """
    found = list(output_dir.glob("mods/*/mod.info"))
    found.extend(output_dir.glob("mods/*/*/mod.info"))

    def _order(path: Path):
        # rel is ('mods', <mod_dir>, 'mod.info') or
        # ('mods', <mod_dir>, <branch>, 'mod.info').
        rel = path.relative_to(output_dir).parts
        mod_dir = rel[1]
        if len(rel) == 3:
            return (mod_dir, 2, (), "")
        branch = rel[2]
        pieces = branch.split(".")
        if all(piece.isdecimal() for piece in pieces):
            return (mod_dir, 0, tuple(int(piece) for piece in pieces), branch)
        return (mod_dir, 1, (), branch)

    return sorted(found, key=_order)
def discover_map_folders(mip_parent: Path) -> List[str]:
    """List the map folder names shipped by the mod whose mod.info sits in
    `mip_parent`.

    Layouts that have to be handled:
        B41:  mods/<modId>/mod.info
              mods/<modId>/media/maps/<x>/map.info
        B42:  mods/<modId>/<branch>/mod.info   (branch e.g. '42', '42.13')
              mods/<modId>/<branch>/media/maps/<x>/map.info
        B42 split: mod.info under '42/' while the map data lives under a
              sibling 'common/' branch (seen in Project RV Interior).
    Because of the split layout we always climb to the mod-id root and scan
    the root plus every branch below it.

    Returns folder names in sorted-path order with duplicates removed
    (first occurrence kept).
    """
    # mods/<modId>          -> already at the mod-id root
    # mods/<modId>/<branch> -> step up one level
    at_root = mip_parent.parent.name == "mods"
    modid_root = mip_parent if at_root else mip_parent.parent

    map_infos = [
        *modid_root.glob("media/maps/*/map.info"),
        *modid_root.glob("*/media/maps/*/map.info"),
    ]
    map_infos.sort()

    # dict preserves insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(info.parent.name for info in map_infos))
# -----------------------------------------------------------------------------
# DB upserts
# -----------------------------------------------------------------------------
# Insert-or-update one workshop_meta row keyed by workshop_id.
# Takes 12 positional parameters in the column order listed below;
# last_checked_at is always stamped with now() on both insert and update.
UPSERT_WORKSHOP_META = """
INSERT INTO workshop_meta (
workshop_id, title, description, tags, creator_steamid,
time_created, time_updated, file_size, preview_url,
consumer_app_id, visibility, banned, last_checked_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12, now())
ON CONFLICT (workshop_id) DO UPDATE SET
title = EXCLUDED.title,
description = EXCLUDED.description,
tags = EXCLUDED.tags,
creator_steamid = EXCLUDED.creator_steamid,
time_created = EXCLUDED.time_created,
time_updated = EXCLUDED.time_updated,
file_size = EXCLUDED.file_size,
preview_url = EXCLUDED.preview_url,
consumer_app_id = EXCLUDED.consumer_app_id,
visibility = EXCLUDED.visibility,
banned = EXCLUDED.banned,
last_checked_at = now();
"""
# Parameters: $1 = workshop_id that is about to claim the mod, $2 = mod_id.
# Deletes every mod_parsed row holding the same mod_id under a different
# workshop_id and logs one mod_id_conflicts row per deleted claim; an already
# recorded (mod_id, evicting, evicted) triple only gets recorded_at refreshed.
EVICT_AND_RECORD_CONFLICT = """
-- Per the cache invariant: a mod_id is owned by exactly one wsid at a time.
-- When we're about to UPSERT (wsid, mod_id), evict any (other_wsid, mod_id)
-- claims so the new pull becomes canonical, and record the eviction in
-- mod_id_conflicts so /api/sort can warn users who paste the displaced wsid.
WITH evicted AS (
DELETE FROM mod_parsed
WHERE mod_id = $2 AND workshop_id <> $1
RETURNING workshop_id
)
INSERT INTO mod_id_conflicts (mod_id, evicting_wsid, evicted_wsid)
SELECT $2, $1, workshop_id FROM evicted
ON CONFLICT (mod_id, evicting_wsid, evicted_wsid)
DO UPDATE SET recorded_at = now();
"""
# Insert-or-update one mod_parsed row keyed by (workshop_id, mod_id).
# Takes 16 positional parameters in the column order listed below; $16 is the
# workshop item's time_updated at parse time (the value the freshness check
# compares against). parsed_at is always stamped with now().
UPSERT_MOD_PARSED = """
INSERT INTO mod_parsed (
workshop_id, mod_id, name, category,
requirements, load_after, load_before, incompatible_mods,
load_first, load_last, tags, maps,
raw_mod_info, version_min, is_addon,
parsed_at_time_updated, parsed_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now())
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
name = EXCLUDED.name,
category = EXCLUDED.category,
requirements = EXCLUDED.requirements,
load_after = EXCLUDED.load_after,
load_before = EXCLUDED.load_before,
incompatible_mods = EXCLUDED.incompatible_mods,
load_first = EXCLUDED.load_first,
load_last = EXCLUDED.load_last,
tags = EXCLUDED.tags,
maps = EXCLUDED.maps,
raw_mod_info = EXCLUDED.raw_mod_info,
version_min = EXCLUDED.version_min,
is_addon = EXCLUDED.is_addon,
parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
parsed_at = now();
"""
# Description-text heuristic for "this mod is an optional add-on to the
# primary mod published by the same wsid". Matches:
# "Optional add-on: removes ..." (TMMumble)
# "optional addon ..."
# "Optional add on ..."
# Strict "optional + add-on" keyword pair to avoid false positives on
# generic "addon" naming. Author-driven signal — set via the description=
# field of mod.info.
# Case-insensitive; looks for "optional add-on" / "optional addon" /
# "optional add on" on the same line as a `description =` assignment.
_RE_OPTIONAL_ADDON = re.compile(
    r"description\s*=\s*[^\r\n]*\bOptional\s+Add[- ]?on\b",
    re.IGNORECASE,
)


def detect_is_addon(raw: str) -> bool:
    """Tell whether a mod.info text marks the mod as an optional add-on.

    True when the `description=` line contains the "Optional add-on" phrase
    (any of its spellings); False otherwise, including for empty or None input.
    """
    text = raw if raw else ""
    return _RE_OPTIONAL_ADDON.search(text) is not None
# Parameters: $1 = workshop_id, $2 = text[] of mod_ids to keep.
# Removes every mod_parsed row of that workshop item whose mod_id is not in $2.
DELETE_STALE_MOD_PARSED = """
DELETE FROM mod_parsed
WHERE workshop_id = $1 AND mod_id <> ALL($2::text[]);
"""
# Parameters: $1 = workshop_id, $2 = time_updated reported by Steam.
# Returns the mod_ids already parsed at exactly that time_updated; any row
# returned is treated as a cache hit by process_one.
CHECK_PARSED_FRESH = """
SELECT mod_id FROM mod_parsed
WHERE workshop_id = $1 AND parsed_at_time_updated = $2;
"""
def extract_version_min(raw: str) -> Optional[str]:
    """Return the value of the first `versionMin=` line of a mod.info text.

    The key is matched exactly (case-insensitive, surrounding whitespace
    ignored), so unrelated keys that merely start with "versionmin" and lines
    without an `=` are skipped instead of ending the search.

    Returns None when there is no such key, when its value is empty, or when
    `raw` is empty/None.
    """
    for line in (raw or "").splitlines():
        key, sep, value = line.partition("=")
        if sep and key.strip().lower() == "versionmin":
            return value.strip() or None
    return None
# -----------------------------------------------------------------------------
# Main flow
# -----------------------------------------------------------------------------
async def process_one(
    conn: asyncpg.Connection,
    workshop_id: str,
    detail: dict,
    dd_path: Path,
    force: bool,
) -> str:
    """Refresh the cache for one workshop item.

    Returns 'hit' | 'refreshed' | 'banned' | 'missing' | 'no_mod_info' | 'failed'.

    'no_mod_info' = DepotDownloader succeeded but the workshop item contained
    no parseable mod.info file (typical for collections, art-only items, and
    other non-mod uploads that share the PZ consumer_app_id). Distinct from
    'failed' (DD itself errored), so the API can surface "this isn't a mod"
    differently from "we couldn't fetch this." This covers both "no mod.info
    file at all" and "files present but none could be parsed".

    Args:
        conn: open asyncpg connection used for all reads and writes.
        workshop_id: the item being processed.
        detail: the item's entry from GetPublishedFileDetails.
        dd_path: DepotDownloader executable.
        force: skip the freshness check and always re-download.

    All mod_parsed changes for the item (evictions, upserts, stale-row
    cleanup) are applied in a single transaction, so a failure part-way
    through cannot leave another item's rows evicted without a replacement.
    """
    # Pre-flight: bad results
    if detail.get("result") != 1:
        return "missing"
    if detail.get("banned"):
        return "banned"
    if detail.get("consumer_app_id") != PZ_APP_ID:
        return "failed"  # wrong app

    # `or 0` / `or ""` guard against keys that are present but null.
    time_updated = int(detail.get("time_updated") or 0)

    # Always refresh meta (cheap)
    await conn.execute(
        UPSERT_WORKSHOP_META,
        workshop_id,
        detail.get("title", ""),
        detail.get("description", "") or "",
        flatten_tags(detail),
        str(detail.get("creator") or "") or None,
        int(detail.get("time_created") or 0) or None,
        time_updated,
        int(detail.get("file_size") or 0) or None,
        detail.get("preview_url"),
        detail.get("consumer_app_id"),
        detail.get("visibility"),
        bool(detail.get("banned", False)),
    )

    # Cache check
    if not force:
        rows = await conn.fetch(CHECK_PARSED_FRESH, workshop_id, time_updated)
        if rows:
            return "hit"

    # Cache miss → download + parse. Everything that needs the files on disk
    # happens inside the temp dir; DB writes follow afterwards.
    parsed: List[tuple] = []
    with tempfile.TemporaryDirectory(prefix=f"pzsort_{workshop_id}_") as tmpdir:
        tmp = Path(tmpdir)
        # Blocking subprocess: run it off the event loop, same as the scrape below.
        ok = await asyncio.to_thread(run_depot_downloader, workshop_id, tmp, dd_path)
        if not ok:
            return "failed"
        mod_info_paths = discover_mod_infos(tmp)
        if not mod_info_paths:
            print(f" ! no mod.info found in {workshop_id}", file=sys.stderr)
            return "no_mod_info"
        for mip in mod_info_paths:
            raw = mip.read_text(encoding="utf-8", errors="replace")
            mod = parse_mod_info(raw, workshop_id=workshop_id)
            if mod is None:
                continue
            parsed.append((mod, raw, discover_map_folders(mip.parent)))

    if not parsed:
        print(f" ! no parseable mod.info in {workshop_id}", file=sys.stderr)
        return "no_mod_info"

    async with conn.transaction():
        for mod, raw, maps in parsed:
            # Evict any other wsid's claim on this mod_id before we install
            # ours. Cache invariant: at most one wsid per mod_id, with the
            # most-recent pull winning.
            await conn.execute(EVICT_AND_RECORD_CONFLICT, workshop_id, mod.id)
            await conn.execute(
                UPSERT_MOD_PARSED,
                workshop_id,
                mod.id,
                mod.name,
                mod.category,
                mod.requirements,
                mod.loadAfter,
                mod.loadBefore,
                mod.incompatibleMods,
                mod.loadFirst,
                mod.loadLast,
                mod.tags,
                maps,
                raw,
                extract_version_min(raw),
                detect_is_addon(raw),
                time_updated,
            )
        # Drop rows for mods that no longer exist in this workshop item
        seen_mod_ids = list(dict.fromkeys(mod.id for mod, _raw, _maps in parsed))
        await conn.execute(DELETE_STALE_MOD_PARSED, workshop_id, seen_mod_ids)

    # Scrape the public Workshop page for the "Required Items" section so the
    # API can auto-resolve missing-dep warnings against this mod's declared
    # Steam-side dependencies. Best-effort: None on fetch error → leave the
    # existing cached value; [] or list → overwrite.
    required = await asyncio.to_thread(fetch_required_wsids, workshop_id)
    if required is not None:
        await conn.execute(
            """
            UPDATE workshop_meta
            SET required_wsids = $1, required_scraped_at = now()
            WHERE workshop_id = $2
            """,
            required, workshop_id,
        )
    return "refreshed"
async def main_async(workshop_ids: List[str], dd_path: Path, force: bool, dsn: str) -> int:
    """Fetch Steam metadata for all ids, process each one, print a summary.

    Args:
        workshop_ids: ids given on the command line, processed in order.
        dd_path: DepotDownloader executable.
        force: ignore the parse cache and always re-download.
        dsn: PostgreSQL connection string.

    Returns the process exit code: 0 when no item ended as 'failed', else 1.
    """
    print(f"[steam] fetching metadata for {len(workshop_ids)} item(s)")
    details = fetch_workshop_details(workshop_ids)
    missing_from_steam = [w for w in workshop_ids if w not in details]
    if missing_from_steam:
        print(f"[steam] no detail returned for: {missing_from_steam}", file=sys.stderr)
    # One counter per status process_one can return. 'no_mod_info' must be
    # present: without it the first non-mod item raised KeyError below.
    summary: Dict[str, int] = {
        "hit": 0,
        "refreshed": 0,
        "banned": 0,
        "missing": 0,
        "no_mod_info": 0,
        "failed": 0,
    }
    conn = await asyncpg.connect(dsn=dsn)
    try:
        for wid in workshop_ids:
            detail = details.get(wid)
            if detail is None:
                summary["missing"] += 1
                print(f" - {wid} -> missing (no Steam response)")
                continue
            status = await process_one(conn, wid, detail, dd_path, force)
            # .get keeps the tally working even if a new status is added later.
            summary[status] = summary.get(status, 0) + 1
            print(f" - {wid} -> {status}")
    finally:
        await conn.close()
    print(f"[done] {summary}")
    return 0 if summary["failed"] == 0 else 1
def main():
    """CLI entry point: parse argv, validate inputs, run the async driver.

    Exits with status 2 when no DSN is available or the DepotDownloader path
    is not a file; otherwise exits with the code returned by main_async.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("workshop_ids", nargs="+")
    parser.add_argument("--force", action="store_true", help="ignore cache, always re-download")
    parser.add_argument("--dd-path", default=DEFAULT_DD_PATH)
    parser.add_argument("--dsn", default=os.environ.get("DATABASE_URL"))
    opts = parser.parse_args()

    if not opts.dsn:
        print("ERROR: --dsn or DATABASE_URL required", file=sys.stderr)
        sys.exit(2)

    dd_binary = Path(opts.dd_path)
    if not dd_binary.is_file():
        print(f"ERROR: DepotDownloader not found at {dd_binary}", file=sys.stderr)
        sys.exit(2)

    exit_code = asyncio.run(
        main_async(opts.workshop_ids, dd_binary, opts.force, opts.dsn)
    )
    sys.exit(exit_code)


if __name__ == "__main__":
    main()