sortof/worker/worker.py

"""
worker.py - pzsort cache filler

Single-shot CLI that takes Steam Workshop IDs on argv, refreshes metadata
from Steam's anonymous API, and only runs DepotDownloader for cache misses
(where workshop_meta.time_updated has changed since last parse).

Usage:
    python3 worker.py <workshop_id> [<workshop_id> ...]
    python3 worker.py --force <workshop_id> ...    # ignore cache, re-download

Env (or .env file):
    DATABASE_URL  postgresql://pzsort:<pw>@127.0.0.1:5439/pzsort
    DD_PATH       path to DepotDownloader executable
    PZ_APP_ID     108600 (default)
"""

from __future__ import annotations

import argparse
import asyncio
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
from typing import Dict, List, Optional

import asyncpg
import httpx

# Reuse the parser from the sorter
sys.path.insert(0, str(Path(__file__).parent))
from mlos_sort import parse_mod_info, ModInfo  # noqa: E402

PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600"))
DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader")
STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"


# -----------------------------------------------------------------------------
# Steam API
# -----------------------------------------------------------------------------


def fetch_workshop_details(workshop_ids: List[str]) -> Dict[str, dict]:
    """
    POST to legacy GetPublishedFileDetails. Anonymous, no API key needed.
    Returns {workshop_id: detail_dict}.
    """
    if not workshop_ids:
        return {}
    data: Dict[str, str] = {"itemcount": str(len(workshop_ids))}
    for i, wid in enumerate(workshop_ids):
        data[f"publishedfileids[{i}]"] = wid
    with httpx.Client(timeout=30.0) as client:
        r = client.post(STEAM_API, data=data)
        r.raise_for_status()
        body = r.json()
    out: Dict[str, dict] = {}
    for item in body.get("response", {}).get("publishedfiledetails", []):
        out[item["publishedfileid"]] = item
    return out


def flatten_tags(detail: dict) -> List[str]:
    return [t.get("tag", "") for t in detail.get("tags", []) if t.get("tag")]


# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API
# does NOT return `children` for individual mods (only collections), so to
# learn a mod's "Required Items" we have to scrape the public HTML page.
_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}"
_RE_REQUIRED_BLOCK = re.compile(
    r'<div[^>]*id="RequiredItems"[^>]*>(.*?)</div>\s*</div>',
    re.DOTALL,
)
_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)')

# ── rate-limit safety for Steam HTML scraping ─────────────────────────────
# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests;
# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a
# subsequent single curl probe still got 429. Two file-locked, multi-process
# safeguards now sit in front of every scrape:
#
#   1. THROTTLE FILE — records the timestamp of the last attempted scrape.
#      Every worker waits via flock until at least
#      `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one.
#      Serializes 4 concurrent drain processes so they can't burst.
#
#   2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write
#      `now() + _COOLDOWN_S` here. While active, every fetch returns None
#      instantly without touching Steam, preserving cached values until the
#      IP block ages out.
#
# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429
# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN.
import fcntl as _fcntl

_THROTTLE_FILE = "/tmp/sortof_steam_throttle"
_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown"
_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6"))
_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600"))


def _read_cooldown_until() -> float:
    try:
        with open(_COOLDOWN_FILE, "r") as f:
            return float(f.read().strip() or 0)
    except (OSError, ValueError):
        return 0.0


def _write_cooldown_until(epoch_s: float) -> None:
    try:
        with open(_COOLDOWN_FILE, "w") as f:
            f.write(str(epoch_s))
    except OSError:
        pass


def _throttle_scrape() -> None:
    """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the
    last scrape by ANY drain process (multi-process safe via flock)."""
    import time as _t
    Path(_THROTTLE_FILE).touch(exist_ok=True)
    with open(_THROTTLE_FILE, "r+") as f:
        _fcntl.flock(f.fileno(), _fcntl.LOCK_EX)
        try:
            f.seek(0)
            raw = f.read().strip()
            last = float(raw) if raw else 0.0
            now = _t.time()
            wait = _MIN_SCRAPE_INTERVAL_S - (now - last)
            if wait > 0:
                _t.sleep(wait)
                now = _t.time()
            f.seek(0); f.truncate(); f.write(str(now))
        finally:
            _fcntl.flock(f.fileno(), _fcntl.LOCK_UN)


def fetch_required_wsids(
    workshop_id: str,
    timeout: int = 15,
    max_attempts: int = 4,
    backoff_429: float = 30.0,
) -> Optional[List[str]]:
    """Scrape the public Workshop page for Required Items wsids.

    Returns
        None  — fetch/parse error, persistent 429, or active cooldown.
                Caller MUST NOT overwrite the existing cached value.
        []    — page loaded successfully but has no required items section.
        list  — required item wsids in declaration order, deduped.
    """
    import time as _time
    cooldown_until = _read_cooldown_until()
    if cooldown_until and _time.time() < cooldown_until:
        return None  # Steam recently 429'd us — back off entirely.
    _throttle_scrape()
    url = _WORKSHOP_PAGE_URL.format(wsid=workshop_id)
    html: Optional[str] = None
    for attempt in range(1, max_attempts + 1):
        try:
            with httpx.Client(timeout=timeout, follow_redirects=True) as client:
                r = client.get(url)
            if r.status_code == 429:
                if attempt < max_attempts:
                    _time.sleep(backoff_429 * attempt)
                    continue
                # Final 429 → arm the global cooldown so other workers
                # (and this one's next call) skip Steam entirely.
                _write_cooldown_until(_time.time() + _COOLDOWN_S)
                print(f"  ! required_wsids 429 (gave up) for {workshop_id}; "
                      f"cooldown {int(_COOLDOWN_S)}s armed", file=sys.stderr)
                return None
            r.raise_for_status()
            html = r.text
            break
        except (httpx.HTTPError, httpx.TimeoutException) as e:
            print(f"  ! required_wsids fetch failed for {workshop_id}: {e}",
                  file=sys.stderr)
            return None
    if html is None:
        return None
    m = _RE_REQUIRED_BLOCK.search(html)
    if not m:
        return []
    seen: set = set()
    out: List[str] = []
    for w in _RE_REQUIRED_LINK.findall(m.group(1)):
        if w not in seen and w != workshop_id:
            seen.add(w)
            out.append(w)
    return out


# -----------------------------------------------------------------------------
# DepotDownloader
# -----------------------------------------------------------------------------


def run_depot_downloader(
    workshop_id: str,
    output_dir: Path,
    dd_path: Path,
    filelist_regex: str = r"regex:.*\.info$",
    timeout: int = 300,
    max_attempts: int = 3,
    backoff_s: float = 2.0,
) -> bool:
    """
    Fetch workshop item using DepotDownloader, filtered to .info files only.
    Writes <output_dir>/mods/<mod_id>/mod.info (and possibly map.info paths).
    Returns True on success.

    Retries up to max_attempts times on rc!=0 or timeout - Steam Workshop's
    CDN occasionally flakes on the manifest fetch and a fresh DD invocation
    typically succeeds. Caller is also free to retry at a higher level
    (drain.py's MAX_ATTEMPTS), but in-process retry avoids the full re-claim
    cycle for the common transient case.
    """
    import time as _time
    output_dir.mkdir(parents=True, exist_ok=True)
    filelist = output_dir / "_filelist.txt"
    filelist.write_text(filelist_regex + "\n", encoding="utf-8")

    cmd = [
        str(dd_path),
        "-app", str(PZ_APP_ID),
        "-pubfile", workshop_id,
        "-filelist", str(filelist),
        "-dir", str(output_dir),
    ]
    last_err = ""
    for attempt in range(1, max_attempts + 1):
        try:
            proc = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False,
            )
        except subprocess.TimeoutExpired:
            last_err = "timeout"
            print(f"  ! DepotDownloader timeout for {workshop_id} (attempt {attempt}/{max_attempts})",
                  file=sys.stderr)
        else:
            if proc.returncode == 0:
                if attempt > 1:
                    print(f"  ✓ DepotDownloader recovered for {workshop_id} on attempt {attempt}",
                          file=sys.stderr)
                return True
            last_err = f"rc={proc.returncode}"
            print(f"  ! DepotDownloader rc={proc.returncode} for {workshop_id} "
                  f"(attempt {attempt}/{max_attempts})", file=sys.stderr)
            print(proc.stderr[-500:] if proc.stderr else proc.stdout[-500:], file=sys.stderr)
        if attempt < max_attempts:
            _time.sleep(backoff_s)
    print(f"  !! DepotDownloader gave up on {workshop_id} after {max_attempts} attempts (last: {last_err})",
          file=sys.stderr)
    return False


def discover_mod_infos(output_dir: Path) -> List[Path]:
    """Find all mod.info files. Two layouts coexist in the wild:
        B41:  mods/<mod_id>/mod.info
        B42:  mods/<mod_id>/<gameVersion>/mod.info   e.g. mods/Foo/42/mod.info
    A single mod can ship both. UPSERT on (workshop_id, mod_id) collapses
    duplicates; lexicographic sort means the B41 (root-level) variant wins
    last when present, the highest-numbered B42 variant otherwise."""
    out = list(output_dir.glob("mods/*/mod.info"))
    out.extend(output_dir.glob("mods/*/*/mod.info"))
    return sorted(out)


def discover_map_folders(mip_parent: Path) -> List[str]:
    """Find map folders for the mod whose mod.info lives in `mip_parent`.

    Three layouts coexist:
      B41:  mods/<modId>/mod.info
            mods/<modId>/media/maps/<x>/map.info
      B42:  mods/<modId>/<branch>/mod.info       (branch is e.g., '42','42.13')
            mods/<modId>/<branch>/media/maps/<x>/map.info
      B42 split: mod.info under '42/' but map data under a sibling 'common/'
                 branch — observed in Project RV Interior. This is why we
                 walk back to the mod-id root and enumerate every branch.
    """
    if mip_parent.parent.name == "mods":
        modid_root = mip_parent
    else:
        modid_root = mip_parent.parent
    seen: set = set()
    out: List[str] = []
    candidates = list(modid_root.glob("media/maps/*/map.info"))
    candidates.extend(modid_root.glob("*/media/maps/*/map.info"))
    for cand in sorted(candidates):
        folder = cand.parent.name
        if folder in seen:
            continue
        seen.add(folder)
        out.append(folder)
    return out


# -----------------------------------------------------------------------------
# DB upserts
# -----------------------------------------------------------------------------


UPSERT_WORKSHOP_META = """
INSERT INTO workshop_meta (
    workshop_id, title, description, tags, creator_steamid,
    time_created, time_updated, file_size, preview_url,
    consumer_app_id, visibility, banned, last_checked_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12, now())
ON CONFLICT (workshop_id) DO UPDATE SET
    title           = EXCLUDED.title,
    description     = EXCLUDED.description,
    tags            = EXCLUDED.tags,
    creator_steamid = EXCLUDED.creator_steamid,
    time_created    = EXCLUDED.time_created,
    time_updated    = EXCLUDED.time_updated,
    file_size       = EXCLUDED.file_size,
    preview_url     = EXCLUDED.preview_url,
    consumer_app_id = EXCLUDED.consumer_app_id,
    visibility      = EXCLUDED.visibility,
    banned          = EXCLUDED.banned,
    last_checked_at = now();
"""

EVICT_AND_RECORD_CONFLICT = """
-- Per the cache invariant: a mod_id is owned by exactly one wsid at a time.
-- When we're about to UPSERT (wsid, mod_id), evict any (other_wsid, mod_id)
-- claims so the new pull becomes canonical, and record the eviction in
-- mod_id_conflicts so /api/sort can warn users who paste the displaced wsid.
WITH evicted AS (
    DELETE FROM mod_parsed
     WHERE mod_id = $2 AND workshop_id <> $1
    RETURNING workshop_id
)
INSERT INTO mod_id_conflicts (mod_id, evicting_wsid, evicted_wsid)
SELECT $2, $1, workshop_id FROM evicted
ON CONFLICT (mod_id, evicting_wsid, evicted_wsid)
DO UPDATE SET recorded_at = now();
"""

UPSERT_MOD_PARSED = """
INSERT INTO mod_parsed (
    workshop_id, mod_id, name, category,
    requirements, load_after, load_before, incompatible_mods,
    load_first, load_last, tags, maps,
    raw_mod_info, version_min, is_addon,
    parsed_at_time_updated, parsed_at
) VALUES ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16, now())
ON CONFLICT (workshop_id, mod_id) DO UPDATE SET
    name                   = EXCLUDED.name,
    category               = EXCLUDED.category,
    requirements           = EXCLUDED.requirements,
    load_after             = EXCLUDED.load_after,
    load_before            = EXCLUDED.load_before,
    incompatible_mods      = EXCLUDED.incompatible_mods,
    load_first             = EXCLUDED.load_first,
    load_last              = EXCLUDED.load_last,
    tags                   = EXCLUDED.tags,
    maps                   = EXCLUDED.maps,
    raw_mod_info           = EXCLUDED.raw_mod_info,
    version_min            = EXCLUDED.version_min,
    is_addon               = EXCLUDED.is_addon,
    parsed_at_time_updated = EXCLUDED.parsed_at_time_updated,
    parsed_at              = now();
"""

# Description-text heuristic for "this mod is an optional add-on to the
# primary mod published by the same wsid". Matches:
#   "Optional add-on: removes ..."   (TMMumble)
#   "optional addon ..."
#   "Optional add on ..."
# Strict "optional + add-on" keyword pair to avoid false positives on
# generic "addon" naming. Author-driven signal — set via the description=
# field of mod.info.
_RE_OPTIONAL_ADDON = re.compile(
    r"description\s*=\s*[^\r\n]*\bOptional\s+Add[- ]?on\b",
    re.IGNORECASE,
)


def detect_is_addon(raw: str) -> bool:
    """Return True if the mod.info description self-identifies as an
    optional add-on (`Optional add-on: …`)."""
    return bool(_RE_OPTIONAL_ADDON.search(raw or ""))

DELETE_STALE_MOD_PARSED = """
DELETE FROM mod_parsed
WHERE workshop_id = $1 AND mod_id <> ALL($2::text[]);
"""

CHECK_PARSED_FRESH = """
SELECT mod_id FROM mod_parsed
WHERE workshop_id = $1 AND parsed_at_time_updated = $2;
"""


def extract_version_min(raw: str) -> Optional[str]:
    for line in raw.splitlines():
        s = line.strip().lower()
        if s.startswith("versionmin"):
            _, _, v = line.partition("=")
            return v.strip() or None
    return None


# -----------------------------------------------------------------------------
# Main flow
# -----------------------------------------------------------------------------


async def process_one(
    conn: asyncpg.Connection,
    workshop_id: str,
    detail: dict,
    dd_path: Path,
    force: bool,
) -> str:
    """Returns 'hit' | 'refreshed' | 'banned' | 'missing' | 'no_mod_info' | 'failed'.

    'no_mod_info' = DepotDownloader succeeded but the workshop item contained
    no parseable mod.info file (typical for collections, art-only items, and
    other non-mod uploads that share the PZ consumer_app_id). Distinct from
    'failed' (DD itself errored), so the API can surface "this isn't a mod"
    differently from "we couldn't fetch this."
    """
    # Pre-flight: bad results
    if detail.get("result") != 1:
        return "missing"
    if detail.get("banned"):
        return "banned"
    if detail.get("consumer_app_id") != PZ_APP_ID:
        return "failed"  # wrong app

    time_updated = int(detail.get("time_updated", 0))

    # Always refresh meta (cheap)
    await conn.execute(
        UPSERT_WORKSHOP_META,
        workshop_id,
        detail.get("title", ""),
        detail.get("description", "") or "",
        flatten_tags(detail),
        str(detail.get("creator", "")) or None,
        int(detail.get("time_created", 0)) or None,
        time_updated,
        int(detail.get("file_size", 0)) or None,
        detail.get("preview_url"),
        detail.get("consumer_app_id"),
        detail.get("visibility"),
        bool(detail.get("banned", False)),
    )

    # Cache check
    if not force:
        rows = await conn.fetch(CHECK_PARSED_FRESH, workshop_id, time_updated)
        if rows:
            return "hit"

    # Cache miss → download + parse
    with tempfile.TemporaryDirectory(prefix=f"pzsort_{workshop_id}_") as tmpdir:
        tmp = Path(tmpdir)
        ok = run_depot_downloader(workshop_id, tmp, dd_path)
        if not ok:
            return "failed"

        mod_info_paths = discover_mod_infos(tmp)
        if not mod_info_paths:
            print(f"  ! no mod.info found in {workshop_id}", file=sys.stderr)
            return "no_mod_info"

        seen_mod_ids: List[str] = []
        for mip in mod_info_paths:
            raw = mip.read_text(encoding="utf-8", errors="replace")
            mod = parse_mod_info(raw, workshop_id=workshop_id)
            if mod is None:
                continue
            maps = discover_map_folders(mip.parent)
            # Evict any other wsid's claim on this mod_id before we install
            # ours. Cache invariant: at most one wsid per mod_id, with the
            # most-recent pull winning.
            await conn.execute(EVICT_AND_RECORD_CONFLICT, workshop_id, mod.id)
            await conn.execute(
                UPSERT_MOD_PARSED,
                workshop_id,
                mod.id,
                mod.name,
                mod.category,
                mod.requirements,
                mod.loadAfter,
                mod.loadBefore,
                mod.incompatibleMods,
                mod.loadFirst,
                mod.loadLast,
                mod.tags,
                maps,
                raw,
                extract_version_min(raw),
                detect_is_addon(raw),
                time_updated,
            )
            seen_mod_ids.append(mod.id)

        # Drop rows for mods that no longer exist in this workshop item
        if seen_mod_ids:
            await conn.execute(DELETE_STALE_MOD_PARSED, workshop_id, seen_mod_ids)

    # Scrape the public Workshop page for the "Required Items" section so the
    # API can auto-resolve missing-dep warnings against this mod's declared
    # Steam-side dependencies. Best-effort: None on fetch error → leave the
    # existing cached value; [] or list → overwrite.
    required = await asyncio.to_thread(fetch_required_wsids, workshop_id)
    if required is not None:
        await conn.execute(
            """
            UPDATE workshop_meta
               SET required_wsids = $1, required_scraped_at = now()
             WHERE workshop_id = $2
            """,
            required, workshop_id,
        )

    return "refreshed"


async def main_async(workshop_ids: List[str], dd_path: Path, force: bool, dsn: str) -> int:
    print(f"[steam] fetching metadata for {len(workshop_ids)} item(s)")
    details = fetch_workshop_details(workshop_ids)
    missing_from_steam = [w for w in workshop_ids if w not in details]
    if missing_from_steam:
        print(f"[steam] no detail returned for: {missing_from_steam}", file=sys.stderr)

    summary: Dict[str, int] = {"hit": 0, "refreshed": 0, "banned": 0, "missing": 0, "failed": 0}

    conn = await asyncpg.connect(dsn=dsn)
    try:
        for wid in workshop_ids:
            detail = details.get(wid)
            if detail is None:
                summary["missing"] += 1
                print(f"  - {wid} -> missing (no Steam response)")
                continue
            status = await process_one(conn, wid, detail, dd_path, force)
            summary[status] += 1
            print(f"  - {wid} -> {status}")
    finally:
        await conn.close()

    print(f"[done] {summary}")
    return 0 if summary["failed"] == 0 else 1


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("workshop_ids", nargs="+")
    ap.add_argument("--force", action="store_true", help="ignore cache, always re-download")
    ap.add_argument("--dd-path", default=DEFAULT_DD_PATH)
    ap.add_argument("--dsn", default=os.environ.get("DATABASE_URL"))
    args = ap.parse_args()

    if not args.dsn:
        print("ERROR: --dsn or DATABASE_URL required", file=sys.stderr)
        sys.exit(2)

    dd = Path(args.dd_path)
    if not dd.is_file():
        print(f"ERROR: DepotDownloader not found at {dd}", file=sys.stderr)
        sys.exit(2)

    rc = asyncio.run(main_async(args.workshop_ids, dd, args.force, args.dsn))
    sys.exit(rc)


if __name__ == "__main__":
    main()