feat: stale-require filter + Steam-API-keyed required-items fetch

Drops missing-dep warnings whose source mod's mod.info `require=` is out of sync with its Steam Workshop Required Items sidebar. Author edits to mod.info often lag behind build ports; trusting the sidebar means B42 sorts no longer raise warnings on B41-only deps the author has already retired (e.g. tikitown's Diederiks Tile Palooza, EN_Newburbs). The filter is conservative: it only drops a dep when (a) we have a cached wsid for it, (b) that wsid is wrong-build for the user's pz_build, and (c) the source mod's required_wsids list excludes that wsid (a populated required_scraped_at serves as the "we have evidence" gate, since the column itself defaults to '{}'). Also swaps worker.fetch_required_wsids from a public-page HTML scrape to the authenticated IPublishedFileService/GetDetails endpoint (requires the STEAM_WEB_API_KEY env var): same `children` data, no 429 cooldowns. Removes the now-unused throttle/cooldown infrastructure (the SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN env vars are no longer read). See docs/specs/2026-05-06-stale-requires-filter.md.
2026-05-06 21:30:28 +00:00
parent f8b48fbacb
commit 3a34b71e54
3 changed files with 286 additions and 116 deletions
--- a/worker/worker.py
+++ b/worker/worker.py
@@ -40,6 +40,7 @@ from mlos_sort import parse_mod_info, ModInfo  # noqa: E402
 PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600"))
 DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader")
 STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
+STEAM_AUTHED_DETAILS = "https://api.steampowered.com/IPublishedFileService/GetDetails/v1/"


 # -----------------------------------------------------------------------------
@@ -71,132 +72,65 @@ def flatten_tags(detail: dict) -> List[str]:
    return [t.get("tag", "") for t in detail.get("tags", []) if t.get("tag")]


-# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API
-# does NOT return `children` for individual mods (only collections), so to
-# learn a mod's "Required Items" we have to scrape the public HTML page.
-_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}"
-_RE_REQUIRED_BLOCK = re.compile(
-    r'<div[^>]*id="RequiredItems"[^>]*>(.*?)</div>\s*</div>',
-    re.DOTALL,
-)
-_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)')
-
-# ── rate-limit safety for Steam HTML scraping ─────────────────────────────
-# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests;
-# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a
-# subsequent single curl probe still got 429. Two file-locked, multi-process
-# safeguards now sit in front of every scrape:
-#
-#   1. THROTTLE FILE — records the timestamp of the last attempted scrape.
-#      Every worker waits via flock until at least
-#      `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one.
-#      Serializes 4 concurrent drain processes so they can't burst.
-#
-#   2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write
-#      `now() + _COOLDOWN_S` here. While active, every fetch returns None
-#      instantly without touching Steam, preserving cached values until the
-#      IP block ages out.
-#
-# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429
-# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN.
-import fcntl as _fcntl
-
-_THROTTLE_FILE = "/tmp/sortof_steam_throttle"
-_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown"
-_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6"))
-_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600"))
-
-
-def _read_cooldown_until() -> float:
-    try:
-        with open(_COOLDOWN_FILE, "r") as f:
-            return float(f.read().strip() or 0)
-    except (OSError, ValueError):
-        return 0.0
-
-
-def _write_cooldown_until(epoch_s: float) -> None:
-    try:
-        with open(_COOLDOWN_FILE, "w") as f:
-            f.write(str(epoch_s))
-    except OSError:
-        pass
-
-
-def _throttle_scrape() -> None:
-    """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the
-    last scrape by ANY drain process (multi-process safe via flock)."""
-    import time as _t
-    Path(_THROTTLE_FILE).touch(exist_ok=True)
-    with open(_THROTTLE_FILE, "r+") as f:
-        _fcntl.flock(f.fileno(), _fcntl.LOCK_EX)
-        try:
-            f.seek(0)
-            raw = f.read().strip()
-            last = float(raw) if raw else 0.0
-            now = _t.time()
-            wait = _MIN_SCRAPE_INTERVAL_S - (now - last)
-            if wait > 0:
-                _t.sleep(wait)
-                now = _t.time()
-            f.seek(0); f.truncate(); f.write(str(now))
-        finally:
-            _fcntl.flock(f.fileno(), _fcntl.LOCK_UN)
-
-
 def fetch_required_wsids(
    workshop_id: str,
    timeout: int = 15,
-    max_attempts: int = 4,
-    backoff_429: float = 30.0,
 ) -> Optional[List[str]]:
-    """Scrape the public Workshop page for Required Items wsids.
+    """Fetch the Required Items wsids for a Workshop item via the
+    authenticated `IPublishedFileService/GetDetails` endpoint, which
+    returns the same `children` array Steam renders into the public
+    page's Required Items sidebar.

    Returns
-        None  — fetch/parse error, persistent 429, or active cooldown.
-                Caller MUST NOT overwrite the existing cached value.
-        []    — page loaded successfully but has no required items section.
-        list  — required item wsids in declaration order, deduped.
+        None  — missing/invalid `STEAM_WEB_API_KEY`, network error, or
+                non-success result. Caller MUST NOT overwrite the
+                existing cached value.
+        []    — item exists but has no Required Items.
+        list  — required item wsids in `sortorder` order, deduped.
+
+    Replaces the previous HTML-scrape path (Steam 429'd anonymous
+    /sharedfiles/filedetails/ requests aggressively, requiring a throttle
+    plus a 1h cooldown after a 429 storm). The authenticated API has a
+    far more generous quota, so our drain rate stays well clear of its
+    limits.
    """
-    import time as _time
-    cooldown_until = _read_cooldown_until()
-    if cooldown_until and _time.time() < cooldown_until:
-        return None  # Steam recently 429'd us — back off entirely.
-    _throttle_scrape()
-    url = _WORKSHOP_PAGE_URL.format(wsid=workshop_id)
-    html: Optional[str] = None
-    for attempt in range(1, max_attempts + 1):
-        try:
-            with httpx.Client(timeout=timeout, follow_redirects=True) as client:
-                r = client.get(url)
-            if r.status_code == 429:
-                if attempt < max_attempts:
-                    _time.sleep(backoff_429 * attempt)
-                    continue
-                # Final 429 → arm the global cooldown so other workers
-                # (and this one's next call) skip Steam entirely.
-                _write_cooldown_until(_time.time() + _COOLDOWN_S)
-                print(f"  ! required_wsids 429 (gave up) for {workshop_id}; "
-                      f"cooldown {int(_COOLDOWN_S)}s armed", file=sys.stderr)
-                return None
-            r.raise_for_status()
-            html = r.text
-            break
-        except (httpx.HTTPError, httpx.TimeoutException) as e:
-            print(f"  ! required_wsids fetch failed for {workshop_id}: {e}",
-                  file=sys.stderr)
-            return None
-    if html is None:
+    key = os.environ.get("STEAM_WEB_API_KEY")
+    if not key:
+        return None
+    params = {
+        "key": key,
+        "publishedfileids[0]": workshop_id,
+        "includechildren": "true",
+    }
+    try:
+        with httpx.Client(timeout=timeout) as client:
+            r = client.get(STEAM_AUTHED_DETAILS, params=params)
+        r.raise_for_status()
+        body = r.json()
+    except (httpx.HTTPError, httpx.TimeoutException, ValueError) as e:
+        print(f"  ! required_wsids fetch failed for {workshop_id}: {e}",
+              file=sys.stderr)
+        return None
+    items = body.get("response", {}).get("publishedfiledetails") or []
+    if not items:
+        return None
+    item = items[0]
+    # Steam returns result=1 on success; 9 = file not found, etc. Treat
+    # anything else as a soft failure so we don't clobber a previously
+    # cached value with [] on a transient lookup miss.
+    if item.get("result") != 1:
        return None
-    m = _RE_REQUIRED_BLOCK.search(html)
-    if not m:
-        return []
    seen: set = set()
    out: List[str] = []
-    for w in _RE_REQUIRED_LINK.findall(m.group(1)):
-        if w not in seen and w != workshop_id:
-            seen.add(w)
-            out.append(w)
+    children = sorted(
+        item.get("children") or [],
+        key=lambda c: c.get("sortorder", 0),
+    )
+    for c in children:
+        wid = c.get("publishedfileid")
+        if wid and wid not in seen and wid != workshop_id:
+            seen.add(wid)
+            out.append(wid)
    return out