feat: stale-require filter + Steam-API-keyed required-items fetch

Drops missing-dep warnings whose source mod's mod.info `require=` is
out of sync with its Steam Workshop Required Items sidebar. Author
edits to mod.info often lag build ports; trusting the sidebar means
B42 sorts no longer raise warnings on B41-only deps the author has
already retired (e.g. tikitown's Diederiks Tile Palooza, EN_Newburbs).

Filter is conservative: it only drops a dep when (a) we have a cached
wsid for it, (b) that wsid is wrong-build for the user's pz_build,
and (c) the source mod's required_wsids list excludes that wsid.
Condition (c) only counts when required_scraped_at is populated; that
timestamp is the "we have evidence" gate, since the required_wsids
column itself defaults to '{}'.

Also swaps worker.fetch_required_wsids from public-page HTML scrape
to authenticated IPublishedFileService/GetDetails. Same `children`
data, no 429 cooldowns. Removes the now-unused throttle/cooldown
infrastructure (SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN
env vars are no longer read).

See docs/specs/2026-05-06-stale-requires-filter.md.
This commit is contained in:
2026-05-06 21:30:28 +00:00
parent f8b48fbacb
commit 3a34b71e54
3 changed files with 286 additions and 116 deletions

View File

@@ -40,6 +40,7 @@ from mlos_sort import parse_mod_info, ModInfo # noqa: E402
# Steam app id used by this worker; overridable via the PZ_APP_ID env var
# (default 108600).
PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600"))
# DepotDownloader location, taken from the DD_PATH env var with a relative
# default. NOTE(review): presumably the path to the DepotDownloader
# executable — confirm against the call site.
DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader")
# Anonymous (no API key) published-file details endpoint.
STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
# Authenticated details endpoint; fetch_required_wsids calls it with a Steam
# Web API key to read a Workshop item's `children` (its Required Items).
STEAM_AUTHED_DETAILS = "https://api.steampowered.com/IPublishedFileService/GetDetails/v1/"
# -----------------------------------------------------------------------------
@@ -71,132 +72,65 @@ def flatten_tags(detail: dict) -> List[str]:
return [t.get("tag", "") for t in detail.get("tags", []) if t.get("tag")]
# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API
# does NOT return `children` for individual mods (only collections), so to
# learn a mod's "Required Items" we have to scrape the public HTML page.
# URL template for a Workshop item's public page; `{wsid}` is the item id.
_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}"
# Captures (group 1) the inner HTML of the <div id="RequiredItems"> element,
# ending at the first pair of closing </div> tags. DOTALL lets `.` span
# newlines, so the block may cover multiple lines of markup.
_RE_REQUIRED_BLOCK = re.compile(
r'<div[^>]*id="RequiredItems"[^>]*>(.*?)</div>\s*</div>',
re.DOTALL,
)
# Captures the numeric Workshop id from each `filedetails/?id=<digits>` link.
_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)')
# ── rate-limit safety for Steam HTML scraping ─────────────────────────────
# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests;
# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a
# subsequent single curl probe still got 429. Two file-locked, multi-process
# safeguards now sit in front of every scrape:
#
# 1. THROTTLE FILE — records the timestamp of the last attempted scrape.
# Every worker waits via flock until at least
# `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one.
# Serializes 4 concurrent drain processes so they can't burst.
#
# 2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write
# `now() + _COOLDOWN_S` here. While active, every fetch returns None
# instantly without touching Steam, preserving cached values until the
# IP block ages out.
#
# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429
# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN.
import fcntl as _fcntl
# File holding the epoch timestamp of the most recent scrape attempt.
_THROTTLE_FILE = "/tmp/sortof_steam_throttle"
# File holding the epoch timestamp until which scraping is suspended.
_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown"
# Minimum seconds between two scrapes (env SORTOF_STEAM_MIN_INTERVAL, default 6).
_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6"))
# Seconds to stay away from Steam after a hard 429 (env SORTOF_STEAM_COOLDOWN,
# default 3600).
_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600"))
def _read_cooldown_until() -> float:
    """Return the epoch second stored in `_COOLDOWN_FILE`.

    A missing or unreadable file, an empty file, or non-numeric content
    all yield 0.0.
    """
    try:
        with open(_COOLDOWN_FILE, "r") as fh:
            text = fh.read().strip()
            # Empty content falls back to 0, which float() turns into 0.0.
            return float(text or 0)
    except (OSError, ValueError):
        return 0.0
def _write_cooldown_until(epoch_s: float) -> None:
    """Store `epoch_s` in `_COOLDOWN_FILE`, replacing any previous content.

    The write is best-effort: an OSError while opening or writing the file
    is deliberately ignored.
    """
    try:
        with open(_COOLDOWN_FILE, "w") as fh:
            fh.write(str(epoch_s))
    except OSError:
        # Best-effort only: failing to record the cooldown must not fail
        # the caller.
        return None
def _throttle_scrape() -> None:
    """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the
    last scrape recorded in `_THROTTLE_FILE`, then record the current time.

    An exclusive flock on the throttle file is held for the whole
    read / sleep / write sequence, so concurrent callers are serialized.
    """
    import time as _t

    Path(_THROTTLE_FILE).touch(exist_ok=True)
    with open(_THROTTLE_FILE, "r+") as f:
        _fcntl.flock(f.fileno(), _fcntl.LOCK_EX)
        try:
            f.seek(0)
            raw = f.read().strip()
            try:
                last = float(raw) if raw else 0.0
            except ValueError:
                # Corrupt content: treat as "never scraped" so the file is
                # rewritten below instead of failing on every later call.
                last = 0.0
            wait = _MIN_SCRAPE_INTERVAL_S - (_t.time() - last)
            if wait > 0:
                # A future-dated or otherwise bogus timestamp must never
                # stall us for longer than one full interval.
                _t.sleep(min(wait, _MIN_SCRAPE_INTERVAL_S))
            f.seek(0)
            f.truncate()
            f.write(str(_t.time()))
            # Flush while the lock is still held. Otherwise the next holder
            # can observe the truncated (empty) file and skip its wait.
            f.flush()
        finally:
            _fcntl.flock(f.fileno(), _fcntl.LOCK_UN)
def fetch_required_wsids(
    workshop_id: str,
    timeout: int = 15,
    max_attempts: int = 4,
    backoff_429: float = 30.0,
) -> Optional[List[str]]:
    """Fetch the Required Items wsids for a Workshop item via the
    authenticated `IPublishedFileService/GetDetails` endpoint, which
    returns the same `children` array Steam renders into the public
    page's Required Items sidebar.

    Parameters
        workshop_id  — Workshop item id to look up.
        timeout      — HTTP timeout in seconds for the single API request.
        max_attempts, backoff_429
                     — accepted for backward compatibility with callers of
                       the previous HTML-scrape implementation; unused by
                       the API path.

    Returns
        None — missing/invalid `STEAM_WEB_API_KEY`, network error,
               malformed response, or non-success result. Caller MUST
               NOT overwrite the existing cached value.
        []   — item exists but has no Required Items.
        list — required item wsids in `sortorder` order, deduped, with
               the item's own id excluded.

    Replaces the previous HTML-scrape path (Steam 429'd anonymous
    /sharedfiles/filedetails/ requests aggressively, requiring throttle
    + 1h cooldown after a 429 storm).
    """
    key = os.environ.get("STEAM_WEB_API_KEY")
    if not key:
        return None

    params = {
        "key": key,
        "publishedfileids[0]": workshop_id,
        "includechildren": "true",
    }
    try:
        with httpx.Client(timeout=timeout) as client:
            r = client.get(STEAM_AUTHED_DETAILS, params=params)
            r.raise_for_status()
            body = r.json()
    except (httpx.HTTPError, httpx.TimeoutException, ValueError) as e:
        # httpx error messages embed the request URL, which carries the API
        # key as a query parameter, so redact it before logging.
        reason = str(e).replace(key, "<redacted>")
        print(f"  ! required_wsids fetch failed for {workshop_id}: {reason}",
              file=sys.stderr)
        return None

    # Walk the payload defensively: any unexpected shape is a soft failure.
    response = body.get("response") if isinstance(body, dict) else None
    items = response.get("publishedfiledetails") if isinstance(response, dict) else None
    if not isinstance(items, list) or not items:
        return None
    item = items[0]
    # Steam returns result=1 on success; 9 = file not found, etc. Treat
    # anything else as a soft failure so we don't clobber a previously
    # cached value with [] on a transient lookup miss.
    if not isinstance(item, dict) or item.get("result") != 1:
        return None

    children = [c for c in (item.get("children") or []) if isinstance(c, dict)]
    # A missing or null sortorder sorts as 0; the sort is stable.
    children.sort(key=lambda c: c.get("sortorder") or 0)

    self_id = str(workshop_id)
    seen: set = set()
    out: List[str] = []
    for c in children:
        wid = c.get("publishedfileid")
        if not wid:
            continue
        # Normalise to str so the self-reference check and the declared
        # return type hold even if the API hands back integers.
        wid = str(wid)
        if wid not in seen and wid != self_id:
            seen.add(wid)
            out.append(wid)
    return out