feat: stale-require filter + Steam-API-keyed required-items fetch
Drops missing-dep warnings whose source mod's mod.info `require=` is
out of sync with its Steam Workshop Required Items sidebar. Author
edits to mod.info often lag build ports; trusting the sidebar means
B42 sorts no longer raise warnings on B41-only deps the author has
already retired (e.g. tikitown's Diederiks Tile Palooza, EN_Newburbs).
The filter is conservative: it only drops a dep when (a) we have a cached
wsid for it, (b) that wsid targets the wrong build for the user's pz_build,
and (c) the source mod's required_wsids list excludes that wsid. For (c),
a populated required_scraped_at serves as the "we have evidence" gate,
since the required_wsids column itself defaults to '{}'.
Also swaps worker.fetch_required_wsids from public-page HTML scrape
to authenticated IPublishedFileService/GetDetails. Same `children`
data, no 429 cooldowns. Removes the now-unused throttle/cooldown
infrastructure (SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN
env vars are no longer read).
See docs/specs/2026-05-06-stale-requires-filter.md.
This commit is contained in:
166
worker/worker.py
166
worker/worker.py
@@ -40,6 +40,7 @@ from mlos_sort import parse_mod_info, ModInfo # noqa: E402
|
||||
PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600"))
|
||||
DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader")
|
||||
STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/"
|
||||
STEAM_AUTHED_DETAILS = "https://api.steampowered.com/IPublishedFileService/GetDetails/v1/"
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -71,132 +72,65 @@ def flatten_tags(detail: dict) -> List[str]:
|
||||
return [t.get("tag", "") for t in detail.get("tags", []) if t.get("tag")]
|
||||
|
||||
|
||||
# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API
|
||||
# does NOT return `children` for individual mods (only collections), so to
|
||||
# learn a mod's "Required Items" we have to scrape the public HTML page.
|
||||
_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}"
|
||||
_RE_REQUIRED_BLOCK = re.compile(
|
||||
r'<div[^>]*id="RequiredItems"[^>]*>(.*?)</div>\s*</div>',
|
||||
re.DOTALL,
|
||||
)
|
||||
_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)')
|
||||
|
||||
# ── rate-limit safety for Steam HTML scraping ─────────────────────────────
|
||||
# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests;
|
||||
# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a
|
||||
# subsequent single curl probe still got 429. Two file-locked, multi-process
|
||||
# safeguards now sit in front of every scrape:
|
||||
#
|
||||
# 1. THROTTLE FILE — records the timestamp of the last attempted scrape.
|
||||
# Every worker waits via flock until at least
|
||||
# `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one.
|
||||
# Serializes 4 concurrent drain processes so they can't burst.
|
||||
#
|
||||
# 2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write
|
||||
# `now() + _COOLDOWN_S` here. While active, every fetch returns None
|
||||
# instantly without touching Steam, preserving cached values until the
|
||||
# IP block ages out.
|
||||
#
|
||||
# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429
|
||||
# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN.
|
||||
import fcntl as _fcntl
|
||||
|
||||
_THROTTLE_FILE = "/tmp/sortof_steam_throttle"
|
||||
_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown"
|
||||
_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6"))
|
||||
_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600"))
|
||||
|
||||
|
||||
def _read_cooldown_until() -> float:
    """Return the epoch-seconds deadline stored in the cooldown file.

    Falls back to 0.0 when the file cannot be opened or read, is empty,
    or does not contain something `float()` can parse.
    """
    try:
        with open(_COOLDOWN_FILE, "r") as fh:
            text = fh.read().strip()
        # An empty file is treated the same as "no cooldown recorded".
        return float(text or 0)
    except (OSError, ValueError):
        return 0.0
|
||||
|
||||
|
||||
def _write_cooldown_until(epoch_s: float) -> None:
    """Record `epoch_s` (epoch seconds) as the cooldown deadline.

    Best-effort: an `OSError` is reported on stderr and otherwise ignored,
    so a read-only or full filesystem never aborts the caller.

    The value is written to a per-process temp file and then moved over
    `_COOLDOWN_FILE` with `os.replace`, so a concurrent reader never sees
    the truncated, half-written file that an in-place `open(..., "w")`
    exposes (which would read back as "no cooldown").
    """
    tmp_path = f"{_COOLDOWN_FILE}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "w") as f:
            f.write(str(epoch_s))
        os.replace(tmp_path, _COOLDOWN_FILE)
    except OSError as e:
        print(f" ! could not write cooldown file {_COOLDOWN_FILE}: {e}",
              file=sys.stderr)
        # Don't leave the temp file behind if the write or rename failed.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def _throttle_scrape() -> None:
    """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the
    last scrape recorded in `_THROTTLE_FILE`, then record the current time.

    The throttle file is held under an exclusive `flock` for the whole
    read / sleep / write sequence, so callers sharing the file are
    serialized and cannot burst.
    """
    import time as _t

    Path(_THROTTLE_FILE).touch(exist_ok=True)
    with open(_THROTTLE_FILE, "r+") as f:
        _fcntl.flock(f.fileno(), _fcntl.LOCK_EX)
        try:
            f.seek(0)
            raw = f.read().strip()
            try:
                last = float(raw) if raw else 0.0
            except ValueError:
                # Corrupt throttle file: treat as "never scraped" rather
                # than raising on every subsequent call.
                last = 0.0
            now = _t.time()
            # Never sleep longer than one interval: a timestamp in the
            # future (clock step, garbage value) must not stall every
            # waiter while this process holds the lock.
            wait = min(_MIN_SCRAPE_INTERVAL_S - (now - last),
                       _MIN_SCRAPE_INTERVAL_S)
            if wait > 0:
                _t.sleep(wait)
                now = _t.time()
            f.seek(0)
            f.truncate()
            f.write(str(now))
            # Push the timestamp to the OS *before* releasing the lock;
            # otherwise the next holder can read the truncated (empty)
            # file and skip its wait.
            f.flush()
        finally:
            _fcntl.flock(f.fileno(), _fcntl.LOCK_UN)
|
||||
|
||||
|
||||
def fetch_required_wsids(
    workshop_id: str,
    timeout: int = 15,
    max_attempts: int = 4,
    backoff_429: float = 30.0,
) -> Optional[List[str]]:
    """Fetch the Required Items wsids for a Workshop item via the
    authenticated `IPublishedFileService/GetDetails` endpoint, reading
    the item's `children` array.

    Args:
        workshop_id: Workshop item id to look up.
        timeout: HTTP timeout, in seconds, passed to `httpx.Client`.
        max_attempts: Unused by this implementation; retained so existing
            callers that pass it keep working.
        backoff_429: Unused by this implementation; retained so existing
            callers that pass it keep working.

    Returns:
        None — missing `STEAM_WEB_API_KEY`, network/HTTP/JSON error,
               unexpected response shape, or non-success `result`.
               Caller MUST NOT overwrite the existing cached value.
        []   — item exists but has no Required Items.
        list — required item wsids in `sortorder` order, deduped, with
               the item's own id excluded.
    """
    key = os.environ.get("STEAM_WEB_API_KEY")
    if not key:
        return None

    params = {
        "key": key,
        "publishedfileids[0]": workshop_id,
        "includechildren": "true",
    }
    try:
        with httpx.Client(timeout=timeout) as client:
            r = client.get(STEAM_AUTHED_DETAILS, params=params)
            r.raise_for_status()
            body = r.json()
    except (httpx.HTTPError, httpx.TimeoutException, ValueError) as e:
        # httpx error messages can embed the full request URL, which
        # carries the API key as a query parameter — redact before logging.
        detail = str(e).replace(key, "***")
        print(f" ! required_wsids fetch failed for {workshop_id}: {detail}",
              file=sys.stderr)
        return None

    if not isinstance(body, dict):
        return None
    items = (body.get("response") or {}).get("publishedfiledetails") or []
    if not items or not isinstance(items[0], dict):
        return None
    item = items[0]

    # Steam returns result=1 on success; 9 = file not found, etc. Treat
    # anything else as a soft failure so we don't clobber a previously
    # cached value with [] on a transient lookup miss.
    if item.get("result") != 1:
        return None

    own_id = str(workshop_id)
    children = sorted(
        (c for c in (item.get("children") or []) if isinstance(c, dict)),
        # A missing or null sortorder sorts first instead of raising.
        key=lambda c: c.get("sortorder") or 0,
    )

    seen: set = set()
    out: List[str] = []
    for c in children:
        raw_id = c.get("publishedfileid")
        if not raw_id:
            continue
        wid = str(raw_id)  # keep the declared List[str] contract
        if wid not in seen and wid != own_id:
            seen.add(wid)
            out.append(wid)
    return out
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user