diff --git a/api/app.py b/api/app.py index 72a1ecf..9875f03 100644 --- a/api/app.py +++ b/api/app.py @@ -315,6 +315,110 @@ async def _lookup_wsids_for_missing( return out +async def _filter_stale_requires( + conn, + mlos_warnings: Dict[str, Any], + source_wsids: Dict[str, str], + pz_build: Optional[str], +) -> None: + """Drop missing-dep entries that look stale: the dep is wrong-build for + `pz_build` AND the source mod's Steam-side Required Items sidebar does + NOT list the dep's wsid. Authors update Required Items per build but + routinely forget to clean up `mod.info`'s `require=` line, so a + wrong-build mod.info dep that the author has implicitly removed from + Required Items is treated as stale and silently dropped. + + Mutates `mlos_warnings["missing_requirements"]` in place. Does nothing + when pz_build is unknown, when no source has Required Items data + cached, or when the dep can't be resolved to a wsid (we have no + evidence to drop on, so keep the warning). + + Example: tikitown's mod.info still says + `require=Diederiks Tile Palooza,EN_Newburbs,...` from B41, but its + B42 Required Items list contains only Drazion's + Erika's. On a B42 + sort the two B41-only deps are dropped (they're not in the sidebar), + while the rest of the require= line continues to resolve normally. + """ + if not pz_build: + return + missing_reqs = mlos_warnings.get("missing_requirements") or {} + if not missing_reqs: + return + target_tag = "Build 42" if pz_build == "B42" else ( + "Build 41" if pz_build == "B41" else None + ) + other_tag = "Build 41" if target_tag == "Build 42" else ( + "Build 42" if target_tag == "Build 41" else None + ) + if not target_tag or not other_tag: + return + + src_wsids_to_check = list({ + source_wsids[m] for m in missing_reqs if m in source_wsids + }) + if not src_wsids_to_check: + return + # `required_wsids` has `NOT NULL DEFAULT '{}'`, so `IS NOT NULL` is + # always true and useless as a "we have evidence" gate. The real + # sentinel is `required_scraped_at` — NULL by default, set only when + # the worker successfully fetched Required Items for this wsid. + # Without this, every never-scraped source mod (most of the cache + # at time of writing) would be treated as "author lists no required + # items", and we'd silently suppress legit wrong-build warnings. + src_required_rows = await conn.fetch( + """ + SELECT workshop_id, required_wsids + FROM workshop_meta + WHERE workshop_id = ANY($1::text[]) + AND required_scraped_at IS NOT NULL + """, + src_wsids_to_check, + ) + if not src_required_rows: + return + src_required: Dict[str, set] = { + r["workshop_id"]: set(r["required_wsids"] or []) + for r in src_required_rows + } + + all_deps = {d for deps in missing_reqs.values() for d in deps if d} + if not all_deps: + return + dep_rows = await conn.fetch( + """ + SELECT DISTINCT ON (mp.mod_id) mp.mod_id, mp.workshop_id, wm.tags + FROM mod_parsed mp + JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id + WHERE mp.mod_id = ANY($1::text[]) + ORDER BY mp.mod_id, mp.parsed_at_time_updated DESC + """, + list(all_deps), + ) + dep_info: Dict[str, Tuple[str, List[str]]] = { + r["mod_id"]: (r["workshop_id"], list(r["tags"] or [])) + for r in dep_rows + } + + new_missing: Dict[str, List[str]] = {} + for src_mod_id, deps in missing_reqs.items(): + src_wsid = source_wsids.get(src_mod_id) + required = src_required.get(src_wsid) if src_wsid else None + kept: List[str] = [] + for dep in deps: + info = dep_info.get(dep) + if info and required is not None: + dep_wsid, dep_tags = info + wrong_build = ( + other_tag in dep_tags and target_tag not in dep_tags + ) + if wrong_build and dep_wsid not in required: + continue # stale: author dropped from Required Items + kept.append(dep) + if kept: + new_missing[src_mod_id] = kept + mlos_warnings["missing_requirements"] = new_missing + + # Hand-curated build-version pairs where the B41↔B42 sibling renamed its # mod_id (so the auto-discovery via shared mod_id can't find it). Bidirectional # - if the user submits either side, the OTHER side is offered. Add entries as @@ -869,6 +973,11 @@ async def _build_result_for_job( _inject_addon_loadafter(mods) sort_result = sort_mods(mods, rules) cached_ids = {r["workshop_id"] for r in rows} + await _filter_stale_requires( + conn, sort_result.get("warnings", {}) or {}, + {m.id: m.workshop_id for m in mods if m.workshop_id}, + pz_build, + ) wsid_lookup = await _lookup_wsids_for_missing( conn, sort_result.get("warnings", {}) or {}, pz_build=pz_build, @@ -1195,6 +1304,11 @@ async def sort_endpoint(req: SortRequest, request: Request) -> Dict[str, Any]: status = "partial" async with pool.acquire() as conn: + await _filter_stale_requires( + conn, sort_result.get("warnings", {}) or {}, + {m.id: m.workshop_id for m in mods if m.workshop_id}, + req.pz_build or "B42", + ) wsid_lookup = await _lookup_wsids_for_missing( conn, sort_result.get("warnings", {}) or {}, pz_build=req.pz_build or "B42", @@ -1414,6 +1528,11 @@ async def resort_endpoint(req: ResortRequest, request: Request) -> Dict[str, Any # Per spec review #12: silently drop unknown mod_ids and return status="success". # The frontend reconciles its sent list against MOD_DB to detect drops. async with pool.acquire() as conn: + await _filter_stale_requires( + conn, sort_result.get("warnings", {}) or {}, + {m.id: m.workshop_id for m in selected_mods if m.workshop_id}, + req.pz_build or "B42", + ) wsid_lookup = await _lookup_wsids_for_missing( conn, sort_result.get("warnings", {}) or {}, pz_build=req.pz_build or "B42", diff --git a/docs/specs/2026-05-06-stale-requires-filter.md b/docs/specs/2026-05-06-stale-requires-filter.md new file mode 100644 index 0000000..0f06583 --- /dev/null +++ b/docs/specs/2026-05-06-stale-requires-filter.md @@ -0,0 +1,117 @@ +# Spec — Stale `require=` filter via Steam Required Items + +**Date:** 2026-05-06 +**Status:** Implemented +**Lineage:** Builds on Spec C (`2026-05-01-build-context-dep-add.md`) — same warning-shaping path, additional filter layer between mlos_sort output and `_lookup_wsids_for_missing` / `build_warnings`. Also opportunistically swaps `worker.fetch_required_wsids` from HTML scrape to the authenticated Steam API. + +## 1. Summary + +Authors update Steam's "Required Items" sidebar per build (it visibly affects subscribe-all behavior), but routinely forget to clean up `mod.info`'s `require=` line when porting a mod from B41 → B42. The result: a B42 mod's mod.info still declares B41-era deps that the author has implicitly retired, and we surface them as **missing-dep warnings on a build the mod doesn't actually need them on**. Tikitown is the canonical case: B42 mod, B42 Required Items = `{Drazion's, Erika's}`, but `mod.info require=` still names `Diederiks Tile Palooza, EN_Newburbs` (both B41-only). Today we warn about both; we should not. + +This spec adds a small filter, `_filter_stale_requires`, that drops a missing-dep entry when **(a)** the dep resolves to a wsid we have cached, **(b)** that wsid is wrong-build for the user's `pz_build`, and **(c)** the source mod's Steam Required Items list does NOT include that wsid. The author has both labelled the dep wrong-build AND removed it from Required Items — strong evidence the `require=` line is stale. + +## 2. Problem + +`mlos_sort` builds `missing_requirements` straight from `mod.info`'s `require=` field. It has no concept of build tags or Steam-side dependency lists. Spec C's `_lookup_wsids_for_missing` already filters wrong-build wsid *suggestions* (so we don't propose adding a B41-only mod to a B42 sort), but the **warning itself still appears** with no actionable button. From the user's perspective the only signal is "Tikitown wants something I can't add" — which is incorrect: tikitown doesn't actually want it on B42, the author just forgot to update mod.info. + +The over-declaration trap (`TacHold requires modoptions` style — declared but not actually needed) is the same shape: wrong-build mod.info dep that's not in Required Items. + +## 3. Heuristic + +For each `(source_mod_id, dep_mod_id)` pair in `mlos_sort`'s `missing_requirements`, drop the dep iff ALL of: + +1. `pz_build` is set (B41 or B42; unknown → no filtering). +2. `source_mod_id` resolves to a wsid we have cached. +3. The source wsid has been **scraped** for Required Items (`workshop_meta.required_scraped_at IS NOT NULL`). The `required_wsids` column itself has `NOT NULL DEFAULT '{}'` so it can't tell us "never scraped" vs. "scraped, no items"; only `required_scraped_at` distinguishes the two. Without this gate the 2741 unscraped wsids in the live cache (as of 2026-05-06, before the authenticated-API backfill) would all look like "author lists no required items" and silently suppress legit warnings. +4. `dep_mod_id` resolves to a wsid via `mod_parsed` (latest-cached row). +5. The dep's wsid has `workshop_meta.tags` indicating it is **wrong-build** for `pz_build` — i.e., `other_tag in tags AND target_tag NOT in tags`. A mod tagged both builds is build-correct and never dropped. +6. The dep's wsid is **NOT** in the source's `required_wsids`. + +If any condition fails, the dep is kept. The filter is conservative: silence requires evidence on every axis. + +### 3.1 Worked examples + +**Tikitown on B42 (the motivating case):** +- Source: tikitown wsid 3037854728, `required_wsids = {3046728955 Drazion's, 3346506593 Erika's}`. +- Dep `Diederiks Tile Palooza` → wsid 2337452747, tags `{Build 40, Build 41}` → wrong-build for B42 → not in `{3046728955, 3346506593}` → **drop**. +- Dep `EN_Newburbs` → wsid 2774834715, tags `{Build 41, ...}` → wrong-build for B42 → not in required → **drop**. +- Dep `tikitown_tiles` → wsid 3046728955, tags include `Build 41, Build 42` → build-correct → **keep** (and the user has it in input anyway, so it doesn't appear as missing). +- Result: warning disappears entirely. + +**TacHold on B42 (over-declaration):** +- Source: TacHold wsid X, `required_wsids = {...}` (no modoptions). +- Dep `modoptions` → wsid Y, tags include `Build 41` only on the legacy wsid → wrong-build → not in required → **drop**. +- The mod runs fine without modoptions; the over-declared dep is silenced. + +**Legitimate missing dep:** +- Source: SomeMod, `required_wsids = {Z}` where Z resolves to mod_id `RealDep`. +- User omitted `RealDep` from input. mod.info: `require=RealDep`. +- `RealDep` is build-correct → **keep**. Warning surfaces with `[add RealDep]`. + +**Source has no Required Items data:** +- New wsid, drained yesterday, `required_wsids` is NULL. +- Filter does nothing for this source's deps → existing behavior. + +**Wrong-build dep that IS in Required Items:** +- Author intentionally requires a B41-only utility mod on a B42 mod (rare but real). +- Wrong-build BUT in required → **keep**. Warning surfaces; current Spec C lookup may still suppress the wrong-build add-button, but that's the existing behavior and out of scope here. + +## 4. Implementation + +### 4.1 New helper: `api/app.py:_filter_stale_requires` + +Single async function, ~80 lines. Mutates `mlos_warnings["missing_requirements"]` in place — same dict that downstream `_lookup_wsids_for_missing` and `adapters.build_warnings` already read from. Two queries: + +```sql +SELECT workshop_id, required_wsids FROM workshop_meta + WHERE workshop_id = ANY($1) AND required_scraped_at IS NOT NULL; + +SELECT DISTINCT ON (mp.mod_id) mp.mod_id, mp.workshop_id, wm.tags + FROM mod_parsed mp JOIN workshop_meta wm ON wm.workshop_id = mp.workshop_id + WHERE mp.mod_id = ANY($1) + ORDER BY mp.mod_id, mp.parsed_at_time_updated DESC; +``` + +Build correctness uses the same `target_tag` / `other_tag` logic as `_lookup_wsids_for_missing` so a future flip to a different rule (or a third build) only has to change one place. + +### 4.2 Call sites + +The filter must run **before** `_lookup_wsids_for_missing` (which would otherwise build wsid suggestions for soon-to-be-dropped deps) and **before** `adapters.build_response → build_warnings` (which is what materializes the warning payload from `missing_requirements`). Three call sites, all in `app.py`: + +1. `/api/sort` sync path (~line 870, after `sort_mods(mods, rules)`). +2. `/api/sort` async-resume path (~line 1198, after `sort_mods(mods, rules)` on the post-drain refetch). +3. `/api/resort` (~line 1417, after `sort_mods(selected_mods, auto_rules)`). + +Each call passes `{m.id: m.workshop_id for m in if m.workshop_id}` for the source map. For resort, the local mods are `selected_mods` (what was sorted) — using `all_mods` would also work but `selected_mods` is the strict superset of source mods that could have generated warnings. + +## 5. Worker swap: HTML scrape → authenticated `GetDetails` + +While we're touching this code path, replace `worker.fetch_required_wsids`'s HTML scraping with the authenticated `IPublishedFileService/GetDetails/v1/?key=…&publishedfileids[0]=…&includechildren=true`. Returns the same `children` array Steam renders into the Required Items sidebar, but: + +- No 429 rate-limiting at our drain rate. +- No throttle / 1h cooldown infrastructure needed. +- More reliable than HTML regex parsing (Steam page markup has changed in the past). + +Required env: `STEAM_WEB_API_KEY` (already in `/opt/sortof/.env`). Without it, the function returns `None` (existing semantics: don't overwrite cached value). Steam returns `result=1` on success; treat anything else as soft failure (also `None`) so a transient lookup miss doesn't clobber a previously good cached value with `[]`. + +Removed code: `_THROTTLE_FILE`, `_COOLDOWN_FILE`, `_MIN_SCRAPE_INTERVAL_S`, `_COOLDOWN_S`, `_read_cooldown_until`, `_write_cooldown_until`, `_throttle_scrape`, `_WORKSHOP_PAGE_URL`, `_RE_REQUIRED_BLOCK`, `_RE_REQUIRED_LINK`, the `import fcntl as _fcntl`, and the rate-limit comment block. `SORTOF_STEAM_MIN_INTERVAL` / `SORTOF_STEAM_COOLDOWN` env knobs are no longer read. + +## 6. Non-goals + +- Mutating `mod_parsed.requirements`. The filter operates on warning generation only; the parsed `require=` field stays as written in `mod.info` (useful for diagnostics and for future rules that may want the raw declaration). +- Surfacing a "this dep was suppressed because it looked stale" debug warning. The filter is silent by design — if it's right, the user never needed to know; if it's wrong, the user can compare against the Workshop page directly. +- Searching Workshop for B42 alternatives to a wrong-build dep. `IPublishedFileService/QueryFiles` text search is too fuzzy to be reliable (search "Diederiks Tile Palooza" + Build 42 → top hit "Rocco's Tiles", entirely unrelated). Out of scope. +- Loosening the build-correct filter in `_lookup_wsids_for_missing` to offer wrong-build add-buttons with a `(B41)` label. Considered and rejected: re-introduces the over-declaration trap. The stale-filter route handles the same cases more cleanly by suppressing the warning entirely instead of offering an action that points at a wrong-build mod. + +## 7. Verification + +Smoke test against the canonical case: + +```bash +curl -sS -X POST http://100.114.205.53:8801/api/sort \ + -H 'Content-Type: application/json' \ + -d '{"input":"3037854728;3046728955;3346506593","pz_build":"B42"}' \ + | jq '.WARNINGS[] | select(.msg | test("tikitown|Diederiks|Newburbs"))' +``` + +Expected: empty (warning suppressed). Before this change: one missing-dep warning naming Diederiks + EN_Newburbs. diff --git a/worker/worker.py b/worker/worker.py index 24c3757..418508b 100644 --- a/worker/worker.py +++ b/worker/worker.py @@ -40,6 +40,7 @@ from mlos_sort import parse_mod_info, ModInfo # noqa: E402 PZ_APP_ID = int(os.environ.get("PZ_APP_ID", "108600")) DEFAULT_DD_PATH = os.environ.get("DD_PATH", "./DepotDownloader") STEAM_API = "https://api.steampowered.com/ISteamRemoteStorage/GetPublishedFileDetails/v1/" +STEAM_AUTHED_DETAILS = "https://api.steampowered.com/IPublishedFileService/GetDetails/v1/" # ----------------------------------------------------------------------------- @@ -71,132 +72,65 @@ def flatten_tags(detail: dict) -> List[str]: return [t.get("tag", "") for t in detail.get("tags", []) if t.get("tag")] -# Public Steam Workshop page URL. The anonymous GetPublishedFileDetails API -# does NOT return `children` for individual mods (only collections), so to -# learn a mod's "Required Items" we have to scrape the public HTML page. -_WORKSHOP_PAGE_URL = "https://steamcommunity.com/sharedfiles/filedetails/?id={wsid}" -_RE_REQUIRED_BLOCK = re.compile( - r']*id="RequiredItems"[^>]*>(.*?)\s*', - re.DOTALL, -) -_RE_REQUIRED_LINK = re.compile(r'filedetails/\?id=(\d+)') - -# ── rate-limit safety for Steam HTML scraping ───────────────────────────── -# Steam aggressively 429s anonymous /sharedfiles/filedetails/ HTML requests; -# during a 2026-05-03 backfill at ~1 RPS our IP was blocked for hours and a -# subsequent single curl probe still got 429. Two file-locked, multi-process -# safeguards now sit in front of every scrape: -# -# 1. THROTTLE FILE — records the timestamp of the last attempted scrape. -# Every worker waits via flock until at least -# `_MIN_SCRAPE_INTERVAL_S` seconds have elapsed since the last one. -# Serializes 4 concurrent drain processes so they can't burst. -# -# 2. COOLDOWN FILE — when we observe a hard 429 (after retries), we write -# `now() + _COOLDOWN_S` here. While active, every fetch returns None -# instantly without touching Steam, preserving cached values until the -# IP block ages out. -# -# Defaults: 6s spacing → ≤10 RPM steady-state, 1h cooldown after a 429 -# storm. Overridable via SORTOF_STEAM_MIN_INTERVAL / SORTOF_STEAM_COOLDOWN. -import fcntl as _fcntl - -_THROTTLE_FILE = "/tmp/sortof_steam_throttle" -_COOLDOWN_FILE = "/tmp/sortof_steam_cooldown" -_MIN_SCRAPE_INTERVAL_S = float(os.environ.get("SORTOF_STEAM_MIN_INTERVAL", "6")) -_COOLDOWN_S = float(os.environ.get("SORTOF_STEAM_COOLDOWN", "3600")) - - -def _read_cooldown_until() -> float: - try: - with open(_COOLDOWN_FILE, "r") as f: - return float(f.read().strip() or 0) - except (OSError, ValueError): - return 0.0 - - -def _write_cooldown_until(epoch_s: float) -> None: - try: - with open(_COOLDOWN_FILE, "w") as f: - f.write(str(epoch_s)) - except OSError: - pass - - -def _throttle_scrape() -> None: - """Block until at least `_MIN_SCRAPE_INTERVAL_S` has elapsed since the - last scrape by ANY drain process (multi-process safe via flock).""" - import time as _t - Path(_THROTTLE_FILE).touch(exist_ok=True) - with open(_THROTTLE_FILE, "r+") as f: - _fcntl.flock(f.fileno(), _fcntl.LOCK_EX) - try: - f.seek(0) - raw = f.read().strip() - last = float(raw) if raw else 0.0 - now = _t.time() - wait = _MIN_SCRAPE_INTERVAL_S - (now - last) - if wait > 0: - _t.sleep(wait) - now = _t.time() - f.seek(0); f.truncate(); f.write(str(now)) - finally: - _fcntl.flock(f.fileno(), _fcntl.LOCK_UN) - - def fetch_required_wsids( workshop_id: str, timeout: int = 15, - max_attempts: int = 4, - backoff_429: float = 30.0, ) -> Optional[List[str]]: - """Scrape the public Workshop page for Required Items wsids. + """Fetch the Required Items wsids for a Workshop item via the + authenticated `IPublishedFileService/GetDetails` endpoint, which + returns the same `children` array Steam renders into the public + page's Required Items sidebar. Returns - None — fetch/parse error, persistent 429, or active cooldown. - Caller MUST NOT overwrite the existing cached value. - [] — page loaded successfully but has no required items section. - list — required item wsids in declaration order, deduped. + None — missing/invalid `STEAM_WEB_API_KEY`, network error, or + non-success result. Caller MUST NOT overwrite the + existing cached value. + [] — item exists but has no Required Items. + list — required item wsids in `sortorder` order, deduped. + + Replaces the previous HTML-scrape path (Steam 429'd anonymous + /sharedfiles/filedetails/ requests aggressively, requiring throttle + + 1h cooldown after a 429 storm). The authenticated API has a far + more generous quota and stays well clear of those limits at our + drain rate. """ - import time as _time - cooldown_until = _read_cooldown_until() - if cooldown_until and _time.time() < cooldown_until: - return None # Steam recently 429'd us — back off entirely. - _throttle_scrape() - url = _WORKSHOP_PAGE_URL.format(wsid=workshop_id) - html: Optional[str] = None - for attempt in range(1, max_attempts + 1): - try: - with httpx.Client(timeout=timeout, follow_redirects=True) as client: - r = client.get(url) - if r.status_code == 429: - if attempt < max_attempts: - _time.sleep(backoff_429 * attempt) - continue - # Final 429 → arm the global cooldown so other workers - # (and this one's next call) skip Steam entirely. - _write_cooldown_until(_time.time() + _COOLDOWN_S) - print(f" ! required_wsids 429 (gave up) for {workshop_id}; " - f"cooldown {int(_COOLDOWN_S)}s armed", file=sys.stderr) - return None - r.raise_for_status() - html = r.text - break - except (httpx.HTTPError, httpx.TimeoutException) as e: - print(f" ! required_wsids fetch failed for {workshop_id}: {e}", - file=sys.stderr) - return None - if html is None: + key = os.environ.get("STEAM_WEB_API_KEY") + if not key: + return None + params = { + "key": key, + "publishedfileids[0]": workshop_id, + "includechildren": "true", + } + try: + with httpx.Client(timeout=timeout) as client: + r = client.get(STEAM_AUTHED_DETAILS, params=params) + r.raise_for_status() + body = r.json() + except (httpx.HTTPError, httpx.TimeoutException, ValueError) as e: + print(f" ! required_wsids fetch failed for {workshop_id}: {e}", + file=sys.stderr) + return None + items = body.get("response", {}).get("publishedfiledetails") or [] + if not items: + return None + item = items[0] + # Steam returns result=1 on success; 9 = file not found, etc. Treat + # anything else as a soft failure so we don't clobber a previously + # cached value with [] on a transient lookup miss. + if item.get("result") != 1: return None - m = _RE_REQUIRED_BLOCK.search(html) - if not m: - return [] seen: set = set() out: List[str] = [] - for w in _RE_REQUIRED_LINK.findall(m.group(1)): - if w not in seen and w != workshop_id: - seen.add(w) - out.append(w) + children = sorted( + item.get("children") or [], + key=lambda c: c.get("sortorder", 0), + ) + for c in children: + wid = c.get("publishedfileid") + if wid and wid not in seen and wid != workshop_id: + seen.add(wid) + out.append(wid) return out