"""sortof precacher (Spec E): warm the cache by enqueueing the top-N PZ Workshop wsids across four time windows (3 months, 6 months, 1 year, all time) that aren't already known. Pure feeder for the existing drain pipeline. Inserts into download_jobs and returns; the drain workers (sortof-drain@1..4) handle the actual DD pulls. Run on demand: /opt/sortof/worker/.venv/bin/python /opt/sortof/precacher/precacher.py Reuses the worker's venv (httpx + asyncpg) since dependencies overlap exactly. Reads DATABASE_URL from /opt/sortof/.env. Skip rule: a wsid is "already known" iff a row exists in mod_parsed for it (any state) OR a row exists in download_jobs for it (any status). This is deliberately conservative - we never re-queue a wsid the system has seen before, including ones that previously failed (banned, deleted, no_mod_info). Forced re-queue is the API's job, not the precacher's. """ from __future__ import annotations import asyncio import logging import os import re import sys import urllib.parse from pathlib import Path from typing import Dict, List, Set, Tuple import asyncpg import httpx from dotenv import load_dotenv ENV_PATH = Path(__file__).resolve().parent.parent / ".env" def _build_dsn() -> str: """Mirror api/db.py: prefer DATABASE_URL, else build from POSTGRES_* parts.""" load_dotenv(ENV_PATH) explicit = os.environ.get("DATABASE_URL") if explicit: return explicit user = os.environ["POSTGRES_USER"] pw = urllib.parse.quote(os.environ["POSTGRES_PASSWORD"], safe="") name = os.environ["POSTGRES_DB"] host = os.environ.get("POSTGRES_HOST", "127.0.0.1") port = os.environ.get("POSTGRES_PORT", "5439") return f"postgresql://{user}:{pw}@{host}:{port}/{name}" PZ_APPID = 108600 BROWSE_URL = "https://steamcommunity.com/workshop/browse/" PER_PAGE = 30 # Steam HTML default; observed cap. RATE_LIMIT_S = 0.6 # polite gap between page fetches. DEFAULT_TARGET = 1000 # Window label -> (browsesort, days param). days < 0 means "all time" - # Steam's totalvotes sort doesn't take a days param. WINDOWS: List[Tuple[str, str, int]] = [ ("3m", "mostpopular", 90), ("6m", "mostpopular", 180), ("1y", "mostpopular", 365), ("all", "totalvotes", -1), ] WSID_RE = re.compile(r'data-publishedfileid="(\d{6,12})"') log = logging.getLogger("sortof.precacher") async def fetch_page(http: httpx.AsyncClient, sort: str, days: int, page: int) -> List[str]: params: Dict[str, object] = { "appid": PZ_APPID, "browsesort": sort, "section": "readytouseitems", "p": page, "numperpage": PER_PAGE, } if days > 0: params["days"] = days r = await http.get(BROWSE_URL, params=params, timeout=30.0) r.raise_for_status() # De-dupe within the page (the same wsid can appear in multiple HTML blocks). return list(dict.fromkeys(WSID_RE.findall(r.text))) async def collect_top_wsids( http: httpx.AsyncClient, sort: str, days: int, target: int, ) -> List[str]: """Walk pages until we have `target` distinct wsids or pagination exhausts.""" seen: Set[str] = set() out: List[str] = [] page = 1 consecutive_empty = 0 while len(out) < target: try: ids = await fetch_page(http, sort, days, page) except httpx.HTTPError as e: log.warning("page %d fetch failed: %s", page, e) break if not ids: consecutive_empty += 1 if consecutive_empty >= 2: break else: consecutive_empty = 0 added = 0 for wid in ids: if wid in seen: continue seen.add(wid) out.append(wid) added += 1 if len(out) >= target: break # If a page returns only duplicates we've already seen, we're cycling. if ids and added == 0: break page += 1 await asyncio.sleep(RATE_LIMIT_S) return out async def already_known(conn: asyncpg.Connection, wsids: List[str]) -> Set[str]: """Returns the subset of wsids that the system has seen - either parsed in mod_parsed or sitting in download_jobs (any status). The conservative superset; precacher never re-queues anything previously touched.""" if not wsids: return set() rows = await conn.fetch( """ SELECT workshop_id FROM mod_parsed WHERE workshop_id = ANY($1::text[]) UNION SELECT workshop_id FROM download_jobs WHERE workshop_id = ANY($1::text[]) """, wsids, ) return {r["workshop_id"] for r in rows} async def enqueue(conn: asyncpg.Connection, wsids: List[str]) -> int: """INSERT each wsid into download_jobs. Mirrors the API's queue-and-dedup pattern: skip if a row already exists (race-safe via per-iteration tx).""" n = 0 for wid in wsids: async with conn.transaction(): existing = await conn.fetchval( "SELECT 1 FROM download_jobs WHERE workshop_id = $1 LIMIT 1", wid, ) if existing is None: await conn.execute( "INSERT INTO download_jobs (workshop_id, status) VALUES ($1, 'queued')", wid, ) n += 1 return n async def main(target: int = DEFAULT_TARGET) -> int: try: dsn = _build_dsn() except KeyError as e: log.error("missing required env var: %s", e) return 2 pool = await asyncpg.create_pool(dsn=dsn, min_size=1, max_size=2) http = httpx.AsyncClient(headers={"User-Agent": "sortof-precacher/1.0"}) totals = {"fetched": 0, "skipped_known": 0, "enqueued": 0} try: for label, sort, days in WINDOWS: log.info( "window=%s sort=%s days=%d: collecting up to %d wsids", label, sort, days, target, ) wsids = await collect_top_wsids(http, sort, days, target) log.info("window=%s: collected %d wsids", label, len(wsids)) async with pool.acquire() as conn: known = await already_known(conn, wsids) fresh = [w for w in wsids if w not in known] inserted = await enqueue(conn, fresh) log.info( "window=%s: known=%d fresh=%d enqueued=%d", label, len(known), len(fresh), inserted, ) totals["fetched"] += len(wsids) totals["skipped_known"] += len(known) totals["enqueued"] += inserted finally: await http.aclose() await pool.close() log.info( "precache run done: fetched=%d skipped_known=%d enqueued=%d", totals["fetched"], totals["skipped_known"], totals["enqueued"], ) return 0 if __name__ == "__main__": logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(name)s %(message)s", ) target = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_TARGET sys.exit(asyncio.run(main(target)))