Add full sortof codebase: API, drain workers, frontend, schema, specs
This commit is contained in:
210
precacher/precacher.py
Normal file
210
precacher/precacher.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""sortof precacher (Spec E): warm the cache by enqueueing the top-N PZ
|
||||
Workshop wsids across four time windows (3 months, 6 months, 1 year, all time)
|
||||
that aren't already known.
|
||||
|
||||
Pure feeder for the existing drain pipeline. Inserts into download_jobs and
|
||||
returns; the drain workers (sortof-drain@1..4) handle the actual DD pulls.
|
||||
|
||||
Run on demand:
|
||||
/opt/sortof/worker/.venv/bin/python /opt/sortof/precacher/precacher.py
|
||||
|
||||
Reuses the worker's venv (httpx + asyncpg) since dependencies overlap exactly.
|
||||
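Reads DATABASE_URL (or, when it is unset, the POSTGRES_* parts) from the .env
file one directory above this script - /opt/sortof/.env when installed as above.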
|
||||
|
||||
Skip rule: a wsid is "already known" iff a row exists in mod_parsed for it
|
||||
(any state) OR a row exists in download_jobs for it (any status). This is
|
||||
deliberately conservative - we never re-queue a wsid the system has seen
|
||||
before, including ones that previously failed (banned, deleted, no_mod_info).
|
||||
Forced re-queue is the API's job, not the precacher's.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set, Tuple
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# The .env file lives one directory above the directory holding this script.
_SCRIPT_DIR = Path(__file__).resolve().parent
ENV_PATH = _SCRIPT_DIR.parent / ".env"
|
||||
|
||||
|
||||
def _build_dsn() -> str:
    """Return the Postgres DSN for this run.

    Loads ENV_PATH into the environment first, then prefers DATABASE_URL
    verbatim; otherwise assembles a postgresql:// URL from the POSTGRES_*
    parts (same precedence the original note attributes to api/db.py).

    Both the user and the password are percent-encoded so that characters
    such as '@', ':' or '/' in either credential cannot corrupt the URL.

    Raises KeyError if DATABASE_URL is unset and any of POSTGRES_USER,
    POSTGRES_PASSWORD or POSTGRES_DB is missing. Host and port default to
    127.0.0.1 and 5439.
    """
    load_dotenv(ENV_PATH)
    explicit = os.environ.get("DATABASE_URL")
    if explicit:
        return explicit
    # Required parts are read in a fixed order so the KeyError names the
    # first missing variable deterministically.
    user = urllib.parse.quote(os.environ["POSTGRES_USER"], safe="")
    pw = urllib.parse.quote(os.environ["POSTGRES_PASSWORD"], safe="")
    name = os.environ["POSTGRES_DB"]
    host = os.environ.get("POSTGRES_HOST", "127.0.0.1")
    port = os.environ.get("POSTGRES_PORT", "5439")
    return f"postgresql://{user}:{pw}@{host}:{port}/{name}"
|
||||
|
||||
# Steam app id used for every Workshop browse request.
PZ_APPID = 108600
BROWSE_URL = "https://steamcommunity.com/workshop/browse/"

# Paging / pacing knobs.
PER_PAGE = 30  # Steam HTML default; observed cap.
RATE_LIMIT_S = 0.6  # polite gap between page fetches.
DEFAULT_TARGET = 1000  # wsids to collect per window when no CLI argument is given.

# One entry per time window: (label, browsesort, days).
# The days value is only sent to Steam when it is > 0; the "all" window uses
# -1 because Steam's totalvotes sort doesn't take a days param.
WINDOWS: List[Tuple[str, str, int]] = [
    ("3m", "mostpopular", 90),
    ("6m", "mostpopular", 180),
    ("1y", "mostpopular", 365),
    ("all", "totalvotes", -1),
]

# Captures the numeric id (6 to 12 digits) from data-publishedfileid attributes.
WSID_RE = re.compile(r'data-publishedfileid="(\d{6,12})"')

log = logging.getLogger("sortof.precacher")
|
||||
|
||||
|
||||
async def fetch_page(http: httpx.AsyncClient, sort: str, days: int, page: int) -> List[str]:
    """Fetch one Workshop browse page and return the wsids found in its HTML.

    `sort` is passed as browsesort and `page` as p; `days` is only sent when
    it is positive. Ids are returned in first-appearance order with
    duplicates removed. Raises whatever httpx raises for transport errors or
    a non-success status (via raise_for_status).
    """
    query: Dict[str, object] = dict(
        appid=PZ_APPID,
        browsesort=sort,
        section="readytouseitems",
        p=page,
        numperpage=PER_PAGE,
    )
    if days > 0:
        query["days"] = days

    response = await http.get(BROWSE_URL, params=query, timeout=30.0)
    response.raise_for_status()

    # The same wsid can appear in multiple HTML blocks on one page; keep only
    # the first occurrence of each while preserving page order.
    ordered: Dict[str, None] = {}
    for match in WSID_RE.finditer(response.text):
        ordered.setdefault(match.group(1), None)
    return list(ordered)
|
||||
|
||||
|
||||
async def collect_top_wsids(
    http: httpx.AsyncClient, sort: str, days: int, target: int,
) -> List[str]:
    """Page through the browse listing until `target` distinct wsids are held.

    Stops early when a page fetch raises httpx.HTTPError (logged as a
    warning), when two pages in a row yield no ids, or when a non-empty page
    contributes nothing new. Returns the ids in discovery order; the list may
    be shorter than `target`.
    """
    collected: Dict[str, None] = {}  # insertion-ordered set of wsids
    page_no = 1
    empty_streak = 0

    while len(collected) < target:
        try:
            page_ids = await fetch_page(http, sort, days, page_no)
        except httpx.HTTPError as exc:
            log.warning("page %d fetch failed: %s", page_no, exc)
            break

        if page_ids:
            empty_streak = 0
            size_before = len(collected)
            for wsid in page_ids:
                if len(collected) >= target:
                    break
                collected.setdefault(wsid, None)
            # A non-empty page made of ids we already hold means Steam is
            # cycling; further pages won't help.
            if len(collected) == size_before:
                break
        else:
            empty_streak += 1
            if empty_streak >= 2:
                break

        page_no += 1
        await asyncio.sleep(RATE_LIMIT_S)

    return list(collected)
|
||||
|
||||
|
||||
async def already_known(conn: asyncpg.Connection, wsids: List[str]) -> Set[str]:
    """Return the subset of `wsids` that already has a row in mod_parsed or
    in download_jobs. No status filter is applied, so anything the system
    has touched before counts as known. An empty input short-circuits to an
    empty set without querying."""
    if len(wsids) == 0:
        return set()

    lookup_sql = """
        SELECT workshop_id FROM mod_parsed WHERE workshop_id = ANY($1::text[])
        UNION
        SELECT workshop_id FROM download_jobs WHERE workshop_id = ANY($1::text[])
        """
    known: Set[str] = set()
    for record in await conn.fetch(lookup_sql, wsids):
        known.add(record["workshop_id"])
    return known
|
||||
|
||||
|
||||
async def enqueue(conn: asyncpg.Connection, wsids: List[str]) -> int:
    """Queue each wsid in download_jobs with status 'queued' unless a row for
    it already exists, and return how many rows were inserted.

    Each wsid gets its own transaction wrapping the existence check and the
    insert. NOTE(review): the original comment calls this race-safe; whether
    that holds depends on isolation level / constraints not visible here -
    confirm against the schema."""
    probe_sql = "SELECT 1 FROM download_jobs WHERE workshop_id = $1 LIMIT 1"
    insert_sql = "INSERT INTO download_jobs (workshop_id, status) VALUES ($1, 'queued')"

    inserted = 0
    for wsid in wsids:
        async with conn.transaction():
            if await conn.fetchval(probe_sql, wsid) is not None:
                # Already present: leave the (empty) transaction and move on.
                continue
            await conn.execute(insert_sql, wsid)
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
async def main(target: int = DEFAULT_TARGET) -> int:
    """Run one precache pass over every window in WINDOWS.

    For each window: collect up to `target` wsids from Steam, drop the ones
    the database already knows about, and enqueue the rest in download_jobs.

    Returns a process exit code: 0 on completion, 2 when a required
    environment variable for the DSN is missing. Errors from opening the
    pool or from database calls propagate to the caller.
    """
    try:
        dsn = _build_dsn()
    except KeyError as e:
        log.error("missing required env var: %s", e)
        return 2

    totals = {"fetched": 0, "skipped_known": 0, "enqueued": 0}

    # Both resources are managed by `async with` so each one is released even
    # if creating or closing the other fails. Exit order matches the previous
    # manual cleanup: HTTP client first, then the pool.
    async with asyncpg.create_pool(dsn=dsn, min_size=1, max_size=2) as pool:
        async with httpx.AsyncClient(
            headers={"User-Agent": "sortof-precacher/1.0"},
        ) as http:
            for label, sort, days in WINDOWS:
                log.info(
                    "window=%s sort=%s days=%d: collecting up to %d wsids",
                    label, sort, days, target,
                )
                wsids = await collect_top_wsids(http, sort, days, target)
                log.info("window=%s: collected %d wsids", label, len(wsids))

                async with pool.acquire() as conn:
                    known = await already_known(conn, wsids)
                    fresh = [w for w in wsids if w not in known]
                    inserted = await enqueue(conn, fresh)

                log.info(
                    "window=%s: known=%d fresh=%d enqueued=%d",
                    label, len(known), len(fresh), inserted,
                )
                # Windows overlap, so ids queued by an earlier window show up
                # as "known" in later ones; totals are per-window sums.
                totals["fetched"] += len(wsids)
                totals["skipped_known"] += len(known)
                totals["enqueued"] += inserted

    log.info(
        "precache run done: fetched=%d skipped_known=%d enqueued=%d",
        totals["fetched"], totals["skipped_known"], totals["enqueued"],
    )
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: configure logging, read the optional per-window
    # target from the first CLI argument, and exit with main()'s return code.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=logging.INFO,
    )
    if len(sys.argv) > 1:
        target = int(sys.argv[1])
    else:
        target = DEFAULT_TARGET
    exit_code = asyncio.run(main(target))
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user