# sortof/api/parse.py

"""Parse a raw textarea blob into a deduped, ordered list of workshop IDs."""

from __future__ import annotations

import re
from typing import List


def parse_workshop_input(text: str) -> List[str]:
    """Extract workshop IDs from a free-form text blob.

    A leading ``WorkshopItems=``, ``Mods=`` or ``Map=`` key (case-insensitive,
    at the start of any line) is stripped, then every standalone run of
    7-12 digits is collected. Digit runs shorter than 7 or longer than 12
    are ignored.

    Args:
        text: Raw input, e.g. the contents of a textarea. Empty or ``None``
            input yields an empty list, matching ``parse_with_collections``.

    Returns:
        The IDs as strings, deduplicated, in first-seen order.
    """
    if not text:
        return []

    cleaned = re.sub(
        r"^\s*(WorkshopItems|Mods|Map)\s*=\s*",
        "",
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    # dict.fromkeys preserves insertion order, so this dedupes while keeping
    # the first occurrence of each ID in place.
    return list(dict.fromkeys(re.findall(r"\b\d{7,12}\b", cleaned)))


# Steam Workshop URL form:
#   https://steamcommunity.com/{sharedfiles,workshop}/filedetails/?id=NNNNNNN
# The trailing (?!\d) rejects an id of more than 12 digits outright instead of
# capturing its first 12 digits and leaving the remainder behind in the text,
# where it could be mistaken for a bare ID.
_STEAM_URL_RE = re.compile(
    r"https?://steamcommunity\.com/(?:sharedfiles|workshop)/filedetails/"
    r"\?id=(\d{7,12})(?!\d)",
    re.IGNORECASE,
)


def parse_with_collections(text: str) -> tuple[List[str], List[str]]:
    """Split an input blob into bare wsids and candidate collection IDs.

    A "candidate collection" is any 7-12-digit ID that appears inside a
    Steam Workshop URL. Bare numeric IDs in the same blob are treated as
    mod wsids. Steam doesn't syntactically distinguish collection IDs from
    mod IDs; the candidate list is sent to GetCollectionDetails to confirm,
    and the caller moves any candidate that isn't actually a collection
    back into the wsids.

    An ID that appears both inside a URL and as a bare number is reported
    only as a candidate collection. URLs whose id has more than 12 digits
    are ignored entirely, just like over-long bare numbers.

    Args:
        text: Raw input blob. Empty or ``None`` input yields ``([], [])``.

    Returns:
        ``(wsids, collection_ids)``, each deduped and in first-seen order.
    """
    if not text:
        return ([], [])

    # 1. URL-form IDs first, so they are not double-counted as bare IDs.
    #    The dict doubles as an ordered, O(1)-lookup "seen" set.
    url_seen = dict.fromkeys(m.group(1) for m in _STEAM_URL_RE.finditer(text))

    # 2. Strip the URLs out before extracting bare numbers.
    text_minus_urls = _STEAM_URL_RE.sub("", text)

    # 3. Bare wsids: same extraction as parse_workshop_input.
    cleaned = re.sub(
        r"^\s*(WorkshopItems|Mods|Map)\s*=\s*",
        "",
        text_minus_urls,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    bare_seen = dict.fromkeys(re.findall(r"\b\d{7,12}\b", cleaned))
    wsids = [wsid for wsid in bare_seen if wsid not in url_seen]

    return (wsids, list(url_seen))