// boocode/packages/ion/src/format/sop-parser.ts

/**
 * SOP Markdown parser for the Ion workflow engine.
 *
 * Parses `.sop.md` files (Agent SOP format) into structured `SopDocument`
 * objects that can be converted to YAML workflow definitions.
 */

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------

/**
 * One entry of the SOP's `## Parameters` section.
 *
 * Produced by `parseParameters` from list items shaped like
 * `- **name** (required): description` or
 * `- **name** (optional, default: value): description`.
 */
export interface SopParameter {
  /** Identifier between the `**` markers; word characters only (letters, digits, `_`). */
  name: string;
  /** The keyword written inside the parentheses of the declaration. */
  type: 'required' | 'optional';
  /**
   * Trimmed text of the `default: …` clause. Absent when the declaration has
   * no such clause. The parser does not itself restrict the clause to
   * optional parameters.
   */
  default?: string;
  /** Remainder of the declaration line after the `):` separator. */
  description: string;
}

/**
 * One step of the SOP's `## Steps` section.
 *
 * Produced by `parseSteps` from a `### N. Name` sub-heading and the text that
 * follows it up to the next step heading.
 */
export interface SopStep {
  /** The integer written in the step heading (`N` in `### N. Name`). */
  number: number;
  /** Trimmed heading text after the `N.` prefix. */
  name: string;
  /**
   * Trimmed step text. When the step contains a `**Constraints:**` marker,
   * only the text before that marker is kept here.
   */
  body: string;
  /**
   * Trimmed text following the `**Constraints:**` marker. Omitted when the
   * step has no marker or the text after it is empty.
   */
  constraints?: string;
}

/** Structured result of `parseSopContent` for one `.sop.md` document. */
export interface SopDocument {
  /** Text of the first `# heading` line, or `'Untitled SOP'` when there is none. */
  title: string;
  /** Body of the `## Overview` section; empty string when the section is absent. */
  overview: string;
  /** Entries parsed from `## Parameters`; empty array when the section is absent or empty. */
  parameters: SopParameter[];
  /** Entries parsed from `## Steps`; empty array when the section is absent or empty. */
  steps: SopStep[];
  /** Body of the `## Examples` section; the property is omitted when the section is absent. */
  examples?: string;
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Extract the body of a level-2 section from markdown text.
 *
 * The section begins at the first line of the form `## <heading>` and runs
 * until the next level-1 or level-2 heading (a line starting with `# ` or
 * `## `) or the end of the input. Deeper headings such as `### ` belong to
 * the body, so sub-sections are returned intact.
 *
 * Known limitation: a line starting with `# ` or `## ` inside a fenced code
 * block is still treated as a heading and ends the section.
 *
 * @param markdown - Full markdown source to search.
 * @param heading - Heading text to look for; matched literally, never as a
 *                  regular expression.
 * @returns The trimmed section body (possibly empty), or `null` when no line
 *          matches the heading.
 */
function extractSection(markdown: string, heading: string): string | null {
  // The `m` flag is used only to anchor on one complete heading line. The
  // body is sliced by position rather than captured, because under `m` a `$`
  // would match at every line end and cut a lazy capture after one line.
  const headingLine = new RegExp(
    `^##[ \\t]+${escapeRegex(heading)}[ \\t]*$`,
    'm',
  );
  const headingStart = markdown.search(headingLine);
  if (headingStart === -1) {
    return null;
  }

  const lineEnd = markdown.indexOf('\n', headingStart);
  if (lineEnd === -1) {
    // The heading is the last line of the input: the section exists but is empty.
    return '';
  }

  const rest = markdown.slice(lineEnd + 1);
  // Only `# ` and `## ` terminate the section; `### ` and deeper do not.
  const nextHeading = rest.search(/^#{1,2}[ \t]/m);
  const body = nextHeading === -1 ? rest : rest.slice(0, nextHeading);
  return body.trim();
}

/**
 * Escape every regex metacharacter in `str` so that it can be embedded in a
 * `RegExp` source and match itself literally.
 *
 * @param str - Literal text to escape.
 * @returns `str` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// ---------------------------------------------------------------------------
// Section parsers
// ---------------------------------------------------------------------------

/**
 * Turn the raw text of the Parameters section into `SopParameter` entries.
 *
 * Recognised declarations look like:
 *   - **paramName** (required): Description here
 *   - **paramName** (optional, default: value): Description here
 * Lines that do not fit this shape are ignored.
 *
 * @param raw - Body text of the Parameters section.
 * @returns One entry per matching declaration, in document order.
 */
function parseParameters(raw: string): SopParameter[] {
  const declaration =
    /^-\s+\*\*(\w+)\*\*\s+\((required|optional)(?:,\s*default:\s*([^)]+))?\):\s+(.+)$/gm;

  return Array.from(raw.matchAll(declaration), (hit) => {
    const entry: SopParameter = {
      name: hit[1]!,
      type: hit[2]! as SopParameter['type'],
      description: hit[4]!,
    };

    // Group 3 is only populated when a `default: …` clause was written.
    const fallback = hit[3];
    if (fallback !== undefined) {
      entry.default = fallback.trim();
    }
    return entry;
  });
}

/**
 * Turn the raw text of the Steps section into `SopStep` entries.
 *
 * Each step starts at a `### N. Name` sub-heading and extends to the next
 * such heading or the end of the text. Inside a step, everything before a
 * `**Constraints:**` marker is the body, and the text after the marker is
 * the constraints.
 *
 * @param raw - Body text of the Steps section.
 * @returns One entry per step heading, in document order.
 */
function parseSteps(raw: string): SopStep[] {
  const headingHits = [...raw.matchAll(/^###\s+(\d+)\.\s+(.+)$/gm)];

  return headingHits.map((hit, position) => {
    const headingAt = hit.index ?? 0;
    // A step runs up to the next step heading, or to the end of the text.
    const stepEnd = headingHits[position + 1]?.index ?? raw.length;

    // Skip the heading line itself; with no newline after it there is no body.
    const newlineAt = raw.indexOf('\n', headingAt);
    const stepText =
      newlineAt === -1 ? '' : raw.slice(newlineAt + 1, stepEnd).trim();

    const marker = /\*\*Constraints:\*\*\s*\n([\s\S]*?)(?=\n###|\n##|$)/.exec(
      stepText,
    );

    const step: SopStep = {
      number: parseInt(hit[1]!, 10),
      name: hit[2]!.trim(),
      body: marker === null ? stepText : stepText.slice(0, marker.index).trim(),
    };

    // Empty constraint text is treated the same as no constraints at all.
    const constraintText = marker?.[1]?.trim();
    if (constraintText) {
      step.constraints = constraintText;
    }
    return step;
  });
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Build a `SopDocument` from the text of a `.sop.md` file.
 *
 * The title comes from the first `# ` heading; the Overview, Parameters,
 * Steps and Examples sections are looked up by their `## ` headings. Missing
 * pieces fall back to defaults rather than raising an error.
 *
 * @param markdown - The raw markdown content of a `.sop.md` file.
 * @returns The parsed document. `title` defaults to `'Untitled SOP'`,
 *          `overview` to `''`, `parameters` and `steps` to empty arrays, and
 *          `examples` is omitted when there is no Examples section.
 */
export function parseSopContent(markdown: string): SopDocument {
  const firstHeading = /^#\s+(.+)$/m.exec(markdown);
  const parametersText = extractSection(markdown, 'Parameters');
  const stepsText = extractSection(markdown, 'Steps');
  const examplesText = extractSection(markdown, 'Examples');

  const doc: SopDocument = {
    title: firstHeading?.[1]?.trim() ?? 'Untitled SOP',
    overview: extractSection(markdown, 'Overview') ?? '',
    // An absent section (null) and an empty one ('') both yield an empty list.
    parameters: parametersText ? parseParameters(parametersText) : [],
    steps: stepsText ? parseSteps(stepsText) : [],
  };

  // Examples is optional: only attach the property when the section exists.
  if (examplesText !== null) {
    doc.examples = examplesText;
  }
  return doc;
}