ms-haushalt/scripts/extract-pg-sections.mjs

#!/usr/bin/env node
//
// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
// the Haushaltsplan 2026/2027 Band 1 PDF.
//
// Each Produktgruppe in Münster's NKF-style budget has a stable section
// layout. We use pdftotext -layout to get text with form-feed page
// breaks, find each PG's pages by its running header, then split each
// PG's combined text by its section headings ("Beschreibung",
// "Erläuterungen", etc.).
//
// Usage:
//   node scripts/extract-pg-sections.mjs
// Output:
//   data/extracted/pg-sections-2026.json

import { execSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
const OUT_PATH = "data/extracted/pg-sections-2026.json";

// ── Step 1: PDF → text with form-feed page boundaries ─────────────
const txtPath = join(tmpdir(), "ms-haushalt-band1.txt");
execSync(`pdftotext -layout "${PDF_PATH}" "${txtPath}"`);
const rawText = readFileSync(txtPath, "utf8");
const pages = rawText.split("\f");
console.log(`PDF pages: ${pages.length}`);

// ── Step 2: Group pages by Produktgruppe number ───────────────────
// Each Teilplan page has a running header with this layout:
//   Haushaltsplan 2026/2027    <PG-NAME>             Dezernat <X>
//   Ausschuss: <ABBR>           Produktgruppe NNNN    <Amt>
// The PG number appears on the second header line. The name appears on
// the first header line, sandwiched between the "Haushaltsplan" cell
// and the "Dezernat …" cell.
const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;

const pagesByPg = new Map(); // pgNumber → array of page texts
const namesByPg = new Map(); // pgNumber → display name

for (const page of pages) {
  const headerSlice = page.slice(0, 600);
  const pgMatch = headerSlice.match(HEADER_RE_PG);
  if (!pgMatch) continue;
  const pgNum = pgMatch[1];
  if (!pagesByPg.has(pgNum)) pagesByPg.set(pgNum, []);
  pagesByPg.get(pgNum).push(page);
  if (!namesByPg.has(pgNum)) {
    const nameMatch = headerSlice.match(HEADER_RE_NAME);
    if (nameMatch) namesByPg.set(pgNum, nameMatch[1].trim());
  }
}
console.log(`Produktgruppen found: ${pagesByPg.size}`);

// ── Step 3: Extract sections per PG ───────────────────────────────
const SECTION_HEADINGS = [
  "Beschreibung",
  "Besonderheiten in den Planjahren",
  "Ziele",
  "Zielkennzahlen",
  "Standardkennzahlen",
  "Bewirtschaftungsregeln",
  "Erläuterungen",
];

/** Strip page-running headers/footers and stray page numbers. */
function cleanPageNoise(text) {
  return text
    .split("\n")
    .filter((line) => {
      const t = line.trim();
      if (t === "") return true;
      if (/^Haushaltsplan\s+\d{4}\/\d{4}/.test(t)) return false;
      if (/^Ausschuss:/.test(t)) return false;
      // Page-number lines: a single integer (with possible surrounding ws)
      if (/^\d{1,4}$/.test(t)) return false;
      return true;
    })
    .join("\n");
}

/** Find a section by its heading and return text up to the next known
 *  heading. Headings are matched as full-line tokens — they appear
 *  flush-left, sometimes with trailing colon. */
function extractSection(text, heading, otherHeadings) {
  const startRe = new RegExp(`^${escapeRe(heading)}:?\\s*$`, "m");
  const startMatch = startRe.exec(text);
  if (!startMatch) return null;
  const sliceStart = startMatch.index + startMatch[0].length;
  const tail = text.slice(sliceStart);

  let endIdx = tail.length;
  for (const h of otherHeadings) {
    if (h === heading) continue;
    const r = new RegExp(`^${escapeRe(h)}:?\\s*$`, "m");
    const m = r.exec(tail);
    if (m && m.index < endIdx) endIdx = m.index;
  }
  // Also stop at the start of a per-Produkt block ("Produkt NNNNNN -")
  // and at the financial table headers — these can sit indented with
  // leading whitespace on the line, so allow `^\s*`.
  for (const r of [
    /^\s*Produkt\s+\d{6}\s*-/m,
    /^\s*Teilergebnisplan\b/m,
    /^\s*Teilfinanzplan\b/m,
    /^\s*Investitionsmaßnahmen\b/m,
    /^\s*Investitionen\s+gesamt\b/m,
    /^\s*Verpflichtungsermächtigungen\b/m,
  ]) {
    const m = r.exec(tail);
    if (m && m.index < endIdx) endIdx = m.index;
  }
  return tail.slice(0, endIdx).trim();
}

function escapeRe(s) {
  return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Strip leading + trailing whitespace per line, collapse multiple
 *  blank lines to one. Preserves paragraph breaks. */
function tidyParagraphs(text) {
  if (!text) return text;
  return text
    .split("\n")
    .map((l) => l.trim())
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Parse breakdown tables embedded in Erläuterungen text.
 *
 * Format we recognize (PG 1601's "zu Zeile 01"):
 *   zu Zeile NN:
 *   …prose…
 *   Aufwandsart    Ansatz YYYY in Mio. €    Ansatz YYYY in Mio. €
 *   <name>             <value>                  <value>
 *   …
 *
 * Returns: { lineNumber: [ { name, values: { year: euro } } ] }
 *
 * Numbers in the table are written in Mio. €; we convert to plain
 * euros to match our CSV value units.
 */
function parseBreakdowns(erlText) {
  if (!erlText) return {};
  const out = {};

  // Split into "zu Zeile NN" sections (case-insensitive — some PGs
  // capitalize "Zu Zeile" at the start of a paragraph).
  const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,–-]+\d+)*\s*:?/gi;
  const sections = [];
  let m;
  while ((m = sectionRe.exec(erlText)) !== null) {
    const lineNum = parseInt(m[1], 10);
    sections.push({ lineNum, start: m.index, headerEnd: sectionRe.lastIndex });
  }
  for (let i = 0; i < sections.length; i++) {
    const s = sections[i];
    const end = i + 1 < sections.length ? sections[i + 1].start : erlText.length;
    const body = erlText.slice(s.headerEnd, end);

    // Look for a table header line: at least one "Ansatz YYYY" near a
    // "Mio." unit. If there's no such header, skip the table parse
    // and try the bullet-list fallback at the bottom of this loop.
    const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
    const hMatch = body.match(headerLineRe);
    let tableItems = [];
    if (hMatch) {
      const years = [...hMatch[0].matchAll(/Ansatz\s+(\d{4})/g)].map(
        (m) => Number(m[1])
      );
      if (years.length > 0) {
        tableItems = parseTableRows(body, hMatch, years);
      }
    }

    if (tableItems.length > 0) {
      out[s.lineNum] = tableItems;
      continue;
    }
    // No table — try bullet-list parsing on the section body.
    const bulletItems = parseBulletItems(body);
    if (bulletItems.length > 0) out[s.lineNum] = bulletItems;
  }

  return out;
}

/** Extract table rows from a section body given the matched header
 *  line and the years that header announced. Returns an array of
 *  { name, values: { year: euro } } items. */
function parseTableRows(body, hMatch, years) {
  const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
  const remainder = body.slice(headerEndIdx);
  const rowLines = [];
  for (const line of remainder.split("\n")) {
    const trimmed = line.trim();
    if (trimmed === "") {
      if (rowLines.length > 0) break; // table ended
      continue;
    }
    // Heuristic: a row has a decimal/integer number near the end.
    // Lines without trailing numbers are paragraph continuations.
    if (!/[\d]+(?:[.,]\d+)?\s*$/.test(trimmed)) {
      if (rowLines.length > 0) break;
      continue;
    }
    rowLines.push(trimmed);
  }

  const items = [];
  for (const row of rowLines) {
    const numRe = /(-?\d{1,3}(?:\.\d{3})*(?:,\d+)?|-?\d+(?:,\d+)?)/g;
    const nums = [...row.matchAll(numRe)].map((m) => m[0]);
    if (nums.length < years.length) continue;
    const tailNums = nums.slice(-years.length);
    let name = row;
    for (const n of [...tailNums].reverse()) {
      const idx = name.lastIndexOf(n);
      if (idx >= 0) name = name.slice(0, idx);
    }
    name = name.trim();
    if (!name) continue;

    const values = {};
    for (let k = 0; k < years.length; k++) {
      const raw = tailNums[k];
      const num = parseFloat(raw.replace(/\./g, "").replace(",", "."));
      if (!Number.isFinite(num)) continue;
      values[years[k]] = num * 1_000_000; // Mio. € → €
    }
    if (Object.keys(values).length > 0) items.push({ name, values });
  }
  return items;
}

/**
 * Parse bullet-list breakdowns. Each line starting with "-" or "•"
 * with one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)" patterns
 * becomes an item. Common forms in Münster's Erläuterungen:
 *
 *   - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
 *   - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
 *   - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
 */
function parseBulletItems(text) {
  const items = [];
  const valueRe =
    /(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
  // Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to
  // just "(YYYY)" so the per-item parser sees a clean
  // `value Euro (year)` shape. The PG 0301 line 13/16 bullets use
  // this nested form to reference the category total inline; we keep
  // the year tag (which the item's value belongs to) and drop the
  // total reference.
  const gesamtRe =
    /\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;

  for (const rawLine of text.split("\n")) {
    if (!/^\s*[-•]\s+/.test(rawLine)) continue;
    const line = rawLine.replace(gesamtRe, "($1)");

    const values = [];
    let firstValIdx = -1;
    let m;
    valueRe.lastIndex = 0;
    while ((m = valueRe.exec(line)) !== null) {
      if (firstValIdx === -1) firstValIdx = m.index;
      const num = parseFloat(
        m[1].replace(/\./g, "").replace(",", ".")
      );
      if (!Number.isFinite(num)) continue;
      const isMio = /Mio/i.test(m[2]);
      const value = isMio ? num * 1_000_000 : num;
      const year = parseInt(m[3], 10);
      values.push({ year, value });
    }
    if (values.length === 0) continue;

    // The name is everything before the first value, minus the
    // bullet marker and trailing value indicators.
    let name = line.slice(0, firstValIdx).replace(/^\s*[-•]\s+/, "").trim();
    name = name
      .replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
      .replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
      .replace(/\s+[-–]\s+hier:.*$/i, "") // drop "- hier: ..." asides
      .trim();
    if (!name || name.length < 2) continue;

    const valuesByYear = {};
    for (const v of values) valuesByYear[v.year] = v.value;
    items.push({ name, values: valuesByYear });
  }
  return items;
}

const out = {};
let withBeschreibung = 0;
let withErlauterungen = 0;
for (const [pgNum, pgPages] of pagesByPg) {
  const fullText = cleanPageNoise(pgPages.join("\n\n"));
  const beschreibung = tidyParagraphs(
    extractSection(fullText, "Beschreibung", SECTION_HEADINGS)
  );
  const erlaeuterungen = tidyParagraphs(
    extractSection(fullText, "Erläuterungen", SECTION_HEADINGS)
  );
  if (beschreibung) withBeschreibung++;
  if (erlaeuterungen) withErlauterungen++;
  const breakdowns = parseBreakdowns(erlaeuterungen);
  out[pgNum] = {
    pgNumber: pgNum,
    name: namesByPg.get(pgNum) ?? "",
    beschreibung: beschreibung ?? null,
    erlaeuterungen: erlaeuterungen ?? null,
    breakdowns, // { lineNum: [ { name, values: { year: euro } } ] }
  };
}

console.log(`  with Beschreibung:   ${withBeschreibung}`);
console.log(`  with Erläuterungen:  ${withErlauterungen}`);

// ── Step 4: Write JSON ────────────────────────────────────────────
mkdirSync("data/extracted", { recursive: true });
writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8");
console.log(`Wrote ${OUT_PATH}`);