#!/usr/bin/env node // // Extract per-Produktgruppe Beschreibung + Erläuterungen sections from // the Haushaltsplan 2026/2027 Band 1 PDF. // // Each Produktgruppe in Münster's NKF-style budget has a stable section // layout. We use pdftotext -layout to get text with form-feed page // breaks, find each PG's pages by its running header, then split each // PG's combined text by its section headings ("Beschreibung", // "Erläuterungen", etc.). // // Usage: // node scripts/extract-pg-sections.mjs // Output: // data/extracted/pg-sections-2026.json import { execSync } from "node:child_process"; import { mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf"; const OUT_PATH = "data/extracted/pg-sections-2026.json"; // ── Step 1: PDF → text with form-feed page boundaries ───────────── const txtPath = join(tmpdir(), "ms-haushalt-band1.txt"); execSync(`pdftotext -layout "${PDF_PATH}" "${txtPath}"`); const rawText = readFileSync(txtPath, "utf8"); const pages = rawText.split("\f"); console.log(`PDF pages: ${pages.length}`); // ── Step 2: Group pages by Produktgruppe number ─────────────────── // Each Teilplan page has a running header with this layout: // Haushaltsplan 2026/2027 Dezernat // Ausschuss: Produktgruppe NNNN // The PG number appears on the second header line. The name appears on // the first header line, sandwiched between the "Haushaltsplan" cell // and the "Dezernat …" cell. const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/; const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m; const pagesByPg = new Map(); // pgNumber → array of page texts const namesByPg = new Map(); // pgNumber → display name for (const page of pages) { const headerSlice = page.slice(0, 600); const pgMatch = headerSlice.match(HEADER_RE_PG); if (!pgMatch) continue; const pgNum = pgMatch[1]; if (!pagesByPg.has(pgNum)) pagesByPg.set(pgNum, []); pagesByPg.get(pgNum).push(page); if (!namesByPg.has(pgNum)) { const nameMatch = headerSlice.match(HEADER_RE_NAME); if (nameMatch) namesByPg.set(pgNum, nameMatch[1].trim()); } } console.log(`Produktgruppen found: ${pagesByPg.size}`); // ── Step 3: Extract sections per PG ─────────────────────────────── const SECTION_HEADINGS = [ "Beschreibung", "Besonderheiten in den Planjahren", "Ziele", "Zielkennzahlen", "Standardkennzahlen", "Bewirtschaftungsregeln", "Erläuterungen", ]; /** Strip page-running headers/footers and stray page numbers. */ function cleanPageNoise(text) { return text .split("\n") .filter((line) => { const t = line.trim(); if (t === "") return true; if (/^Haushaltsplan\s+\d{4}\/\d{4}/.test(t)) return false; if (/^Ausschuss:/.test(t)) return false; // Page-number lines: a single integer (with possible surrounding ws) if (/^\d{1,4}$/.test(t)) return false; return true; }) .join("\n"); } /** Find a section by its heading and return text up to the next known * heading. Headings are matched as full-line tokens — they appear * flush-left, sometimes with trailing colon. */ function extractSection(text, heading, otherHeadings) { const startRe = new RegExp(`^${escapeRe(heading)}:?\\s*$`, "m"); const startMatch = startRe.exec(text); if (!startMatch) return null; const sliceStart = startMatch.index + startMatch[0].length; const tail = text.slice(sliceStart); let endIdx = tail.length; for (const h of otherHeadings) { if (h === heading) continue; const r = new RegExp(`^${escapeRe(h)}:?\\s*$`, "m"); const m = r.exec(tail); if (m && m.index < endIdx) endIdx = m.index; } // Also stop at the start of a per-Produkt block ("Produkt NNNNNN -") // and at the financial table headers — these can sit indented with // leading whitespace on the line, so allow `^\s*`. for (const r of [ /^\s*Produkt\s+\d{6}\s*-/m, /^\s*Teilergebnisplan\b/m, /^\s*Teilfinanzplan\b/m, /^\s*Investitionsmaßnahmen\b/m, /^\s*Investitionen\s+gesamt\b/m, /^\s*Verpflichtungsermächtigungen\b/m, ]) { const m = r.exec(tail); if (m && m.index < endIdx) endIdx = m.index; } return tail.slice(0, endIdx).trim(); } function escapeRe(s) { return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } /** Strip leading + trailing whitespace per line, collapse multiple * blank lines to one. Preserves paragraph breaks. */ function tidyParagraphs(text) { if (!text) return text; return text .split("\n") .map((l) => l.trim()) .join("\n") .replace(/\n{3,}/g, "\n\n") .trim(); } /** * Parse breakdown tables embedded in Erläuterungen text. * * Format we recognize (PG 1601's "zu Zeile 01"): * zu Zeile NN: * …prose… * Aufwandsart Ansatz YYYY in Mio. € Ansatz YYYY in Mio. € * * … * * Returns: { lineNumber: [ { name, values: { year: euro } } ] } * * Numbers in the table are written in Mio. €; we convert to plain * euros to match our CSV value units. */ function parseBreakdowns(erlText) { if (!erlText) return {}; const out = {}; // Split into "zu Zeile NN" sections (case-insensitive — some PGs // capitalize "Zu Zeile" at the start of a paragraph). const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,–-]+\d+)*\s*:?/gi; const sections = []; let m; while ((m = sectionRe.exec(erlText)) !== null) { const lineNum = parseInt(m[1], 10); sections.push({ lineNum, start: m.index, headerEnd: sectionRe.lastIndex }); } for (let i = 0; i < sections.length; i++) { const s = sections[i]; const end = i + 1 < sections.length ? sections[i + 1].start : erlText.length; const body = erlText.slice(s.headerEnd, end); // Look for a table header line: at least one "Ansatz YYYY" near a // "Mio." unit. If there's no such header, skip the table parse // and try the bullet-list fallback at the bottom of this loop. const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m; const hMatch = body.match(headerLineRe); let tableItems = []; if (hMatch) { const years = [...hMatch[0].matchAll(/Ansatz\s+(\d{4})/g)].map( (m) => Number(m[1]) ); if (years.length > 0) { tableItems = parseTableRows(body, hMatch, years); } } if (tableItems.length > 0) { out[s.lineNum] = tableItems; continue; } // No table — try bullet-list parsing on the section body. const bulletItems = parseBulletItems(body); if (bulletItems.length > 0) out[s.lineNum] = bulletItems; } return out; } /** Extract table rows from a section body given the matched header * line and the years that header announced. Returns an array of * { name, values: { year: euro } } items. */ function parseTableRows(body, hMatch, years) { const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length; const remainder = body.slice(headerEndIdx); const rowLines = []; for (const line of remainder.split("\n")) { const trimmed = line.trim(); if (trimmed === "") { if (rowLines.length > 0) break; // table ended continue; } // Heuristic: a row has a decimal/integer number near the end. // Lines without trailing numbers are paragraph continuations. if (!/[\d]+(?:[.,]\d+)?\s*$/.test(trimmed)) { if (rowLines.length > 0) break; continue; } rowLines.push(trimmed); } const items = []; for (const row of rowLines) { const numRe = /(-?\d{1,3}(?:\.\d{3})*(?:,\d+)?|-?\d+(?:,\d+)?)/g; const nums = [...row.matchAll(numRe)].map((m) => m[0]); if (nums.length < years.length) continue; const tailNums = nums.slice(-years.length); let name = row; for (const n of [...tailNums].reverse()) { const idx = name.lastIndexOf(n); if (idx >= 0) name = name.slice(0, idx); } name = name.trim(); if (!name) continue; const values = {}; for (let k = 0; k < years.length; k++) { const raw = tailNums[k]; const num = parseFloat(raw.replace(/\./g, "").replace(",", ".")); if (!Number.isFinite(num)) continue; values[years[k]] = num * 1_000_000; // Mio. € → € } if (Object.keys(values).length > 0) items.push({ name, values }); } return items; } /** * Parse bullet-list breakdowns. Each line starting with "-" or "•" * with one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)" patterns * becomes an item. Common forms in Münster's Erläuterungen: * * - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027) * - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027) * - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027) */ function parseBulletItems(text) { const items = []; const valueRe = /(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi; // Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to // just "(YYYY)" so the per-item parser sees a clean // `value Euro (year)` shape. The PG 0301 line 13/16 bullets use // this nested form to reference the category total inline; we keep // the year tag (which the item's value belongs to) and drop the // total reference. const gesamtRe = /\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g; for (const rawLine of text.split("\n")) { if (!/^\s*[-•]\s+/.test(rawLine)) continue; const line = rawLine.replace(gesamtRe, "($1)"); const values = []; let firstValIdx = -1; let m; valueRe.lastIndex = 0; while ((m = valueRe.exec(line)) !== null) { if (firstValIdx === -1) firstValIdx = m.index; const num = parseFloat( m[1].replace(/\./g, "").replace(",", ".") ); if (!Number.isFinite(num)) continue; const isMio = /Mio/i.test(m[2]); const value = isMio ? num * 1_000_000 : num; const year = parseInt(m[3], 10); values.push({ year, value }); } if (values.length === 0) continue; // The name is everything before the first value, minus the // bullet marker and trailing value indicators. let name = line.slice(0, firstValIdx).replace(/^\s*[-•]\s+/, "").trim(); name = name .replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "") .replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical .replace(/\s+[-–]\s+hier:.*$/i, "") // drop "- hier: ..." asides .trim(); if (!name || name.length < 2) continue; const valuesByYear = {}; for (const v of values) valuesByYear[v.year] = v.value; items.push({ name, values: valuesByYear }); } return items; } const out = {}; let withBeschreibung = 0; let withErlauterungen = 0; for (const [pgNum, pgPages] of pagesByPg) { const fullText = cleanPageNoise(pgPages.join("\n\n")); const beschreibung = tidyParagraphs( extractSection(fullText, "Beschreibung", SECTION_HEADINGS) ); const erlaeuterungen = tidyParagraphs( extractSection(fullText, "Erläuterungen", SECTION_HEADINGS) ); if (beschreibung) withBeschreibung++; if (erlaeuterungen) withErlauterungen++; const breakdowns = parseBreakdowns(erlaeuterungen); out[pgNum] = { pgNumber: pgNum, name: namesByPg.get(pgNum) ?? "", beschreibung: beschreibung ?? null, erlaeuterungen: erlaeuterungen ?? null, breakdowns, // { lineNum: [ { name, values: { year: euro } } ] } }; } console.log(` with Beschreibung: ${withBeschreibung}`); console.log(` with Erläuterungen: ${withErlauterungen}`); // ── Step 4: Write JSON ──────────────────────────────────────────── mkdirSync("data/extracted", { recursive: true }); writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8"); console.log(`Wrote ${OUT_PATH}`);