Files
ms-haushalt/scripts/extract-pg-sections.mjs
Flo 9a958c0051 feat: initial Münster Haushalt icicle viewer
Editorial single-page viewer for the City of Münster's 2026/2027
budget draft, built as an Astro v6 SPA with a 4-level zoomable
icicle (Produktbereich → Produktgruppe → Category → Breakdown).

Highlights:
- Multi-flow data layer over the official open-data CSVs
  (Aufwendungen + Erträge, 2008–2028) with overlap reconciliation
  across plan years.
- Year slider as a 21-year mini-histogram of both flows;
  drag-to-scrub and click-to-jump, with bars morphing via CSS
  transitions on SVG geometry attributes.
- Vertically centred icicle with year-outline rectangles framing
  each year's relative budget size, à la Bostock's animated treemap.
- Headline "ausgibt / einnimmt" toggle; sidebar Aufwendungen/Erträge
  rows double as flow toggles. Active flow in Aufwendungen-purple /
  Erträge-orange (OKLCH).
- Click-to-zoom via path-keyed lookup with ZOOM_COL_BOUNDS that
  reallocate the depth axis per zoom state. Zoomed item moves to the
  sidebar; canvas shows its descendants only (no adjacent-block leaks).
- Sidebar shows path-specific Aufwendungen/Erträge/Saldo plus the
  source-PDF Beschreibung; Erläuterungen behind a collapsed details.
- Build-time PDF extraction (scripts/extract-pg-sections.mjs) parses
  68 Produktgruppen' Beschreibung + Erläuterungen sections from
  Band 1, including 10 cells of structured Mio.-€ breakdowns
  (Steuern, Transferaufwendungen, etc.) that drive the level-4 view.
- URL state sync for path, year, and flow via history.replaceState
  so any zoom is shareable.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-07 16:27:45 +02:00

334 lines
12 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
//
// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
// the Haushaltsplan 2026/2027 Band 1 PDF.
//
// Each Produktgruppe in Münster's NKF-style budget has a stable section
// layout. We use pdftotext -layout to get text with form-feed page
// breaks, find each PG's pages by its running header, then split each
// PG's combined text by its section headings ("Beschreibung",
// "Erläuterungen", etc.).
//
// Usage:
// node scripts/extract-pg-sections.mjs
// Output:
// data/extracted/pg-sections-2026.json
import { execSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
const OUT_PATH = "data/extracted/pg-sections-2026.json";
// ── Step 1: PDF → text with form-feed page boundaries ─────────────
const txtPath = join(tmpdir(), "ms-haushalt-band1.txt");
execSync(`pdftotext -layout "${PDF_PATH}" "${txtPath}"`);
const rawText = readFileSync(txtPath, "utf8");
const pages = rawText.split("\f");
console.log(`PDF pages: ${pages.length}`);
// ── Step 2: Group pages by Produktgruppe number ───────────────────
// Each Teilplan page has a running header with this layout:
// Haushaltsplan 2026/2027 <PG-NAME> Dezernat <X>
// Ausschuss: <ABBR> Produktgruppe NNNN <Amt>
// The PG number appears on the second header line. The name appears on
// the first header line, sandwiched between the "Haushaltsplan" cell
// and the "Dezernat …" cell.
const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;
const pagesByPg = new Map(); // pgNumber → array of page texts
const namesByPg = new Map(); // pgNumber → display name
for (const page of pages) {
const headerSlice = page.slice(0, 600);
const pgMatch = headerSlice.match(HEADER_RE_PG);
if (!pgMatch) continue;
const pgNum = pgMatch[1];
if (!pagesByPg.has(pgNum)) pagesByPg.set(pgNum, []);
pagesByPg.get(pgNum).push(page);
if (!namesByPg.has(pgNum)) {
const nameMatch = headerSlice.match(HEADER_RE_NAME);
if (nameMatch) namesByPg.set(pgNum, nameMatch[1].trim());
}
}
console.log(`Produktgruppen found: ${pagesByPg.size}`);
// ── Step 3: Extract sections per PG ───────────────────────────────
const SECTION_HEADINGS = [
"Beschreibung",
"Besonderheiten in den Planjahren",
"Ziele",
"Zielkennzahlen",
"Standardkennzahlen",
"Bewirtschaftungsregeln",
"Erläuterungen",
];
/** Strip page-running headers/footers and stray page numbers. */
function cleanPageNoise(text) {
return text
.split("\n")
.filter((line) => {
const t = line.trim();
if (t === "") return true;
if (/^Haushaltsplan\s+\d{4}\/\d{4}/.test(t)) return false;
if (/^Ausschuss:/.test(t)) return false;
// Page-number lines: a single integer (with possible surrounding ws)
if (/^\d{1,4}$/.test(t)) return false;
return true;
})
.join("\n");
}
/** Find a section by its heading and return text up to the next known
* heading. Headings are matched as full-line tokens — they appear
* flush-left, sometimes with trailing colon. */
function extractSection(text, heading, otherHeadings) {
const startRe = new RegExp(`^${escapeRe(heading)}:?\\s*$`, "m");
const startMatch = startRe.exec(text);
if (!startMatch) return null;
const sliceStart = startMatch.index + startMatch[0].length;
const tail = text.slice(sliceStart);
let endIdx = tail.length;
for (const h of otherHeadings) {
if (h === heading) continue;
const r = new RegExp(`^${escapeRe(h)}:?\\s*$`, "m");
const m = r.exec(tail);
if (m && m.index < endIdx) endIdx = m.index;
}
// Also stop at the start of a per-Produkt block ("Produkt NNNNNN -")
// and at the financial table headers — these can sit indented with
// leading whitespace on the line, so allow `^\s*`.
for (const r of [
/^\s*Produkt\s+\d{6}\s*-/m,
/^\s*Teilergebnisplan\b/m,
/^\s*Teilfinanzplan\b/m,
/^\s*Investitionsmaßnahmen\b/m,
/^\s*Investitionen\s+gesamt\b/m,
/^\s*Verpflichtungsermächtigungen\b/m,
]) {
const m = r.exec(tail);
if (m && m.index < endIdx) endIdx = m.index;
}
return tail.slice(0, endIdx).trim();
}
function escapeRe(s) {
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/** Strip leading + trailing whitespace per line, collapse multiple
* blank lines to one. Preserves paragraph breaks. */
function tidyParagraphs(text) {
if (!text) return text;
return text
.split("\n")
.map((l) => l.trim())
.join("\n")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
/**
* Parse breakdown tables embedded in Erläuterungen text.
*
* Format we recognize (PG 1601's "zu Zeile 01"):
* zu Zeile NN:
* …prose…
* Aufwandsart Ansatz YYYY in Mio. € Ansatz YYYY in Mio. €
* <name> <value> <value>
* …
*
* Returns: { lineNumber: [ { name, values: { year: euro } } ] }
*
* Numbers in the table are written in Mio. €; we convert to plain
* euros to match our CSV value units.
*/
function parseBreakdowns(erlText) {
if (!erlText) return {};
const out = {};
// Split into "zu Zeile NN" sections (case-insensitive — some PGs
// capitalize "Zu Zeile" at the start of a paragraph).
const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,-]+\d+)*\s*:?/gi;
const sections = [];
let m;
while ((m = sectionRe.exec(erlText)) !== null) {
const lineNum = parseInt(m[1], 10);
sections.push({ lineNum, start: m.index, headerEnd: sectionRe.lastIndex });
}
for (let i = 0; i < sections.length; i++) {
const s = sections[i];
const end = i + 1 < sections.length ? sections[i + 1].start : erlText.length;
const body = erlText.slice(s.headerEnd, end);
// Look for a table header line: at least one "Ansatz YYYY" near a
// "Mio." unit. If there's no such header, skip the table parse
// and try the bullet-list fallback at the bottom of this loop.
const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
const hMatch = body.match(headerLineRe);
let tableItems = [];
if (hMatch) {
const years = [...hMatch[0].matchAll(/Ansatz\s+(\d{4})/g)].map(
(m) => Number(m[1])
);
if (years.length > 0) {
tableItems = parseTableRows(body, hMatch, years);
}
}
if (tableItems.length > 0) {
out[s.lineNum] = tableItems;
continue;
}
// No table — try bullet-list parsing on the section body.
const bulletItems = parseBulletItems(body);
if (bulletItems.length > 0) out[s.lineNum] = bulletItems;
}
return out;
}
/** Extract table rows from a section body given the matched header
* line and the years that header announced. Returns an array of
* { name, values: { year: euro } } items. */
function parseTableRows(body, hMatch, years) {
const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
const remainder = body.slice(headerEndIdx);
const rowLines = [];
for (const line of remainder.split("\n")) {
const trimmed = line.trim();
if (trimmed === "") {
if (rowLines.length > 0) break; // table ended
continue;
}
// Heuristic: a row has a decimal/integer number near the end.
// Lines without trailing numbers are paragraph continuations.
if (!/[\d]+(?:[.,]\d+)?\s*$/.test(trimmed)) {
if (rowLines.length > 0) break;
continue;
}
rowLines.push(trimmed);
}
const items = [];
for (const row of rowLines) {
const numRe = /(-?\d{1,3}(?:\.\d{3})*(?:,\d+)?|-?\d+(?:,\d+)?)/g;
const nums = [...row.matchAll(numRe)].map((m) => m[0]);
if (nums.length < years.length) continue;
const tailNums = nums.slice(-years.length);
let name = row;
for (const n of [...tailNums].reverse()) {
const idx = name.lastIndexOf(n);
if (idx >= 0) name = name.slice(0, idx);
}
name = name.trim();
if (!name) continue;
const values = {};
for (let k = 0; k < years.length; k++) {
const raw = tailNums[k];
const num = parseFloat(raw.replace(/\./g, "").replace(",", "."));
if (!Number.isFinite(num)) continue;
values[years[k]] = num * 1_000_000; // Mio. € → €
}
if (Object.keys(values).length > 0) items.push({ name, values });
}
return items;
}
/**
* Parse bullet-list breakdowns. Each line starting with "-" or "•"
* with one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)" patterns
* becomes an item. Common forms in Münster's Erläuterungen:
*
* - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
* - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
* - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
*/
function parseBulletItems(text) {
const items = [];
const valueRe =
/(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
// Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to
// just "(YYYY)" so the per-item parser sees a clean
// `value Euro (year)` shape. The PG 0301 line 13/16 bullets use
// this nested form to reference the category total inline; we keep
// the year tag (which the item's value belongs to) and drop the
// total reference.
const gesamtRe =
/\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;
for (const rawLine of text.split("\n")) {
if (!/^\s*[-•]\s+/.test(rawLine)) continue;
const line = rawLine.replace(gesamtRe, "($1)");
const values = [];
let firstValIdx = -1;
let m;
valueRe.lastIndex = 0;
while ((m = valueRe.exec(line)) !== null) {
if (firstValIdx === -1) firstValIdx = m.index;
const num = parseFloat(
m[1].replace(/\./g, "").replace(",", ".")
);
if (!Number.isFinite(num)) continue;
const isMio = /Mio/i.test(m[2]);
const value = isMio ? num * 1_000_000 : num;
const year = parseInt(m[3], 10);
values.push({ year, value });
}
if (values.length === 0) continue;
// The name is everything before the first value, minus the
// bullet marker and trailing value indicators.
let name = line.slice(0, firstValIdx).replace(/^\s*[-•]\s+/, "").trim();
name = name
.replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
.replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
.replace(/\s+[-]\s+hier:.*$/i, "") // drop "- hier: ..." asides
.trim();
if (!name || name.length < 2) continue;
const valuesByYear = {};
for (const v of values) valuesByYear[v.year] = v.value;
items.push({ name, values: valuesByYear });
}
return items;
}
const out = {};
let withBeschreibung = 0;
let withErlauterungen = 0;
for (const [pgNum, pgPages] of pagesByPg) {
const fullText = cleanPageNoise(pgPages.join("\n\n"));
const beschreibung = tidyParagraphs(
extractSection(fullText, "Beschreibung", SECTION_HEADINGS)
);
const erlaeuterungen = tidyParagraphs(
extractSection(fullText, "Erläuterungen", SECTION_HEADINGS)
);
if (beschreibung) withBeschreibung++;
if (erlaeuterungen) withErlauterungen++;
const breakdowns = parseBreakdowns(erlaeuterungen);
out[pgNum] = {
pgNumber: pgNum,
name: namesByPg.get(pgNum) ?? "",
beschreibung: beschreibung ?? null,
erlaeuterungen: erlaeuterungen ?? null,
breakdowns, // { lineNum: [ { name, values: { year: euro } } ] }
};
}
console.log(` with Beschreibung: ${withBeschreibung}`);
console.log(` with Erläuterungen: ${withErlauterungen}`);
// ── Step 4: Write JSON ────────────────────────────────────────────
mkdirSync("data/extracted", { recursive: true });
writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8");
console.log(`Wrote ${OUT_PATH}`);