feat: initial Münster Haushalt icicle viewer

Editorial single-page viewer for the City of Münster's 2026/2027
budget draft, built as an Astro v6 SPA with a 4-level zoomable
icicle (Produktbereich → Produktgruppe → Category → Breakdown).

Highlights:
- Multi-flow data layer over the official open-data CSVs
  (Aufwendungen + Erträge, 2008–2028) with overlap reconciliation
  across plan years.
- Year slider as a 21-year mini-histogram of both flows;
  drag-to-scrub and click-to-jump, with bars morphing via CSS
  transitions on SVG geometry attributes.
- Vertically centred icicle with year-outline rectangles framing
  each year's relative budget size, à la Bostock's animated treemap.
- Headline "ausgibt / einnimmt" toggle; sidebar Aufwendungen/Erträge
  rows double as flow toggles. Active flow in Aufwendungen-purple /
  Erträge-orange (OKLCH).
- Click-to-zoom via path-keyed lookup with ZOOM_COL_BOUNDS that
  reallocate the depth axis per zoom state. Zoomed item moves to the
  sidebar; canvas shows its descendants only (no adjacent-block leaks).
- Sidebar shows path-specific Aufwendungen/Erträge/Saldo plus the
  source-PDF Beschreibung; Erläuterungen behind a collapsed details.
- Build-time PDF extraction (scripts/extract-pg-sections.mjs) parses
  68 Produktgruppen' Beschreibung + Erläuterungen sections from
  Band 1, including 10 cells of structured Mio.-€ breakdowns
  (Steuern, Transferaufwendungen, etc.) that drive the level-4 view.
- URL state sync for path, year, and flow via history.replaceState
  so any zoom is shareable.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Flo
2026-05-07 14:22:53 +02:00
parent 0f10f8507f
commit 9a958c0051
46 changed files with 10056 additions and 1 deletions
+333
View File
@@ -0,0 +1,333 @@
#!/usr/bin/env node
//
// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
// the Haushaltsplan 2026/2027 Band 1 PDF.
//
// Each Produktgruppe in Münster's NKF-style budget has a stable section
// layout. We use pdftotext -layout to get text with form-feed page
// breaks, find each PG's pages by its running header, then split each
// PG's combined text by its section headings ("Beschreibung",
// "Erläuterungen", etc.).
//
// Usage:
// node scripts/extract-pg-sections.mjs
// Output:
// data/extracted/pg-sections-2026.json
import { execSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
const OUT_PATH = "data/extracted/pg-sections-2026.json";
// ── Step 1: PDF → text with form-feed page boundaries ─────────────
const txtPath = join(tmpdir(), "ms-haushalt-band1.txt");
execSync(`pdftotext -layout "${PDF_PATH}" "${txtPath}"`);
const rawText = readFileSync(txtPath, "utf8");
const pages = rawText.split("\f");
console.log(`PDF pages: ${pages.length}`);
// ── Step 2: Group pages by Produktgruppe number ───────────────────
// Each Teilplan page has a running header with this layout:
// Haushaltsplan 2026/2027 <PG-NAME> Dezernat <X>
// Ausschuss: <ABBR> Produktgruppe NNNN <Amt>
// The PG number appears on the second header line. The name appears on
// the first header line, sandwiched between the "Haushaltsplan" cell
// and the "Dezernat …" cell.
const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;
const pagesByPg = new Map(); // pgNumber → array of page texts
const namesByPg = new Map(); // pgNumber → display name
for (const page of pages) {
const headerSlice = page.slice(0, 600);
const pgMatch = headerSlice.match(HEADER_RE_PG);
if (!pgMatch) continue;
const pgNum = pgMatch[1];
if (!pagesByPg.has(pgNum)) pagesByPg.set(pgNum, []);
pagesByPg.get(pgNum).push(page);
if (!namesByPg.has(pgNum)) {
const nameMatch = headerSlice.match(HEADER_RE_NAME);
if (nameMatch) namesByPg.set(pgNum, nameMatch[1].trim());
}
}
console.log(`Produktgruppen found: ${pagesByPg.size}`);
// ── Step 3: Extract sections per PG ───────────────────────────────
const SECTION_HEADINGS = [
"Beschreibung",
"Besonderheiten in den Planjahren",
"Ziele",
"Zielkennzahlen",
"Standardkennzahlen",
"Bewirtschaftungsregeln",
"Erläuterungen",
];
/** Strip page-running headers/footers and stray page numbers. */
function cleanPageNoise(text) {
return text
.split("\n")
.filter((line) => {
const t = line.trim();
if (t === "") return true;
if (/^Haushaltsplan\s+\d{4}\/\d{4}/.test(t)) return false;
if (/^Ausschuss:/.test(t)) return false;
// Page-number lines: a single integer (with possible surrounding ws)
if (/^\d{1,4}$/.test(t)) return false;
return true;
})
.join("\n");
}
/** Find a section by its heading and return text up to the next known
* heading. Headings are matched as full-line tokens — they appear
* flush-left, sometimes with trailing colon. */
function extractSection(text, heading, otherHeadings) {
const startRe = new RegExp(`^${escapeRe(heading)}:?\\s*$`, "m");
const startMatch = startRe.exec(text);
if (!startMatch) return null;
const sliceStart = startMatch.index + startMatch[0].length;
const tail = text.slice(sliceStart);
let endIdx = tail.length;
for (const h of otherHeadings) {
if (h === heading) continue;
const r = new RegExp(`^${escapeRe(h)}:?\\s*$`, "m");
const m = r.exec(tail);
if (m && m.index < endIdx) endIdx = m.index;
}
// Also stop at the start of a per-Produkt block ("Produkt NNNNNN -")
// and at the financial table headers — these can sit indented with
// leading whitespace on the line, so allow `^\s*`.
for (const r of [
/^\s*Produkt\s+\d{6}\s*-/m,
/^\s*Teilergebnisplan\b/m,
/^\s*Teilfinanzplan\b/m,
/^\s*Investitionsmaßnahmen\b/m,
/^\s*Investitionen\s+gesamt\b/m,
/^\s*Verpflichtungsermächtigungen\b/m,
]) {
const m = r.exec(tail);
if (m && m.index < endIdx) endIdx = m.index;
}
return tail.slice(0, endIdx).trim();
}
function escapeRe(s) {
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/** Strip leading + trailing whitespace per line, collapse multiple
* blank lines to one. Preserves paragraph breaks. */
function tidyParagraphs(text) {
if (!text) return text;
return text
.split("\n")
.map((l) => l.trim())
.join("\n")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
/**
* Parse breakdown tables embedded in Erläuterungen text.
*
* Format we recognize (PG 1601's "zu Zeile 01"):
* zu Zeile NN:
* …prose…
* Aufwandsart Ansatz YYYY in Mio. € Ansatz YYYY in Mio. €
* <name> <value> <value>
* …
*
* Returns: { lineNumber: [ { name, values: { year: euro } } ] }
*
* Numbers in the table are written in Mio. €; we convert to plain
* euros to match our CSV value units.
*/
function parseBreakdowns(erlText) {
if (!erlText) return {};
const out = {};
// Split into "zu Zeile NN" sections (case-insensitive — some PGs
// capitalize "Zu Zeile" at the start of a paragraph).
const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,-]+\d+)*\s*:?/gi;
const sections = [];
let m;
while ((m = sectionRe.exec(erlText)) !== null) {
const lineNum = parseInt(m[1], 10);
sections.push({ lineNum, start: m.index, headerEnd: sectionRe.lastIndex });
}
for (let i = 0; i < sections.length; i++) {
const s = sections[i];
const end = i + 1 < sections.length ? sections[i + 1].start : erlText.length;
const body = erlText.slice(s.headerEnd, end);
// Look for a table header line: at least one "Ansatz YYYY" near a
// "Mio." unit. If there's no such header, skip the table parse
// and try the bullet-list fallback at the bottom of this loop.
const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
const hMatch = body.match(headerLineRe);
let tableItems = [];
if (hMatch) {
const years = [...hMatch[0].matchAll(/Ansatz\s+(\d{4})/g)].map(
(m) => Number(m[1])
);
if (years.length > 0) {
tableItems = parseTableRows(body, hMatch, years);
}
}
if (tableItems.length > 0) {
out[s.lineNum] = tableItems;
continue;
}
// No table — try bullet-list parsing on the section body.
const bulletItems = parseBulletItems(body);
if (bulletItems.length > 0) out[s.lineNum] = bulletItems;
}
return out;
}
/** Extract table rows from a section body given the matched header
* line and the years that header announced. Returns an array of
* { name, values: { year: euro } } items. */
function parseTableRows(body, hMatch, years) {
const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
const remainder = body.slice(headerEndIdx);
const rowLines = [];
for (const line of remainder.split("\n")) {
const trimmed = line.trim();
if (trimmed === "") {
if (rowLines.length > 0) break; // table ended
continue;
}
// Heuristic: a row has a decimal/integer number near the end.
// Lines without trailing numbers are paragraph continuations.
if (!/[\d]+(?:[.,]\d+)?\s*$/.test(trimmed)) {
if (rowLines.length > 0) break;
continue;
}
rowLines.push(trimmed);
}
const items = [];
for (const row of rowLines) {
const numRe = /(-?\d{1,3}(?:\.\d{3})*(?:,\d+)?|-?\d+(?:,\d+)?)/g;
const nums = [...row.matchAll(numRe)].map((m) => m[0]);
if (nums.length < years.length) continue;
const tailNums = nums.slice(-years.length);
let name = row;
for (const n of [...tailNums].reverse()) {
const idx = name.lastIndexOf(n);
if (idx >= 0) name = name.slice(0, idx);
}
name = name.trim();
if (!name) continue;
const values = {};
for (let k = 0; k < years.length; k++) {
const raw = tailNums[k];
const num = parseFloat(raw.replace(/\./g, "").replace(",", "."));
if (!Number.isFinite(num)) continue;
values[years[k]] = num * 1_000_000; // Mio. € → €
}
if (Object.keys(values).length > 0) items.push({ name, values });
}
return items;
}
/**
* Parse bullet-list breakdowns. Each line starting with "-" or "•"
* with one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)" patterns
* becomes an item. Common forms in Münster's Erläuterungen:
*
* - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
* - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
* - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
*/
function parseBulletItems(text) {
const items = [];
const valueRe =
/(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
// Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to
// just "(YYYY)" so the per-item parser sees a clean
// `value Euro (year)` shape. The PG 0301 line 13/16 bullets use
// this nested form to reference the category total inline; we keep
// the year tag (which the item's value belongs to) and drop the
// total reference.
const gesamtRe =
/\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;
for (const rawLine of text.split("\n")) {
if (!/^\s*[-•]\s+/.test(rawLine)) continue;
const line = rawLine.replace(gesamtRe, "($1)");
const values = [];
let firstValIdx = -1;
let m;
valueRe.lastIndex = 0;
while ((m = valueRe.exec(line)) !== null) {
if (firstValIdx === -1) firstValIdx = m.index;
const num = parseFloat(
m[1].replace(/\./g, "").replace(",", ".")
);
if (!Number.isFinite(num)) continue;
const isMio = /Mio/i.test(m[2]);
const value = isMio ? num * 1_000_000 : num;
const year = parseInt(m[3], 10);
values.push({ year, value });
}
if (values.length === 0) continue;
// The name is everything before the first value, minus the
// bullet marker and trailing value indicators.
let name = line.slice(0, firstValIdx).replace(/^\s*[-•]\s+/, "").trim();
name = name
.replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
.replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
.replace(/\s+[-]\s+hier:.*$/i, "") // drop "- hier: ..." asides
.trim();
if (!name || name.length < 2) continue;
const valuesByYear = {};
for (const v of values) valuesByYear[v.year] = v.value;
items.push({ name, values: valuesByYear });
}
return items;
}
const out = {};
let withBeschreibung = 0;
let withErlauterungen = 0;
for (const [pgNum, pgPages] of pagesByPg) {
const fullText = cleanPageNoise(pgPages.join("\n\n"));
const beschreibung = tidyParagraphs(
extractSection(fullText, "Beschreibung", SECTION_HEADINGS)
);
const erlaeuterungen = tidyParagraphs(
extractSection(fullText, "Erläuterungen", SECTION_HEADINGS)
);
if (beschreibung) withBeschreibung++;
if (erlaeuterungen) withErlauterungen++;
const breakdowns = parseBreakdowns(erlaeuterungen);
out[pgNum] = {
pgNumber: pgNum,
name: namesByPg.get(pgNum) ?? "",
beschreibung: beschreibung ?? null,
erlaeuterungen: erlaeuterungen ?? null,
breakdowns, // { lineNum: [ { name, values: { year: euro } } ] }
};
}
console.log(` with Beschreibung: ${withBeschreibung}`);
console.log(` with Erläuterungen: ${withErlauterungen}`);
// ── Step 4: Write JSON ────────────────────────────────────────────
mkdirSync("data/extracted", { recursive: true });
writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8");
console.log(`Wrote ${OUT_PATH}`);