feat: initial Münster Haushalt icicle viewer

Editorial single-page viewer for the City of Münster's 2026/2027 budget draft, built as an Astro v6 SPA with a 4-level zoomable icicle (Produktbereich → Produktgruppe → Category → Breakdown). Highlights: - Multi-flow data layer over the official open-data CSVs (Aufwendungen + Erträge, 2008–2028) with overlap reconciliation across plan years. - Year slider as a 21-year mini-histogram of both flows; drag-to-scrub and click-to-jump, with bars morphing via CSS transitions on SVG geometry attributes. - Vertically centred icicle with year-outline rectangles framing each year's relative budget size, à la Bostock's animated treemap. - Headline "ausgibt / einnimmt" toggle; sidebar Aufwendungen/Erträge rows double as flow toggles. Active flow in Aufwendungen-purple / Erträge-orange (OKLCH). - Click-to-zoom via path-keyed lookup with ZOOM_COL_BOUNDS that reallocate the depth axis per zoom state. Zoomed item moves to the sidebar; canvas shows its descendants only (no adjacent-block leaks). - Sidebar shows path-specific Aufwendungen/Erträge/Saldo plus the source-PDF Beschreibung; Erläuterungen behind a collapsed details. - Build-time PDF extraction (scripts/extract-pg-sections.mjs) parses 68 Produktgruppen' Beschreibung + Erläuterungen sections from Band 1, including 10 cells of structured Mio.-€ breakdowns (Steuern, Transferaufwendungen, etc.) that drive the level-4 view. - URL state sync for path, year, and flow via history.replaceState so any zoom is shareable. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2026-05-07 14:22:53 +02:00
parent 0f10f8507f
commit 9a958c0051
46 changed files with 10056 additions and 1 deletions
@@ -0,0 +1,333 @@
+#!/usr/bin/env node
+//
+// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
+// the Haushaltsplan 2026/2027 Band 1 PDF.
+//
+// Each Produktgruppe in Münster's NKF-style budget has a stable section
+// layout. We use pdftotext -layout to get text with form-feed page
+// breaks, find each PG's pages by its running header, then split each
+// PG's combined text by its section headings ("Beschreibung",
+// "Erläuterungen", etc.).
+//
+// Usage:
+//   node scripts/extract-pg-sections.mjs
+// Output:
+//   data/extracted/pg-sections-2026.json
+
+import { execSync } from "node:child_process";
+import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+// Source PDF (Band 1 of the budget draft) and the JSON file this script writes.
+const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
+const OUT_PATH = "data/extracted/pg-sections-2026.json";
+
+// ── Step 1: PDF → text with form-feed page boundaries ─────────────
+// pdftotext dumps the layout-preserving text into a scratch file in the OS
+// temp directory; the dump is then split on "\f" to get one string per page.
+const scratchTxt = join(tmpdir(), "ms-haushalt-band1.txt");
+const pdftotextCmd = `pdftotext -layout "${PDF_PATH}" "${scratchTxt}"`;
+execSync(pdftotextCmd);
+const pages = readFileSync(scratchTxt, "utf8").split("\f");
+console.log(`PDF pages: ${pages.length}`);
+
+// ── Step 2: Group pages by Produktgruppe number ───────────────────
+// Each Teilplan page has a running header with this layout:
+//   Haushaltsplan 2026/2027    <PG-NAME>             Dezernat <X>
+//   Ausschuss: <ABBR>           Produktgruppe NNNN    <Amt>
+// The PG number appears on the second header line. The name appears on
+// the first header line, sandwiched between the "Haushaltsplan" cell
+// and the "Dezernat …" cell.
+// HEADER_RE_PG captures the four-digit PG number; HEADER_RE_NAME captures the
+// cell between "Haushaltsplan YYYY/YYYY" and "Dezernat" on a header line.
+const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
+const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;
+
+const pagesByPg = new Map(); // pgNumber → array of page texts
+const namesByPg = new Map(); // pgNumber → display name
+
+for (const page of pages) {
+  // Only the first 600 characters of a page are searched for the header.
+  const head = page.slice(0, 600);
+  const pgNum = HEADER_RE_PG.exec(head)?.[1];
+  if (pgNum === undefined) continue; // page carries no PG header
+
+  const bucket = pagesByPg.get(pgNum);
+  if (bucket) {
+    bucket.push(page);
+  } else {
+    pagesByPg.set(pgNum, [page]);
+  }
+
+  // The first page of a PG whose header yields a name decides the name.
+  if (namesByPg.has(pgNum)) continue;
+  const displayName = HEADER_RE_NAME.exec(head)?.[1];
+  if (displayName !== undefined) namesByPg.set(pgNum, displayName.trim());
+}
+console.log(`Produktgruppen found: ${pagesByPg.size}`);
+
+// ── Step 3: Extract sections per PG ───────────────────────────────
+// Section headings that extractSection treats as boundaries: a section ends
+// where the next one of these starts. Frozen because the list is shared,
+// read-only configuration that is passed around by reference.
+const SECTION_HEADINGS = Object.freeze([
+  "Beschreibung",
+  "Besonderheiten in den Planjahren",
+  "Ziele",
+  "Zielkennzahlen",
+  "Standardkennzahlen",
+  "Bewirtschaftungsregeln",
+  "Erläuterungen",
+]);
+
+/**
+ * Drop running header/footer lines and bare page numbers from page text.
+ *
+ * A line is removed when, after trimming, it starts with
+ * "Haushaltsplan YYYY/YYYY", starts with "Ausschuss:", or consists solely of
+ * a 1–4 digit integer. Blank lines and all other lines are kept unchanged.
+ *
+ * @param {string} text - newline-separated page text
+ * @returns {string} the text with the noise lines removed
+ */
+function cleanPageNoise(text) {
+  const noisePatterns = [
+    /^Haushaltsplan\s+\d{4}\/\d{4}/,
+    /^Ausschuss:/,
+    /^\d{1,4}$/, // page-number line: a single integer
+  ];
+  const kept = [];
+  for (const line of text.split("\n")) {
+    const stripped = line.trim();
+    const isNoise =
+      stripped !== "" && noisePatterns.some((re) => re.test(stripped));
+    if (!isNoise) kept.push(line);
+  }
+  return kept.join("\n");
+}
+
+/**
+ * Return the text of one section: everything after the first line that
+ * consists of `heading` (optionally followed by a colon) up to the nearest
+ * terminator, trimmed.
+ *
+ * Terminators are the other known headings (matched as full lines) and the
+ * start of a per-Produkt block or of a financial table; those may be
+ * indented, hence the leading `\s*` in their patterns.
+ *
+ * @param {string} text - cleaned text of one Produktgruppe
+ * @param {string} heading - heading of the wanted section
+ * @param {Iterable<string>} otherHeadings - all known headings; `heading`
+ *   itself is ignored when it appears in this list
+ * @returns {string|null} section text, or null when the heading is absent
+ */
+function extractSection(text, heading, otherHeadings) {
+  const headingLine = (h) => new RegExp(`^${escapeRe(h)}:?\\s*$`, "m");
+
+  const opening = headingLine(heading).exec(text);
+  if (opening === null) return null;
+  const tail = text.slice(opening.index + opening[0].length);
+
+  const terminators = [
+    ...[...otherHeadings]
+      .filter((h) => h !== heading)
+      .map((h) => headingLine(h)),
+    /^\s*Produkt\s+\d{6}\s*-/m,
+    /^\s*Teilergebnisplan\b/m,
+    /^\s*Teilfinanzplan\b/m,
+    /^\s*Investitionsmaßnahmen\b/m,
+    /^\s*Investitionen\s+gesamt\b/m,
+    /^\s*Verpflichtungsermächtigungen\b/m,
+  ];
+
+  // The section ends at the earliest terminator found in the tail.
+  const endIdx = terminators.reduce((cut, re) => {
+    const hit = re.exec(tail);
+    return hit !== null && hit.index < cut ? hit.index : cut;
+  }, tail.length);
+
+  return tail.slice(0, endIdx).trim();
+}
+
+/**
+ * Backslash-escape every regex metacharacter in `s` so the result can be
+ * embedded in a RegExp source and match `s` literally.
+ *
+ * @param {string} s - literal text
+ * @returns {string} the escaped text
+ */
+function escapeRe(s) {
+  const metaChars = /[.*+?^${}()|[\]\\]/g;
+  return s.replace(metaChars, (ch) => `\\${ch}`);
+}
+
+/**
+ * Normalise whitespace: trim every line, squeeze runs of three or more
+ * newlines down to a single blank line, and trim the whole result.
+ * Falsy input (null, undefined, "") is handed back unchanged.
+ *
+ * @param {string|null|undefined} text
+ * @returns {string|null|undefined} tidied text, or the falsy input itself
+ */
+function tidyParagraphs(text) {
+  if (!text) return text;
+  const trimmedLines = [];
+  for (const line of text.split("\n")) {
+    trimmedLines.push(line.trim());
+  }
+  const collapsed = trimmedLines.join("\n").replace(/\n{3,}/g, "\n\n");
+  return collapsed.trim();
+}
+
+/**
+ * Pull per-line breakdowns out of an Erläuterungen text.
+ *
+ * The text is cut into "zu Zeile NN" sections (matched case-insensitively;
+ * for a header such as "zu Zeile 13, 16" only the first number is used as
+ * the key). Each section is parsed as a table when it contains a header
+ * line of the form
+ *
+ *   Aufwandsart    Ansatz YYYY in Mio. €    Ansatz YYYY in Mio. €
+ *
+ * and the table yields rows; otherwise the bullet-list parser is tried on
+ * the same section body. Sections that yield no items are omitted.
+ *
+ * @param {string|null|undefined} erlText - Erläuterungen text of one PG
+ * @returns {Object<number, Array<{name: string, values: Object<number, number>}>>}
+ *   items keyed by Zeile number; `{}` for empty input
+ */
+function parseBreakdowns(erlText) {
+  if (!erlText) return {};
+
+  const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,–-]+\d+)*\s*:?/gi;
+  const tableHeaderRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
+
+  // One marker per "zu Zeile NN" header: where it starts and where its
+  // body begins.
+  const markers = Array.from(erlText.matchAll(sectionRe), (hit) => ({
+    lineNum: parseInt(hit[1], 10),
+    start: hit.index,
+    bodyStart: hit.index + hit[0].length,
+  }));
+
+  const result = {};
+  markers.forEach((marker, i) => {
+    // A section body runs up to the next header, or to the end of the text.
+    const bodyEnd =
+      i + 1 < markers.length ? markers[i + 1].start : erlText.length;
+    const body = erlText.slice(marker.bodyStart, bodyEnd);
+
+    let items = [];
+    const header = body.match(tableHeaderRe);
+    if (header) {
+      const years = Array.from(
+        header[0].matchAll(/Ansatz\s+(\d{4})/g),
+        (y) => Number(y[1])
+      );
+      if (years.length > 0) items = parseTableRows(body, header, years);
+    }
+    // No usable table — fall back to bullet-list parsing.
+    if (items.length === 0) items = parseBulletItems(body);
+
+    if (items.length > 0) result[marker.lineNum] = items;
+  });
+
+  return result;
+}
+
+/**
+ * Extract table rows from a section body, given the matched header line
+ * and the years that header announced.
+ *
+ * Rows are the consecutive lines after the header that end in a number;
+ * the first blank or non-numeric line after at least one row ends the
+ * table. In each row the last `years.length` numbers are the values (read
+ * as Mio. € in German notation, "1.234,5") and the text before them is the
+ * name.
+ *
+ * @param {string} body - section body containing the table
+ * @param {RegExpMatchArray} hMatch - match of the header line within `body`
+ * @param {number[]} years - years announced by the header, in column order
+ * @returns {Array<{name: string, values: Object<number, number>}>} one item
+ *   per parsed row, values in euros rounded to whole cents
+ */
+function parseTableRows(body, hMatch, years) {
+  if (years.length === 0) return [];
+
+  const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
+  const rowLines = [];
+  for (const line of body.slice(headerEndIdx).split("\n")) {
+    const trimmed = line.trim();
+    // Heuristic: a row has a decimal/integer number at the end. Anything
+    // else (blank line, prose) is skipped before the table and ends it after.
+    const looksLikeRow =
+      trimmed !== "" && /\d+(?:[.,]\d+)?\s*$/.test(trimmed);
+    if (looksLikeRow) {
+      rowLines.push(trimmed);
+    } else if (rowLines.length > 0) {
+      break;
+    }
+  }
+
+  // The thousands-grouped form must contain at least one ".ddd" group;
+  // otherwise a plain digit run is taken whole. Without that requirement
+  // "1234,5" would be split into "123" and "4,5".
+  const numRe = /-?(?:\d{1,3}(?:\.\d{3})+|\d+)(?:,\d+)?/g;
+
+  const items = [];
+  for (const row of rowLines) {
+    const cells = [...row.matchAll(numRe)];
+    if (cells.length < years.length) continue;
+    const valueCells = cells.slice(-years.length);
+
+    // Everything left of the first value column is the row label.
+    const name = row.slice(0, valueCells[0].index).trim();
+    if (!name) continue;
+
+    const values = {};
+    valueCells.forEach((cell, k) => {
+      const mio = parseFloat(cell[0].replace(/\./g, "").replace(",", "."));
+      if (!Number.isFinite(mio)) return;
+      // Mio. € → €, rounded to cents so binary float noise
+      // (4.1 * 1e6 === 4099999.9999999995) does not reach the output.
+      values[years[k]] = Math.round(mio * 1_000_000 * 100) / 100;
+    });
+    if (Object.keys(values).length > 0) items.push({ name, values });
+  }
+  return items;
+}
+
+/**
+ * Parse bullet-list breakdowns. Each line starting with "-" or "•"
+ * that contains one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)"
+ * patterns becomes an item. Common forms in Münster's Erläuterungen:
+ *
+ *   - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
+ *   - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
+ *   - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
+ *
+ * @param {string} text - section body, one candidate bullet per line
+ * @returns {Array<{name: string, values: Object<number, number>}>} one item
+ *   per bullet with at least one value; values are euros keyed by year,
+ *   with "Mio." amounts converted and rounded to whole cents
+ */
+function parseBulletItems(text) {
+  const valueRe =
+    /(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
+  // Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to
+  // just "(YYYY)" so the per-item parser sees a clean
+  // `value Euro (year)` shape: the year tag (which the item's value
+  // belongs to) is kept and the inline category total is dropped.
+  const gesamtRe =
+    /\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;
+
+  const items = [];
+  for (const rawLine of text.split("\n")) {
+    if (!/^\s*[-•]\s+/.test(rawLine)) continue;
+    const line = rawLine.replace(gesamtRe, "($1)");
+
+    // matchAll works on a copy of the regex, so no lastIndex bookkeeping
+    // is needed between lines.
+    const hits = [...line.matchAll(valueRe)];
+    if (hits.length === 0) continue;
+
+    const valuesByYear = {};
+    for (const [, rawNum, unit, rawYear] of hits) {
+      const num = parseFloat(rawNum.replace(/\./g, "").replace(",", "."));
+      if (!Number.isFinite(num)) continue;
+      // Mio. € → €, rounded to cents so binary float noise
+      // (4.1 * 1e6 === 4099999.9999999995) does not reach the output.
+      const euros = /Mio/i.test(unit)
+        ? Math.round(num * 1_000_000 * 100) / 100
+        : num;
+      valuesByYear[parseInt(rawYear, 10)] = euros;
+    }
+    if (Object.keys(valuesByYear).length === 0) continue;
+
+    // The name is everything before the first value, minus the
+    // bullet marker and trailing value indicators.
+    const name = line
+      .slice(0, hits[0].index)
+      .replace(/^\s*[-•]\s+/, "")
+      .trim()
+      .replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
+      .replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
+      .replace(/\s+[-–]\s+hier:.*$/i, "") // drop "- hier: ..." asides
+      .trim();
+    if (!name || name.length < 2) continue;
+
+    items.push({ name, values: valuesByYear });
+  }
+  return items;
+}
+
+// Assemble one record per Produktgruppe and count how many PGs yielded a
+// non-empty Beschreibung / Erläuterungen section.
+const out = {};
+const sectionHits = { beschreibung: 0, erlaeuterungen: 0 };
+for (const [pgNum, pgPages] of pagesByPg) {
+  const fullText = cleanPageNoise(pgPages.join("\n\n"));
+  const [beschreibung, erlaeuterungen] = ["Beschreibung", "Erläuterungen"].map(
+    (heading) =>
+      tidyParagraphs(extractSection(fullText, heading, SECTION_HEADINGS))
+  );
+  if (beschreibung) sectionHits.beschreibung += 1;
+  if (erlaeuterungen) sectionHits.erlaeuterungen += 1;
+
+  out[pgNum] = {
+    pgNumber: pgNum,
+    name: namesByPg.get(pgNum) ?? "",
+    beschreibung: beschreibung ?? null,
+    erlaeuterungen: erlaeuterungen ?? null,
+    // { lineNum: [ { name, values: { year: euro } } ] }
+    breakdowns: parseBreakdowns(erlaeuterungen),
+  };
+}
+
+console.log(`  with Beschreibung:   ${sectionHits.beschreibung}`);
+console.log(`  with Erläuterungen:  ${sectionHits.erlaeuterungen}`);
+
+// ── Step 4: Write JSON ────────────────────────────────────────────
+mkdirSync("data/extracted", { recursive: true });
+const serialized = JSON.stringify(out, null, 2);
+writeFileSync(OUT_PATH, serialized, "utf8");
+console.log(`Wrote ${OUT_PATH}`);