feat: initial Münster Haushalt icicle viewer
Editorial single-page viewer for the City of Münster's 2026/2027 budget draft, built as an Astro v6 SPA with a 4-level zoomable icicle (Produktbereich → Produktgruppe → Category → Breakdown). Highlights: - Multi-flow data layer over the official open-data CSVs (Aufwendungen + Erträge, 2008–2028) with overlap reconciliation across plan years. - Year slider as a 21-year mini-histogram of both flows; drag-to-scrub and click-to-jump, with bars morphing via CSS transitions on SVG geometry attributes. - Vertically centred icicle with year-outline rectangles framing each year's relative budget size, à la Bostock's animated treemap. - Headline "ausgibt / einnimmt" toggle; sidebar Aufwendungen/Erträge rows double as flow toggles. Active flow in Aufwendungen-purple / Erträge-orange (OKLCH). - Click-to-zoom via path-keyed lookup with ZOOM_COL_BOUNDS that reallocate the depth axis per zoom state. Zoomed item moves to the sidebar; canvas shows its descendants only (no adjacent-block leaks). - Sidebar shows path-specific Aufwendungen/Erträge/Saldo plus the source-PDF Beschreibung; Erläuterungen behind a collapsed details. - Build-time PDF extraction (scripts/extract-pg-sections.mjs) parses 68 Produktgruppen' Beschreibung + Erläuterungen sections from Band 1, including 10 cells of structured Mio.-€ breakdowns (Steuern, Transferaufwendungen, etc.) that drive the level-4 view. - URL state sync for path, year, and flow via history.replaceState so any zoom is shareable. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,333 @@
|
||||
#!/usr/bin/env node
|
||||
//
|
||||
// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
|
||||
// the Haushaltsplan 2026/2027 Band 1 PDF.
|
||||
//
|
||||
// Each Produktgruppe in Münster's NKF-style budget has a stable section
|
||||
// layout. We use pdftotext -layout to get text with form-feed page
|
||||
// breaks, find each PG's pages by its running header, then split each
|
||||
// PG's combined text by its section headings ("Beschreibung",
|
||||
// "Erläuterungen", etc.).
|
||||
//
|
||||
// Usage:
|
||||
// node scripts/extract-pg-sections.mjs
|
||||
// Output:
|
||||
// data/extracted/pg-sections-2026.json
|
||||
|
||||
import { execFileSync, execSync } from "node:child_process";
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { dirname, join } from "node:path";
|
||||
|
||||
const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
const OUT_PATH = "data/extracted/pg-sections-2026.json";

// ── Step 1: PDF → text with form-feed page boundaries ─────────────
// pdftotext is started directly with an argument array (no shell), so
// neither the PDF path nor the temp path has to be quoted or escaped —
// the temp directory comes from the environment and may contain
// characters that would break a shell command string.
const txtPath = join(tmpdir(), "ms-haushalt-band1.txt");
execFileSync("pdftotext", ["-layout", PDF_PATH, txtPath]);
const rawText = readFileSync(txtPath, "utf8");
const pages = rawText.split("\f");
// When the text ends with a form feed, split() leaves an empty trailing
// chunk that is not a real page; drop it so the count below is accurate.
if (pages.length > 1 && pages[pages.length - 1].trim() === "") pages.pop();
console.log(`PDF pages: ${pages.length}`);
|
||||
|
||||
// ── Step 2: Group pages by Produktgruppe number ───────────────────
|
||||
// Each Teilplan page has a running header with this layout:
|
||||
// Haushaltsplan 2026/2027 <PG-NAME> Dezernat <X>
|
||||
// Ausschuss: <ABBR> Produktgruppe NNNN <Amt>
|
||||
// The PG number appears on the second header line. The name appears on
|
||||
// the first header line, sandwiched between the "Haushaltsplan" cell
|
||||
// and the "Dezernat …" cell.
|
||||
// Matches the 4-digit Produktgruppe number in the running header.
const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
// Captures the text between the "Haushaltsplan YYYY/YYYY" cell and the
// "Dezernat" cell on the same header line.
const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;

const pagesByPg = new Map(); // pgNumber → array of page texts
const namesByPg = new Map(); // pgNumber → display name

pages.forEach((pageText) => {
  // Only the first 600 characters of a page are searched for the header.
  const head = pageText.slice(0, 600);
  const pgNum = HEADER_RE_PG.exec(head)?.[1];
  if (pgNum === undefined) return; // page has no Produktgruppe header

  const bucket = pagesByPg.get(pgNum);
  if (bucket === undefined) {
    pagesByPg.set(pgNum, [pageText]);
  } else {
    bucket.push(pageText);
  }

  // The first page that yields a name wins; later pages are not consulted.
  if (namesByPg.has(pgNum)) return;
  const title = HEADER_RE_NAME.exec(head);
  if (title !== null) namesByPg.set(pgNum, title[1].trim());
});
console.log(`Produktgruppen found: ${pagesByPg.size}`);
|
||||
|
||||
// ── Step 3: Extract sections per PG ───────────────────────────────
|
||||
// Section headings recognised inside a Produktgruppe's text. They are
// handed to extractSection as the list of headings that end a section.
const SECTION_HEADINGS = [
  "Beschreibung",
  "Besonderheiten in den Planjahren",
  "Ziele",
  "Zielkennzahlen",
  "Standardkennzahlen",
  "Bewirtschaftungsregeln",
  "Erläuterungen",
];
|
||||
|
||||
/**
 * Remove per-page boilerplate lines from extracted text.
 *
 * A line is dropped when its trimmed content starts with
 * "Haushaltsplan YYYY/YYYY", starts with "Ausschuss:", or consists solely
 * of one to four digits (a bare page number). Blank lines and all other
 * lines are kept unchanged, including their original indentation.
 *
 * @param {string} text - Newline-separated text.
 * @returns {string} The text without the boilerplate lines.
 */
function cleanPageNoise(text) {
  const noisePatterns = [
    /^Haushaltsplan\s+\d{4}\/\d{4}/,
    /^Ausschuss:/,
    /^\d{1,4}$/,
  ];
  const kept = [];
  for (const line of text.split("\n")) {
    const stripped = line.trim();
    const isNoise =
      stripped !== "" && noisePatterns.some((pattern) => pattern.test(stripped));
    if (!isNoise) kept.push(line);
  }
  return kept.join("\n");
}
|
||||
|
||||
/**
 * Return the text that follows a section heading, up to the next stop
 * marker.
 *
 * A heading counts only when it fills a whole line, optionally followed
 * by a colon. The section ends at the earliest of: any other heading from
 * `otherHeadings`, a per-Produkt block ("Produkt NNNNNN -"), or one of
 * the financial table titles. Those last two groups may be indented.
 *
 * @param {string} text - Text to search.
 * @param {string} heading - Heading that opens the wanted section.
 * @param {Iterable<string>} otherHeadings - Headings that close a section;
 *   an entry equal to `heading` is ignored.
 * @returns {string|null} Trimmed section text, or null when the heading
 *   does not occur.
 */
function extractSection(text, heading, otherHeadings) {
  const headingLine = (title) => new RegExp(`^${escapeRe(title)}:?\\s*$`, "m");

  const opening = headingLine(heading).exec(text);
  if (opening === null) return null;
  const rest = text.slice(opening.index + opening[0].length);

  const stopPatterns = [
    ...Array.from(otherHeadings)
      .filter((title) => title !== heading)
      .map((title) => headingLine(title)),
    /^\s*Produkt\s+\d{6}\s*-/m,
    /^\s*Teilergebnisplan\b/m,
    /^\s*Teilfinanzplan\b/m,
    /^\s*Investitionsmaßnahmen\b/m,
    /^\s*Investitionen\s+gesamt\b/m,
    /^\s*Verpflichtungsermächtigungen\b/m,
  ];

  // The earliest stop marker wins; with none, the section runs to the end.
  let cut = rest.length;
  for (const pattern of stopPatterns) {
    const hit = pattern.exec(rest);
    if (hit !== null && hit.index < cut) cut = hit.index;
  }
  return rest.slice(0, cut).trim();
}
|
||||
|
||||
/**
 * Backslash-escape every regular-expression metacharacter in `s` so the
 * result can be embedded in a RegExp source as a literal.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with each metacharacter prefixed by a backslash.
 */
function escapeRe(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
|
||||
|
||||
/**
 * Normalise whitespace in a block of text: trim every line, squeeze runs
 * of three or more newlines down to a single blank line, and trim the
 * result. Falsy input (null, undefined, "") is returned as-is.
 *
 * @param {string|null|undefined} text - Text to tidy.
 * @returns {string|null|undefined} Tidied text, or the falsy input.
 */
function tidyParagraphs(text) {
  if (!text) return text;
  const trimmedLines = [];
  for (const line of text.split("\n")) {
    trimmedLines.push(line.trim());
  }
  const joined = trimmedLines.join("\n");
  return joined.replace(/\n{3,}/g, "\n\n").trim();
}
|
||||
|
||||
/**
 * Collect the breakdowns embedded in an Erläuterungen text.
 *
 * The text is cut into sections at every "zu Zeile NN" marker (matched
 * case-insensitively; further numbers listed after the first, e.g.
 * "zu Zeile 13, 16:", are consumed as part of the marker but only the
 * first number is used as the key). Each section is parsed as a table
 * when it contains a header line with "Ansatz YYYY" and "Mio."; if that
 * yields no rows, the section is parsed as a bullet list instead.
 *
 * @param {string|null|undefined} erlText - Erläuterungen text.
 * @returns {Object<number, Array<{name: string, values: Object<number, number>}>>}
 *   Items keyed by line number; sections without items are omitted. A
 *   later section with the same line number replaces an earlier one.
 */
function parseBreakdowns(erlText) {
  if (!erlText) return {};

  const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,–-]+\d+)*\s*:?/gi;
  const markers = Array.from(erlText.matchAll(sectionRe), (hit) => ({
    lineNum: parseInt(hit[1], 10),
    start: hit.index,
    headerEnd: hit.index + hit[0].length,
  }));

  const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
  const result = {};

  markers.forEach((marker, pos) => {
    // A section body runs from just after its marker to the next marker.
    const nextStart =
      pos + 1 < markers.length ? markers[pos + 1].start : erlText.length;
    const body = erlText.slice(marker.headerEnd, nextStart);

    let items = [];
    const header = body.match(headerLineRe);
    if (header) {
      const years = Array.from(
        header[0].matchAll(/Ansatz\s+(\d{4})/g),
        (yearHit) => Number(yearHit[1])
      );
      if (years.length > 0) items = parseTableRows(body, header, years);
    }

    // No usable table — fall back to bullet-list parsing.
    if (items.length === 0) items = parseBulletItems(body);

    if (items.length > 0) result[marker.lineNum] = items;
  });

  return result;
}
|
||||
|
||||
/**
 * Extract table rows from a section body, given the matched header line
 * and the years that header announced.
 *
 * Rows are the consecutive lines after the header that end in a number;
 * the table ends at the first blank or non-numeric line after at least
 * one row was seen. For each row the last `years.length` numbers are the
 * values (German notation: "." groups thousands, "," is the decimal
 * mark) and everything before the first of them is the row name.
 *
 * Values are written in Mio. € and converted to euros. The result is
 * rounded to whole cents because a plain `x * 1_000_000` leaks binary
 * floating-point error into the output (e.g. 4,1 → 4099999.9999999995).
 *
 * @param {string} body - Section text containing the table.
 * @param {RegExpMatchArray} hMatch - Match of the header line within
 *   `body`; its `index` marks where the header starts.
 * @param {number[]} years - Years named in the header, in column order.
 * @returns {Array<{name: string, values: Object<number, number>}>}
 *   One item per parsed row; rows without a name or with fewer numbers
 *   than years are skipped.
 */
function parseTableRows(body, hMatch, years) {
  if (years.length === 0) return [];

  const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
  const rowLines = [];
  for (const line of body.slice(headerEndIdx).split("\n")) {
    const trimmed = line.trim();
    // Heuristic: a row has a decimal/integer number at its very end.
    const looksLikeRow = trimmed !== "" && /\d+(?:[.,]\d+)?\s*$/.test(trimmed);
    if (looksLikeRow) {
      rowLines.push(trimmed);
      continue;
    }
    if (rowLines.length > 0) break; // table ended
  }

  // Either a thousands-grouped integer part (needs at least one ".ddd"
  // group) or a plain digit run, then an optional ",dd" fraction. Requiring
  // the group keeps an ungrouped value such as "1234,5" in one piece
  // instead of splitting it into "123" and "4,5".
  const numRe = /-?(?:\d{1,3}(?:\.\d{3})+(?!\d)|\d+)(?:,\d+)?/g;

  const items = [];
  for (const row of rowLines) {
    const matches = [...row.matchAll(numRe)];
    if (matches.length < years.length) continue;
    const valueMatches = matches.slice(-years.length);

    // Cut the name at the position of the first value column rather than
    // searching for the number text, which could hit an unrelated spot.
    const name = row.slice(0, valueMatches[0].index).trim();
    if (!name) continue;

    const values = {};
    valueMatches.forEach((match, column) => {
      const num = parseFloat(match[0].replace(/\./g, "").replace(",", "."));
      if (!Number.isFinite(num)) return;
      values[years[column]] = Math.round(num * 1e8) / 100; // Mio. € → €
    });
    if (Object.keys(values).length > 0) items.push({ name, values });
  }
  return items;
}
|
||||
|
||||
/**
 * Parse bullet-list breakdowns. Every line that starts with "-" or "•"
 * and contains at least one "X Mio. Euro (YYYY)" / "X Euro (YYYY)" value
 * becomes an item. Forms handled:
 *
 *   - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
 *   - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
 *   - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
 *
 * Amounts use German notation ("." groups thousands, "," is the decimal
 * mark). "Mio." amounts are converted to euros and rounded to whole cents
 * so binary floating-point error does not reach the output
 * (4,1 Mio. would otherwise become 4099999.9999999995).
 *
 * @param {string} text - Section text, one bullet per line.
 * @returns {Array<{name: string, values: Object<number, number>}>}
 *   One item per bullet that has a name of at least two characters and
 *   at least one value; if a year occurs twice the later value wins.
 */
function parseBulletItems(text) {
  const bulletRe = /^\s*[-•]\s+/;
  const valueRe =
    /(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
  // "(Gesamtsumme: NNN Euro (YYYY))" references the category total inline.
  // Reduce it to "(YYYY)" so the item's own amount keeps its year tag and
  // the total is not mistaken for a value.
  const gesamtRe =
    /\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;

  const items = [];
  for (const rawLine of text.split("\n")) {
    if (!bulletRe.test(rawLine)) continue;
    const line = rawLine.replace(gesamtRe, "($1)");

    const matches = [...line.matchAll(valueRe)];
    if (matches.length === 0) continue;

    const valuesByYear = {};
    for (const match of matches) {
      const num = parseFloat(match[1].replace(/\./g, "").replace(",", "."));
      if (!Number.isFinite(num)) continue;
      const isMio = /Mio/i.test(match[2]);
      const year = parseInt(match[3], 10);
      valuesByYear[year] = isMio ? Math.round(num * 1e8) / 100 : num;
    }
    if (Object.keys(valuesByYear).length === 0) continue;

    // The name is everything before the first value, minus the bullet
    // marker and trailing value indicators.
    const name = line
      .slice(0, matches[0].index)
      .replace(bulletRe, "")
      .trim()
      .replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
      .replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
      .replace(/\s+[-–]\s+hier:.*$/i, "") // drop "- hier: ..." asides
      .trim();
    if (!name || name.length < 2) continue;

    items.push({ name, values: valuesByYear });
  }
  return items;
}
|
||||
|
||||
// Result object keyed by Produktgruppe number, plus counters for the
// summary printed after the loop.
const out = {};
let withBeschreibung = 0;
let withErlauterungen = 0;

pagesByPg.forEach((pgPages, pgNum) => {
  const fullText = cleanPageNoise(pgPages.join("\n\n"));
  // Extract one named section and normalise its whitespace.
  const readSection = (heading) =>
    tidyParagraphs(extractSection(fullText, heading, SECTION_HEADINGS));

  const beschreibung = readSection("Beschreibung");
  const erlaeuterungen = readSection("Erläuterungen");

  // Missing or empty sections are not counted.
  withBeschreibung += beschreibung ? 1 : 0;
  withErlauterungen += erlaeuterungen ? 1 : 0;

  out[pgNum] = {
    pgNumber: pgNum,
    name: namesByPg.get(pgNum) ?? "",
    beschreibung: beschreibung ?? null,
    erlaeuterungen: erlaeuterungen ?? null,
    // { lineNum: [ { name, values: { year: euro } } ] }
    breakdowns: parseBreakdowns(erlaeuterungen),
  };
});

console.log(` with Beschreibung: ${withBeschreibung}`);
console.log(` with Erläuterungen: ${withErlauterungen}`);
|
||||
|
||||
// ── Step 4: Write JSON ────────────────────────────────────────────
// Create the directory that OUT_PATH actually points into, so changing
// OUT_PATH never leaves the target directory missing.
mkdirSync(dirname(OUT_PATH), { recursive: true });
writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8");
console.log(`Wrote ${OUT_PATH}`);
|
||||
Reference in New Issue
Block a user