9a958c0051
Editorial single-page viewer for the City of Münster's 2026/2027 budget draft, built as an Astro v6 SPA with a 4-level zoomable icicle (Produktbereich → Produktgruppe → Category → Breakdown). Highlights: - Multi-flow data layer over the official open-data CSVs (Aufwendungen + Erträge, 2008–2028) with overlap reconciliation across plan years. - Year slider as a 21-year mini-histogram of both flows; drag-to-scrub and click-to-jump, with bars morphing via CSS transitions on SVG geometry attributes. - Vertically centred icicle with year-outline rectangles framing each year's relative budget size, à la Bostock's animated treemap. - Headline "ausgibt / einnimmt" toggle; sidebar Aufwendungen/Erträge rows double as flow toggles. Active flow in Aufwendungen-purple / Erträge-orange (OKLCH). - Click-to-zoom via path-keyed lookup with ZOOM_COL_BOUNDS that reallocate the depth axis per zoom state. Zoomed item moves to the sidebar; canvas shows its descendants only (no adjacent-block leaks). - Sidebar shows path-specific Aufwendungen/Erträge/Saldo plus the source-PDF Beschreibung; Erläuterungen behind a collapsed details. - Build-time PDF extraction (scripts/extract-pg-sections.mjs) parses 68 Produktgruppen' Beschreibung + Erläuterungen sections from Band 1, including 10 cells of structured Mio.-€ breakdowns (Steuern, Transferaufwendungen, etc.) that drive the level-4 view. - URL state sync for path, year, and flow via history.replaceState so any zoom is shareable. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
334 lines
12 KiB
JavaScript
334 lines
12 KiB
JavaScript
#!/usr/bin/env node
|
||
//
|
||
// Extract per-Produktgruppe Beschreibung + Erläuterungen sections from
|
||
// the Haushaltsplan 2026/2027 Band 1 PDF.
|
||
//
|
||
// Each Produktgruppe in Münster's NKF-style budget has a stable section
|
||
// layout. We use pdftotext -layout to get text with form-feed page
|
||
// breaks, find each PG's pages by its running header, then split each
|
||
// PG's combined text by its section headings ("Beschreibung",
|
||
// "Erläuterungen", etc.).
|
||
//
|
||
// Usage:
|
||
// node scripts/extract-pg-sections.mjs
|
||
// Output:
|
||
// data/extracted/pg-sections-2026.json
|
||
|
||
import { execSync } from "node:child_process";
|
||
import { mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
||
import { tmpdir } from "node:os";
|
||
import { join } from "node:path";
|
||
|
||
const PDF_PATH = "docs/sources/2026_2027/Haushaltsplanentwurf_2026-2027_Band_1.pdf";
|
||
const OUT_PATH = "data/extracted/pg-sections-2026.json";
|
||
|
||
// ── Step 1: PDF → text with form-feed page boundaries ─────────────
|
||
const txtPath = join(tmpdir(), "ms-haushalt-band1.txt");
|
||
execSync(`pdftotext -layout "${PDF_PATH}" "${txtPath}"`);
|
||
const rawText = readFileSync(txtPath, "utf8");
|
||
const pages = rawText.split("\f");
|
||
console.log(`PDF pages: ${pages.length}`);
|
||
|
||
// ── Step 2: Group pages by Produktgruppe number ───────────────────
|
||
// Each Teilplan page has a running header with this layout:
|
||
// Haushaltsplan 2026/2027 <PG-NAME> Dezernat <X>
|
||
// Ausschuss: <ABBR> Produktgruppe NNNN <Amt>
|
||
// The PG number appears on the second header line. The name appears on
|
||
// the first header line, sandwiched between the "Haushaltsplan" cell
|
||
// and the "Dezernat …" cell.
|
||
const HEADER_RE_PG = /Produktgruppe\s+(\d{4})\b/;
|
||
const HEADER_RE_NAME = /^Haushaltsplan\s+\d{4}\/\d{4}\s{2,}(.+?)\s{2,}Dezernat\s/m;
|
||
|
||
const pagesByPg = new Map(); // pgNumber → array of page texts
|
||
const namesByPg = new Map(); // pgNumber → display name
|
||
|
||
for (const page of pages) {
|
||
const headerSlice = page.slice(0, 600);
|
||
const pgMatch = headerSlice.match(HEADER_RE_PG);
|
||
if (!pgMatch) continue;
|
||
const pgNum = pgMatch[1];
|
||
if (!pagesByPg.has(pgNum)) pagesByPg.set(pgNum, []);
|
||
pagesByPg.get(pgNum).push(page);
|
||
if (!namesByPg.has(pgNum)) {
|
||
const nameMatch = headerSlice.match(HEADER_RE_NAME);
|
||
if (nameMatch) namesByPg.set(pgNum, nameMatch[1].trim());
|
||
}
|
||
}
|
||
console.log(`Produktgruppen found: ${pagesByPg.size}`);
|
||
|
||
// ── Step 3: Extract sections per PG ───────────────────────────────
|
||
const SECTION_HEADINGS = [
|
||
"Beschreibung",
|
||
"Besonderheiten in den Planjahren",
|
||
"Ziele",
|
||
"Zielkennzahlen",
|
||
"Standardkennzahlen",
|
||
"Bewirtschaftungsregeln",
|
||
"Erläuterungen",
|
||
];
|
||
|
||
/** Strip page-running headers/footers and stray page numbers. */
|
||
function cleanPageNoise(text) {
|
||
return text
|
||
.split("\n")
|
||
.filter((line) => {
|
||
const t = line.trim();
|
||
if (t === "") return true;
|
||
if (/^Haushaltsplan\s+\d{4}\/\d{4}/.test(t)) return false;
|
||
if (/^Ausschuss:/.test(t)) return false;
|
||
// Page-number lines: a single integer (with possible surrounding ws)
|
||
if (/^\d{1,4}$/.test(t)) return false;
|
||
return true;
|
||
})
|
||
.join("\n");
|
||
}
|
||
|
||
/** Find a section by its heading and return text up to the next known
|
||
* heading. Headings are matched as full-line tokens — they appear
|
||
* flush-left, sometimes with trailing colon. */
|
||
function extractSection(text, heading, otherHeadings) {
|
||
const startRe = new RegExp(`^${escapeRe(heading)}:?\\s*$`, "m");
|
||
const startMatch = startRe.exec(text);
|
||
if (!startMatch) return null;
|
||
const sliceStart = startMatch.index + startMatch[0].length;
|
||
const tail = text.slice(sliceStart);
|
||
|
||
let endIdx = tail.length;
|
||
for (const h of otherHeadings) {
|
||
if (h === heading) continue;
|
||
const r = new RegExp(`^${escapeRe(h)}:?\\s*$`, "m");
|
||
const m = r.exec(tail);
|
||
if (m && m.index < endIdx) endIdx = m.index;
|
||
}
|
||
// Also stop at the start of a per-Produkt block ("Produkt NNNNNN -")
|
||
// and at the financial table headers — these can sit indented with
|
||
// leading whitespace on the line, so allow `^\s*`.
|
||
for (const r of [
|
||
/^\s*Produkt\s+\d{6}\s*-/m,
|
||
/^\s*Teilergebnisplan\b/m,
|
||
/^\s*Teilfinanzplan\b/m,
|
||
/^\s*Investitionsmaßnahmen\b/m,
|
||
/^\s*Investitionen\s+gesamt\b/m,
|
||
/^\s*Verpflichtungsermächtigungen\b/m,
|
||
]) {
|
||
const m = r.exec(tail);
|
||
if (m && m.index < endIdx) endIdx = m.index;
|
||
}
|
||
return tail.slice(0, endIdx).trim();
|
||
}
|
||
|
||
function escapeRe(s) {
|
||
return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||
}
|
||
|
||
/** Strip leading + trailing whitespace per line, collapse multiple
|
||
* blank lines to one. Preserves paragraph breaks. */
|
||
function tidyParagraphs(text) {
|
||
if (!text) return text;
|
||
return text
|
||
.split("\n")
|
||
.map((l) => l.trim())
|
||
.join("\n")
|
||
.replace(/\n{3,}/g, "\n\n")
|
||
.trim();
|
||
}
|
||
|
||
/**
|
||
* Parse breakdown tables embedded in Erläuterungen text.
|
||
*
|
||
* Format we recognize (PG 1601's "zu Zeile 01"):
|
||
* zu Zeile NN:
|
||
* …prose…
|
||
* Aufwandsart Ansatz YYYY in Mio. € Ansatz YYYY in Mio. €
|
||
* <name> <value> <value>
|
||
* …
|
||
*
|
||
* Returns: { lineNumber: [ { name, values: { year: euro } } ] }
|
||
*
|
||
* Numbers in the table are written in Mio. €; we convert to plain
|
||
* euros to match our CSV value units.
|
||
*/
|
||
function parseBreakdowns(erlText) {
|
||
if (!erlText) return {};
|
||
const out = {};
|
||
|
||
// Split into "zu Zeile NN" sections (case-insensitive — some PGs
|
||
// capitalize "Zu Zeile" at the start of a paragraph).
|
||
const sectionRe = /zu\s+Zeile\s+(\d{1,2})(?:[\s,–-]+\d+)*\s*:?/gi;
|
||
const sections = [];
|
||
let m;
|
||
while ((m = sectionRe.exec(erlText)) !== null) {
|
||
const lineNum = parseInt(m[1], 10);
|
||
sections.push({ lineNum, start: m.index, headerEnd: sectionRe.lastIndex });
|
||
}
|
||
for (let i = 0; i < sections.length; i++) {
|
||
const s = sections[i];
|
||
const end = i + 1 < sections.length ? sections[i + 1].start : erlText.length;
|
||
const body = erlText.slice(s.headerEnd, end);
|
||
|
||
// Look for a table header line: at least one "Ansatz YYYY" near a
|
||
// "Mio." unit. If there's no such header, skip the table parse
|
||
// and try the bullet-list fallback at the bottom of this loop.
|
||
const headerLineRe = /^.*Ansatz\s+\d{4}.*Mio\..*$/m;
|
||
const hMatch = body.match(headerLineRe);
|
||
let tableItems = [];
|
||
if (hMatch) {
|
||
const years = [...hMatch[0].matchAll(/Ansatz\s+(\d{4})/g)].map(
|
||
(m) => Number(m[1])
|
||
);
|
||
if (years.length > 0) {
|
||
tableItems = parseTableRows(body, hMatch, years);
|
||
}
|
||
}
|
||
|
||
if (tableItems.length > 0) {
|
||
out[s.lineNum] = tableItems;
|
||
continue;
|
||
}
|
||
// No table — try bullet-list parsing on the section body.
|
||
const bulletItems = parseBulletItems(body);
|
||
if (bulletItems.length > 0) out[s.lineNum] = bulletItems;
|
||
}
|
||
|
||
return out;
|
||
}
|
||
|
||
/** Extract table rows from a section body given the matched header
|
||
* line and the years that header announced. Returns an array of
|
||
* { name, values: { year: euro } } items. */
|
||
function parseTableRows(body, hMatch, years) {
|
||
const headerEndIdx = (hMatch.index ?? 0) + hMatch[0].length;
|
||
const remainder = body.slice(headerEndIdx);
|
||
const rowLines = [];
|
||
for (const line of remainder.split("\n")) {
|
||
const trimmed = line.trim();
|
||
if (trimmed === "") {
|
||
if (rowLines.length > 0) break; // table ended
|
||
continue;
|
||
}
|
||
// Heuristic: a row has a decimal/integer number near the end.
|
||
// Lines without trailing numbers are paragraph continuations.
|
||
if (!/[\d]+(?:[.,]\d+)?\s*$/.test(trimmed)) {
|
||
if (rowLines.length > 0) break;
|
||
continue;
|
||
}
|
||
rowLines.push(trimmed);
|
||
}
|
||
|
||
const items = [];
|
||
for (const row of rowLines) {
|
||
const numRe = /(-?\d{1,3}(?:\.\d{3})*(?:,\d+)?|-?\d+(?:,\d+)?)/g;
|
||
const nums = [...row.matchAll(numRe)].map((m) => m[0]);
|
||
if (nums.length < years.length) continue;
|
||
const tailNums = nums.slice(-years.length);
|
||
let name = row;
|
||
for (const n of [...tailNums].reverse()) {
|
||
const idx = name.lastIndexOf(n);
|
||
if (idx >= 0) name = name.slice(0, idx);
|
||
}
|
||
name = name.trim();
|
||
if (!name) continue;
|
||
|
||
const values = {};
|
||
for (let k = 0; k < years.length; k++) {
|
||
const raw = tailNums[k];
|
||
const num = parseFloat(raw.replace(/\./g, "").replace(",", "."));
|
||
if (!Number.isFinite(num)) continue;
|
||
values[years[k]] = num * 1_000_000; // Mio. € → €
|
||
}
|
||
if (Object.keys(values).length > 0) items.push({ name, values });
|
||
}
|
||
return items;
|
||
}
|
||
|
||
/**
|
||
* Parse bullet-list breakdowns. Each line starting with "-" or "•"
|
||
* with one or more "X Mio. Euro (YYYY)" / "X Euro (YYYY)" patterns
|
||
* becomes an item. Common forms in Münster's Erläuterungen:
|
||
*
|
||
* - Westf. Zoo Münster GmbH i. H. v. 4,10 Mio. Euro (2026) / 4,1 Mio Euro (2027)
|
||
* - Lernmittel = 1.435.000 Euro (2026), 1.435.000 Euro (2027)
|
||
* - Erträge … in Höhe von 247.630 Euro (2026), 252.680 Euro (2027)
|
||
*/
|
||
function parseBulletItems(text) {
|
||
const items = [];
|
||
const valueRe =
|
||
/(\d{1,3}(?:\.\d{3})*(?:,\d+)?|\d+(?:,\d+)?)\s*(Mio\.?\s*(?:Euro|€)|Euro|€)\s*\(?(\d{4})\)?/gi;
|
||
// Strip "(Gesamtsumme: NNN Euro (YYYY))" parentheticals down to
|
||
// just "(YYYY)" so the per-item parser sees a clean
|
||
// `value Euro (year)` shape. The PG 0301 line 13/16 bullets use
|
||
// this nested form to reference the category total inline; we keep
|
||
// the year tag (which the item's value belongs to) and drop the
|
||
// total reference.
|
||
const gesamtRe =
|
||
/\(\s*Gesamtsumme:[^()]*\((20\d{2})\)[^()]*\)/g;
|
||
|
||
for (const rawLine of text.split("\n")) {
|
||
if (!/^\s*[-•]\s+/.test(rawLine)) continue;
|
||
const line = rawLine.replace(gesamtRe, "($1)");
|
||
|
||
const values = [];
|
||
let firstValIdx = -1;
|
||
let m;
|
||
valueRe.lastIndex = 0;
|
||
while ((m = valueRe.exec(line)) !== null) {
|
||
if (firstValIdx === -1) firstValIdx = m.index;
|
||
const num = parseFloat(
|
||
m[1].replace(/\./g, "").replace(",", ".")
|
||
);
|
||
if (!Number.isFinite(num)) continue;
|
||
const isMio = /Mio/i.test(m[2]);
|
||
const value = isMio ? num * 1_000_000 : num;
|
||
const year = parseInt(m[3], 10);
|
||
values.push({ year, value });
|
||
}
|
||
if (values.length === 0) continue;
|
||
|
||
// The name is everything before the first value, minus the
|
||
// bullet marker and trailing value indicators.
|
||
let name = line.slice(0, firstValIdx).replace(/^\s*[-•]\s+/, "").trim();
|
||
name = name
|
||
.replace(/\s+(?:i\.\s*H\.\s*v\.?|in\s+H[öo]he\s+von|=)\s*$/i, "")
|
||
.replace(/\s+\(.*?\)\s*$/, "") // drop trailing parenthetical
|
||
.replace(/\s+[-–]\s+hier:.*$/i, "") // drop "- hier: ..." asides
|
||
.trim();
|
||
if (!name || name.length < 2) continue;
|
||
|
||
const valuesByYear = {};
|
||
for (const v of values) valuesByYear[v.year] = v.value;
|
||
items.push({ name, values: valuesByYear });
|
||
}
|
||
return items;
|
||
}
|
||
|
||
const out = {};
|
||
let withBeschreibung = 0;
|
||
let withErlauterungen = 0;
|
||
for (const [pgNum, pgPages] of pagesByPg) {
|
||
const fullText = cleanPageNoise(pgPages.join("\n\n"));
|
||
const beschreibung = tidyParagraphs(
|
||
extractSection(fullText, "Beschreibung", SECTION_HEADINGS)
|
||
);
|
||
const erlaeuterungen = tidyParagraphs(
|
||
extractSection(fullText, "Erläuterungen", SECTION_HEADINGS)
|
||
);
|
||
if (beschreibung) withBeschreibung++;
|
||
if (erlaeuterungen) withErlauterungen++;
|
||
const breakdowns = parseBreakdowns(erlaeuterungen);
|
||
out[pgNum] = {
|
||
pgNumber: pgNum,
|
||
name: namesByPg.get(pgNum) ?? "",
|
||
beschreibung: beschreibung ?? null,
|
||
erlaeuterungen: erlaeuterungen ?? null,
|
||
breakdowns, // { lineNum: [ { name, values: { year: euro } } ] }
|
||
};
|
||
}
|
||
|
||
console.log(` with Beschreibung: ${withBeschreibung}`);
|
||
console.log(` with Erläuterungen: ${withErlauterungen}`);
|
||
|
||
// ── Step 4: Write JSON ────────────────────────────────────────────
|
||
mkdirSync("data/extracted", { recursive: true });
|
||
writeFileSync(OUT_PATH, JSON.stringify(out, null, 2), "utf8");
|
||
console.log(`Wrote ${OUT_PATH}`);
|