Files
xsl-validator/src/saxon_pool_s9api.py
T
info f59e51c081 Performance: Lazy Worker-Pool Init + XSL-Stylesheet-Caching
RAM-Optimierung (Lazy Loading):
- Worker-Pools werden erst bei Transformation gestartet (nicht beim Projekt-Öffnen)
- Worker-Pools werden nach Transformation automatisch beendet
- RAM im Ruhezustand: 0 MB (vorher: ~1.2 GB)
- Temporäre Verzeichnisse werden sauber aufgeräumt

XSL-Stylesheet-Caching (Massive Performance-Steigerung):
- Saxon s9api: HashMap<String, XsltExecutable> Cache
- Saxon JAXP: HashMap<String, Templates> Cache
- Kompilierte Stylesheets werden pro Worker wiederverwendet
- Bei 82 Transformationen mit 8 XSL-Dateien:
  * 1. Durchlauf: 8× Kompilierung
  * Weitere 74×: Cache-Treffer (sehr schnell!)

Technische Details:
- Worker-Pool-Init verschoben von _on_project_opened zu _start_transformation
- Worker-Pool-Shutdown in _on_all_transformations_finished
- Java-seitiger HashMap-Cache für beide Saxon-Varianten
- Cache-Logging für Debugging

Perfekt für Dauerbetrieb im Hintergrund!

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-07 18:44:32 +01:00

535 lines
21 KiB
Python

"""
Saxon Worker Pool (s9api) - Persistente JVM-Prozesse für XSLT 2.0/3.0 Transformationen.
Diese Variante verwendet die Saxon s9api API anstatt JAXP und ist für XSLT 2.0 und 3.0 geeignet.
Eliminiert JVM-Startup-Overhead durch Vorinitialisierung von N Worker-Prozessen.
Jeder Worker läuft als Daemon und verarbeitet mehrere Transformationen nacheinander.
"""
import logging
import subprocess
import threading
import time
import psutil
from pathlib import Path
from queue import Queue
from typing import Optional
import tempfile
from worker_metrics import WorkerPoolMetrics
logger = logging.getLogger(__name__)
# Java-Worker-Code für s9api (wird zur Laufzeit kompiliert)
SAXON_S9API_WORKER_JAVA = """
import net.sf.saxon.s9api.*;
import javax.xml.transform.stream.StreamSource;
import java.io.*;
import java.util.HashMap;
import java.util.Map;
public class SaxonS9ApiWorker {
public static void main(String[] args) {
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in));
String line;
// Create Processor once and reuse (equivalent to TransformerFactory)
Processor processor = new Processor(false);
// Cache für kompilierte Stylesheets (Performance-Optimierung)
Map<String, XsltExecutable> stylesheetCache = new HashMap<>();
System.err.println("SaxonS9ApiWorker started and ready (using s9api for XSLT 2.0/3.0 with stylesheet caching)");
System.err.flush();
try {
while ((line = reader.readLine()) != null) {
System.err.println("DEBUG: Received line: " + line.substring(0, Math.min(100, line.length())));
System.err.flush();
if ("EXIT".equals(line.trim())) {
System.err.println("SaxonS9ApiWorker exiting");
break;
}
try {
// Parse job
System.err.println("DEBUG: Parsing job...");
System.err.flush();
String[] parts = line.split("\\\\t");
System.err.println("DEBUG: Parts count: " + parts.length);
System.err.flush();
if (parts.length < 3) {
System.out.println("ERROR: Invalid job format");
System.out.flush();
continue;
}
String sourceXml = parts[0];
String xslStylesheet = parts[1];
String outputFo = parts[2];
// Prüfe ob Stylesheet bereits im Cache ist
XsltExecutable executable;
if (stylesheetCache.containsKey(xslStylesheet)) {
executable = stylesheetCache.get(xslStylesheet);
System.err.println("DEBUG: Using cached stylesheet: " + xslStylesheet);
System.err.flush();
} else {
System.err.println("DEBUG: Compiling and caching stylesheet: " + xslStylesheet);
System.err.flush();
XsltCompiler compiler = processor.newXsltCompiler();
executable = compiler.compile(new StreamSource(new File(xslStylesheet)));
stylesheetCache.put(xslStylesheet, executable);
System.err.println("DEBUG: Stylesheet compiled and cached (cache size: " + stylesheetCache.size() + ")");
System.err.flush();
}
System.err.println("DEBUG: Creating transformer...");
System.err.flush();
// Create transformer
XsltTransformer transformer = executable.load();
// Set source
transformer.setSource(new StreamSource(new File(sourceXml)));
// Set destination
Serializer serializer = processor.newSerializer(new File(outputFo));
transformer.setDestination(serializer);
// Set parameters if present
if (parts.length > 3 && !parts[3].isEmpty()) {
String[] params = parts[3].split("\\\\|\\\\|\\\\|");
for (String param : params) {
if (!param.isEmpty() && param.contains("=")) {
String[] kv = param.split("=", 2);
transformer.setParameter(new QName(kv[0]), new XdmAtomicValue(kv[1]));
System.err.println("DEBUG: Set parameter: " + kv[0] + " = " + kv[1]);
}
}
System.err.flush();
}
System.err.println("DEBUG: Running transformation...");
System.err.flush();
// Transform
transformer.transform();
System.err.println("DEBUG: Transformation completed");
System.err.flush();
System.out.println("OK");
System.out.flush();
} catch (SaxonApiException e) {
System.err.println("DEBUG: SaxonApiException: " + e.getClass().getName());
System.err.flush();
e.printStackTrace(System.err);
String errorMsg = e.getMessage();
if (errorMsg == null || errorMsg.isEmpty()) {
errorMsg = e.getClass().getSimpleName();
}
System.out.println("ERROR: " + errorMsg);
System.out.flush();
} catch (Exception e) {
System.err.println("DEBUG: Job processing exception: " + e.getClass().getName());
System.err.flush();
e.printStackTrace(System.err);
System.out.println("ERROR: " + (e.getMessage() != null ? e.getMessage() : e.getClass().getName()));
System.out.flush();
}
}
} catch (IOException e) {
System.err.println("SaxonS9ApiWorker I/O error: " + e.getMessage());
e.printStackTrace(System.err);
}
}
}
"""
class SaxonWorkerPoolS9Api:
"""
Pool von lang-laufenden JVM-Prozessen für Saxon-Transformationen mit s9api.
Diese Variante verwendet die Saxon s9api API anstatt JAXP und unterstützt
vollständig XSLT 2.0 und 3.0 Transformationen.
"""
def __init__(
self,
num_workers: int,
java_vm_path: Path,
saxon_jar_path: Path,
classpath_cache: dict[Path, str],
log_dir: Optional[Path] = None,
):
"""
Initialisiert den Saxon-Worker-Pool mit s9api.
Args:
num_workers: Anzahl der Worker-Prozesse
java_vm_path: Pfad zur Java VM Binary
saxon_jar_path: Pfad zur Saxon JAR-Datei
classpath_cache: Cache für Saxon-Classpaths
log_dir: Optionales Verzeichnis für Worker-Logs (Standard: temp_dir/temp)
"""
self.num_workers = num_workers
self.java_vm_path = java_vm_path
self.saxon_jar_path = saxon_jar_path
self.classpath_cache = classpath_cache
self.log_dir = log_dir
# Worker-Prozesse und Queues
self.workers: list[subprocess.Popen] = []
self.job_queue: Queue = Queue()
self.result_queue: Queue = Queue()
self.worker_locks: list[threading.Lock] = []
# Temporäres Verzeichnis für kompilierte Java-Klasse
self.temp_dir: Optional[Path] = None
self.worker_class_path: Optional[Path] = None
self.worker_log_dir: Optional[Path] = None
# Performance-Metriken
self.metrics = WorkerPoolMetrics()
# Initialisierung
self._compile_worker_class()
self._start_workers()
logger.info(f"SaxonWorkerPoolS9Api initialisiert mit {num_workers} Workern (XSLT 2.0/3.0)")
def _compile_worker_class(self):
"""Kompiliert die SaxonS9ApiWorker-Java-Klasse."""
start_time = time.time()
try:
# Erstelle temporäres Verzeichnis
self.temp_dir = Path(tempfile.mkdtemp(prefix="saxon_s9api_worker_"))
# Schreibe Java-Quellcode
java_file = self.temp_dir / "SaxonS9ApiWorker.java"
java_file.write_text(SAXON_S9API_WORKER_JAVA, encoding="utf-8")
# Hole Classpath
saxon_dir = self.saxon_jar_path.parent
if saxon_dir in self.classpath_cache:
classpath = self.classpath_cache[saxon_dir]
else:
# Fallback: Baue Classpath neu
import glob
import sys
all_jars = glob.glob(str(saxon_dir / "*.jar"))
lib_dir = saxon_dir / "lib"
if lib_dir.exists():
all_jars.extend(glob.glob(str(lib_dir / "*.jar")))
classpath_separator = ";" if sys.platform == "win32" else ":"
classpath = classpath_separator.join(all_jars)
# Kompiliere Java-Klasse
javac_cmd = [str(self.java_vm_path).replace("java", "javac"), "-cp", classpath, str(java_file)]
logger.debug(f"Kompiliere SaxonS9ApiWorker: {' '.join(javac_cmd)}")
result = subprocess.run(javac_cmd, capture_output=True, text=True, timeout=30)
if result.returncode != 0:
raise RuntimeError(f"Java-Kompilierung fehlgeschlagen: {result.stderr}")
self.worker_class_path = self.temp_dir
# Speichere Kompilierungszeit
self.metrics.compilation_time_seconds = time.time() - start_time
logger.info(
f"SaxonS9ApiWorker erfolgreich kompiliert: {self.temp_dir} "
f"({self.metrics.compilation_time_seconds:.3f}s)"
)
except Exception as e:
logger.error(f"Fehler beim Kompilieren von SaxonS9ApiWorker: {e}")
raise
def _start_workers(self):
"""Startet N Worker-Prozesse."""
# Hole Classpath
saxon_dir = self.saxon_jar_path.parent
if saxon_dir in self.classpath_cache:
classpath = self.classpath_cache[saxon_dir]
else:
# Fallback: Baue Classpath neu (sollte nicht nötig sein, aber zur Sicherheit)
import glob
import sys
all_jars = glob.glob(str(saxon_dir / "*.jar"))
lib_dir = saxon_dir / "lib"
if lib_dir.exists():
all_jars.extend(glob.glob(str(lib_dir / "*.jar")))
classpath_separator = ";" if sys.platform == "win32" else ":"
classpath = classpath_separator.join(all_jars)
# Cache für zukünftige Verwendung
self.classpath_cache[saxon_dir] = classpath
logger.debug(f"Classpath für {saxon_dir} neu erstellt und gecacht")
# Füge Worker-Classpath hinzu
import sys
classpath_separator = ";" if sys.platform == "win32" else ":"
full_classpath = str(self.worker_class_path) + classpath_separator + classpath
logger.debug(f"S9Api Worker Classpath: {full_classpath[:200]}...")
# Bestimme Log-Verzeichnis
self.worker_log_dir = self.log_dir if self.log_dir else self.temp_dir
if self.log_dir:
self.worker_log_dir.mkdir(parents=True, exist_ok=True)
for i in range(self.num_workers):
worker_start_time = time.time()
try:
# Starte JVM-Prozess mit SaxonS9ApiWorker
cmd = [str(self.java_vm_path), "-cp", full_classpath, "SaxonS9ApiWorker"]
# Öffne stderr-Log-Datei für diesen Worker
stderr_log = self.worker_log_dir / f"s9api_worker_{i}_stderr.log"
stderr_file = open(stderr_log, "w", encoding="utf-8")
process = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=stderr_file, # Redirect stderr to file
text=True,
bufsize=1, # Line buffered
)
self.workers.append(process)
self.worker_locks.append(threading.Lock())
logger.debug(f"S9Api Worker {i} gestartet (PID: {process.pid}, stderr: {stderr_log})")
# Warte kurz damit Worker initialisieren kann
time.sleep(0.1)
# Prüfe ob Worker noch läuft
if process.poll() is not None:
# Worker ist bereits beendet - Fehler!
stderr_file.close()
with open(stderr_log, "r") as f:
stderr_content = f.read()
raise RuntimeError(
f"S9Api Worker {i} ist sofort beendet (Exit Code: {process.returncode})\nstderr:\n{stderr_content}"
)
# Speichere Worker-Startzeit
worker_elapsed = time.time() - worker_start_time
self.metrics.worker_start_times.append(worker_elapsed)
except Exception as e:
logger.error(f"Fehler beim Starten von S9Api Worker {i}: {e}")
raise
# Berechne Aggregat-Werte für Worker-Startzeiten
self.metrics.calculate_aggregates()
logger.info(
f"{len(self.workers)} Saxon-S9Api-Worker erfolgreich gestartet "
f"(Summe: {self.metrics.total_worker_start_time_seconds:.3f}s, "
f"Durchschnitt: {self.metrics.average_worker_start_time_seconds:.3f}s)"
)
def transform(
self, source_xml: Path, xsl_stylesheet: Path, output_fo: Path, xslt_params: dict[str, str]
) -> tuple[bool, str]:
"""
Führt eine XSLT-Transformation mit einem Worker aus dem Pool aus.
Args:
source_xml: Pfad zur XML-Eingabedatei
xsl_stylesheet: Pfad zur XSL-Stylesheet-Datei
output_fo: Pfad zur FO-Ausgabedatei
xslt_params: Dictionary mit XSLT-Parametern
Returns:
tuple[bool, str]: (Erfolg, Fehlermeldung/Info)
"""
# Finde freien Worker
worker_idx = None
for i, lock in enumerate(self.worker_locks):
if lock.acquire(blocking=False):
worker_idx = i
break
if worker_idx is None:
# Kein freier Worker, warte auf ersten verfügbaren
for i, lock in enumerate(self.worker_locks):
lock.acquire()
worker_idx = i
break
try:
worker = self.workers[worker_idx]
# Prüfe ob Worker noch läuft
if worker.poll() is not None:
# Worker ist tot!
stderr_log = self.worker_log_dir / f"s9api_worker_{worker_idx}_stderr.log"
try:
with open(stderr_log, "r") as f:
stderr_content = f.read()
error_msg = (
f"S9Api Worker {worker_idx} ist beendet (Exit: {worker.returncode})\nstderr:\n{stderr_content}"
)
except Exception:
error_msg = f"S9Api Worker {worker_idx} ist beendet (Exit: {worker.returncode})"
logger.error(error_msg)
return False, error_msg
# Formatiere Parameter
params_str = "|||".join([f"{key}={value}" for key, value in xslt_params.items()])
# Erstelle Job-String (Tab-separated)
job = f"{source_xml}\t{xsl_stylesheet}\t{output_fo}\t{params_str}\n"
logger.debug(f"Sende Job an S9Api Worker {worker_idx}: {source_xml.name}")
# Sende Job an Worker
worker.stdin.write(job)
worker.stdin.flush()
# Warte auf Antwort
response = worker.stdout.readline().strip()
logger.debug(f"S9Api Worker {worker_idx} Antwort: '{response}'")
if response == "OK":
return True, "Erfolgreich"
elif response.startswith("ERROR:"):
error_msg = response[6:].strip()
return False, f"Saxon-Fehler (s9api): {error_msg}"
else:
# Leere Antwort bedeutet Worker ist crashed
if not response:
stderr_log = self.worker_log_dir / f"s9api_worker_{worker_idx}_stderr.log"
try:
with open(stderr_log, "r") as f:
stderr_content = f.read()[-500:] # Letzte 500 Zeichen
return False, f"S9Api Worker {worker_idx} crashed (keine Antwort)\nstderr:\n{stderr_content}"
except Exception:
return False, f"S9Api Worker {worker_idx} crashed (keine Antwort)"
return False, f"Unerwartete Antwort: {response}"
except Exception as e:
logger.error(f"Fehler bei S9Api Worker {worker_idx}: {e}")
return False, f"Worker-Fehler: {str(e)}"
finally:
# Gebe Worker-Lock frei
self.worker_locks[worker_idx].release()
def measure_ram_usage(self) -> tuple[float, float, list[float]]:
"""
Misst den aktuellen RAM-Verbrauch aller Worker-Prozesse.
Returns:
tuple: (total_mb, average_mb, per_worker_mb_list)
"""
ram_per_worker = []
for i, worker in enumerate(self.workers):
try:
if worker.poll() is None: # Worker läuft noch
process = psutil.Process(worker.pid)
# Hole Speicherinfo (RSS = Resident Set Size in Bytes)
mem_info = process.memory_info()
ram_mb = mem_info.rss / (1024 * 1024) # Konvertiere zu MB
ram_per_worker.append(ram_mb)
else:
logger.warning(f"Worker {i} ist nicht mehr aktiv (kann RAM nicht messen)")
except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
logger.warning(f"Konnte RAM für Worker {i} nicht messen: {e}")
total_ram = sum(ram_per_worker)
average_ram = total_ram / len(ram_per_worker) if ram_per_worker else 0.0
return total_ram, average_ram, ram_per_worker
def capture_ram_before_transform(self):
"""Erfasst RAM-Verbrauch vor der ersten Transformation."""
total, average, per_worker = self.measure_ram_usage()
self.metrics.ram_before_transform_mb_per_worker = per_worker
self.metrics.total_ram_before_mb = total
self.metrics.average_ram_before_mb = average
logger.info(
f"RAM vor Transformation: {self.metrics.total_ram_before_mb:.1f} MB "
f"(Durchschnitt: {self.metrics.average_ram_before_mb:.1f} MB/Worker)"
)
def capture_ram_after_transform(self):
"""Erfasst RAM-Verbrauch nach allen Transformationen."""
total, average, per_worker = self.measure_ram_usage()
self.metrics.ram_after_transform_mb_per_worker = per_worker
self.metrics.total_ram_after_mb = total
self.metrics.average_ram_after_mb = average
logger.info(
f"RAM nach Transformation: {self.metrics.total_ram_after_mb:.1f} MB "
f"(Durchschnitt: {self.metrics.average_ram_after_mb:.1f} MB/Worker)"
)
def shutdown(self):
"""Beendet alle Worker-Prozesse sauber."""
logger.info("Beende Saxon-S9Api-Worker-Pool...")
for i, worker in enumerate(self.workers):
try:
# Sende EXIT-Befehl
if worker.stdin and not worker.stdin.closed:
worker.stdin.write("EXIT\n")
worker.stdin.flush()
# Warte auf Beendigung (max 2 Sekunden)
worker.wait(timeout=2)
logger.debug(f"S9Api Worker {i} beendet")
except subprocess.TimeoutExpired:
# Force kill falls nötig
worker.kill()
logger.warning(f"S9Api Worker {i} musste gekillt werden")
except Exception as e:
logger.error(f"Fehler beim Beenden von S9Api Worker {i}: {e}")
# Lösche temporäres Verzeichnis
if self.temp_dir and self.temp_dir.exists():
try:
import shutil
shutil.rmtree(self.temp_dir)
logger.debug(f"Temporäres Verzeichnis gelöscht: {self.temp_dir}")
except Exception as e:
logger.warning(f"Konnte temporäres Verzeichnis nicht löschen: {e}")
logger.info("Saxon-S9Api-Worker-Pool beendet")
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
self.shutdown()