diff --git a/.idea/easycode.ignore b/.idea/easycode.ignore new file mode 100644 index 0000000..92e0c37 --- /dev/null +++ b/.idea/easycode.ignore @@ -0,0 +1,13 @@ +.idea +.vscode +node_modules/ +dist/ +vendor/ +cache/ +.*/ +*.min.* +*.test.* +*.spec.* +*.bundle.* +*.bundle-min.* +*.log \ No newline at end of file diff --git a/.idea/easycode/codebase-v2.xml b/.idea/easycode/codebase-v2.xml new file mode 100644 index 0000000..543dfc7 --- /dev/null +++ b/.idea/easycode/codebase-v2.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index e122dea..ee700a1 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -8,7 +8,7 @@ - + \ No newline at end of file diff --git a/.idea/uiDesigner.xml b/.idea/uiDesigner.xml new file mode 100644 index 0000000..2b63946 --- /dev/null +++ b/.idea/uiDesigner.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/archion.db b/archion.db index 2fa40a1..94f22ec 100644 Binary files a/archion.db and b/archion.db differ diff --git a/pom.xml b/pom.xml index 14cfb80..21b8363 100644 --- a/pom.xml +++ b/pom.xml @@ -9,8 +9,8 @@ 1.0-SNAPSHOT - 23 - 23 + 21 + 21 UTF-8 @@ -18,7 +18,12 @@ org.seleniumhq.selenium selenium-java - 4.21.0 + 4.15.0 + + + com.google.guava + guava + 31.1-jre io.github.bonigarcia @@ -41,6 +46,16 @@ 5.10.0 test + + net.lightbody.bmp + browsermob-core + 2.1.5 + + + org.seleniumhq.selenium + selenium-java + 4.15.0 + \ No newline at end of file diff --git a/src/main/java/de/roko/genalogy/downloader/ImageDownloader.java b/src/main/java/de/roko/genalogy/downloader/ImageDownloader.java new file mode 100644 index 0000000..6ef7ee6 --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/ImageDownloader.java @@ -0,0 +1,32 @@ +package 
de.roko.genalogy.downloader; + +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.nio.file.*; + +public class ImageDownloader { + + public static void main(String[] args) { + String imageUrl = "https://www.archion.de/typo3conf/ext/archion_sitepackage/Resources/Public/Images/logo_quer_weiss.svg"; + String targetDirectory = System.getProperty("user.home") + "/Pictures"; + + try { + downloadImage(imageUrl, targetDirectory); + } catch (IOException e) { + System.err.println("❌ Fehler beim Herunterladen: " + e.getMessage()); + } + } + + public static void downloadImage(String imageUrl, String targetDirPath) throws IOException { + URL url = new URL(imageUrl); + String fileName = Paths.get(url.getPath()).getFileName().toString(); // Datei extrahieren + Path targetPath = Paths.get(targetDirPath, fileName); + + try (InputStream in = url.openStream()) { + Files.createDirectories(Paths.get(targetDirPath)); // Sicherstellen, dass Verzeichnis existiert + Files.copy(in, targetPath, StandardCopyOption.REPLACE_EXISTING); + System.out.println("✅ Bild gespeichert unter: " + targetPath); + } + } +} diff --git a/src/main/java/de/roko/genalogy/downloader/Main.java b/src/main/java/de/roko/genalogy/downloader/Main.java new file mode 100644 index 0000000..8c8406f --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/Main.java @@ -0,0 +1,81 @@ +package de.roko.genalogy.downloader; + +import de.roko.genalogy.downloader.archion.ArchionLoginHelper; +import de.roko.genalogy.downloader.database.*; +import io.github.bonigarcia.wdm.WebDriverManager; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; + +import java.io.File; + + + +public class Main { + + public static ChromeDriver driver; + + public static void main(String[] args) throws Exception { + + String username = "robatkoch"; + String password = "PaLiNa2016$$"; + + String userHome = System.getProperty("user.home"); + String downloadFolder = 
userHome + "/Pictures/archion"; + new File(downloadFolder).mkdirs(); + + // ChromeDriver automatisch verwalten + WebDriverManager.chromedriver().setup(); + + + ChromeOptions options = new ChromeOptions(); + options.addArguments("--remote-allow-origins=*"); + + // WebDriver starten + driver = new ChromeDriver(options); + + ArchionLoginHelper archionLoginHelper = new ArchionLoginHelper(driver); + + archionLoginHelper.login(username, password); + + if(archionLoginHelper.isLoggedIn()) { + + + DatenbankReset.reset(); + DatenbankDebugger datenbankDebuggerdebugger = new DatenbankDebugger(); + + //Abgleich + archiveAuslesen(); + datenbankDebuggerdebugger.printTable("archive"); + + kirchenkreiseAuslesen(); + datenbankDebuggerdebugger.printTable("kreis"); + + orteAuslesen(); + + + + + System.out.println("break"); + } + } + + public static void archiveAuslesen() { + //Lese Archive + ArchiveInserter archiveInserter = new ArchiveInserter(); + try { + archiveInserter.run(driver); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static void kirchenkreiseAuslesen() throws Exception { + KirchenkreisExtractor kirchenkreisExtractor = new KirchenkreisExtractor(); + kirchenkreisExtractor.readNwrite(); + } + + public static void orteAuslesen() throws Exception { + KreisOrtExtractor kreisOrtExtractor = new KreisOrtExtractor(); + kreisOrtExtractor.readNwrite(); + } +} \ No newline at end of file diff --git a/src/main/java/de/roko/genalogy/downloader/database/ArchionDatabaseSetup.java b/src/main/java/de/roko/genalogy/downloader/database/ArchionDatabaseSetup.java index 8f1500e..ac8d53c 100644 --- a/src/main/java/de/roko/genalogy/downloader/database/ArchionDatabaseSetup.java +++ b/src/main/java/de/roko/genalogy/downloader/database/ArchionDatabaseSetup.java @@ -35,23 +35,35 @@ public class ArchionDatabaseSetup { stmt.execute(""" CREATE TABLE IF NOT EXISTS ort ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - kreis_id INTEGER, -- NULL, wenn Ort direkt unter Archiv hängt 
- archiv_id INTEGER NOT NULL, - name TEXT NOT NULL, - link TEXT NOT NULL, - UNIQUE(kreis_id, archiv_id, name) -- ← Diese Kombination muss zum ON CONFLICT passen - ); + id INTEGER PRIMARY KEY AUTOINCREMENT, + kreis_id INTEGER, + archiv_id INTEGER NOT NULL, + name TEXT NOT NULL, + link TEXT NOT NULL, + UNIQUE(archiv_id, name), + FOREIGN KEY (kreis_id) REFERENCES kreis(id), + FOREIGN KEY (archiv_id) REFERENCES archive(id) + ); """); stmt.execute(""" CREATE TABLE IF NOT EXISTS buch ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - ort_id INTEGER NOT NULL, - titel TEXT NOT NULL, - zeitraum TEXT, - FOREIGN KEY (ort_id) REFERENCES ort(id) - ); + id INTEGER PRIMARY KEY AUTOINCREMENT, -- technische ID + + ort_id INTEGER NOT NULL, -- Bezug zum Ort + titel TEXT NOT NULL, -- Titel des Buchs + typ TEXT, -- Symboltyp (z. B. taufbuch, beerdigungsbuch) + zeitraum TEXT, -- z. B. 1700–1750 + enthaelt TEXT, -- Zusatzangaben, z. B. auch Konfirmationen + anmerkung TEXT, -- redaktionelle Hinweise + signatur TEXT, -- Signatur lokal + archivname TEXT, -- Archivname aus Detailseite + link TEXT NOT NULL, -- Detail-Link auf Archion + viewer_link TEXT, -- direkter Link zum Viewer + + FOREIGN KEY (ort_id) REFERENCES ort(id), + UNIQUE (ort_id, titel) -- wichtig für conflict-handling + ); """); stmt.execute(""" diff --git a/src/main/java/de/roko/genalogy/downloader/database/ArchivStrukturParser.java b/src/main/java/de/roko/genalogy/downloader/database/ArchivStrukturParser.java index 1dce677..96d5b84 100644 --- a/src/main/java/de/roko/genalogy/downloader/database/ArchivStrukturParser.java +++ b/src/main/java/de/roko/genalogy/downloader/database/ArchivStrukturParser.java @@ -44,7 +44,25 @@ public class ArchivStrukturParser { String name = a.text().trim(); String href = a.absUrl("href").trim(); - if (name.toLowerCase().contains("kirchenkreis") || name.toLowerCase().contains("dekanat")) { + if (name.toLowerCase().contains("kirchenkreis") + || name.toLowerCase().contains("dekanat") + || 
name.toLowerCase().contains("juden") + || name.toLowerCase().contains("mennoiten") + || name.toLowerCase().contains("militärseelsorge") + || name.toLowerCase().contains("reformierte kirche") + || name.toLowerCase().contains("auslandsgemeinde") + || name.toLowerCase().contains("thüringen") + || name.toLowerCase().contains("israeliten") + || name.toLowerCase().contains("krankenhausseelsorge") + || name.toLowerCase().contains("kreis") + || name.toLowerCase().contains("reformierter kirchenkreis") + || name.toLowerCase().contains("sonderbestände") + || name.toLowerCase().contains("allgemeine hilfsmittel") + || name.toLowerCase().contains("allgemeines ortschaftsverzeichnis") + || name.toLowerCase().contains("kirchenbezirk") + || name.toLowerCase().contains("hinterpommern") + || name.toLowerCase().contains("militärkirchenbücher") + ){ kreise.add(new Kirchenkreis(archivId, name, href)); } else { orte.add(new Ort(null, archivId, name, href)); diff --git a/src/main/java/de/roko/genalogy/downloader/database/DatenbankDebugger.java b/src/main/java/de/roko/genalogy/downloader/database/DatenbankDebugger.java new file mode 100644 index 0000000..4f920c1 --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/database/DatenbankDebugger.java @@ -0,0 +1,43 @@ +package de.roko.genalogy.downloader.database; + +import java.sql.*; + +public class DatenbankDebugger { + + private static final String DB = "archion.db"; + + + public DatenbankDebugger() { + + } + + public void printTable(String tableName) throws Exception { + System.out.println("\n📋 Inhalt der Tabelle: " + tableName); + + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB); + Statement stmt = conn.createStatement(); + ResultSet rs = stmt.executeQuery("SELECT * FROM " + tableName)) { + + ResultSetMetaData meta = rs.getMetaData(); + int columnCount = meta.getColumnCount(); + + // Spaltenüberschriften ausgeben + for (int i = 1; i <= columnCount; i++) { + System.out.print(meta.getColumnName(i) + "\t"); + 
} + System.out.println("\n" + "-".repeat(60)); + + // Zeilen ausgeben + while (rs.next()) { + for (int i = 1; i <= columnCount; i++) { + Object val = rs.getObject(i); + System.out.print((val != null ? val.toString() : "NULL") + "\t"); + } + System.out.println(); + } + + } catch (SQLException e) { + System.err.println("⚠️ Fehler beim Zugriff auf Tabelle '" + tableName + "': " + e.getMessage()); + } + } +} diff --git a/src/main/java/de/roko/genalogy/downloader/database/DatenbankReset.java b/src/main/java/de/roko/genalogy/downloader/database/DatenbankReset.java new file mode 100644 index 0000000..8927aa2 --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/database/DatenbankReset.java @@ -0,0 +1,29 @@ +package de.roko.genalogy.downloader.database; + +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.Statement; + +public class DatenbankReset { + + private static final String DB = "archion.db"; + + public static void main(String[] args) throws Exception { + reset(); + System.out.println("✅ Datenbank erfolgreich geleert."); + } + + public static void reset() throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB); + Statement stmt = conn.createStatement()) { + + // Reihenfolge beachten wegen FK-Beziehungen (falls vorhanden) + stmt.executeUpdate("DELETE FROM ort"); + stmt.executeUpdate("DELETE FROM kreis"); + stmt.executeUpdate("DELETE FROM archive"); + + // Optional: IDs zurücksetzen (nur nötig bei AUTOINCREMENT-Reset) + stmt.executeUpdate("DELETE FROM sqlite_sequence WHERE name IN ('archive', 'kreis', 'ort')"); + } + } +} diff --git a/src/main/java/de/roko/genalogy/downloader/database/DokumentExtractor.java b/src/main/java/de/roko/genalogy/downloader/database/DokumentExtractor.java new file mode 100644 index 0000000..a795483 --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/database/DokumentExtractor.java @@ -0,0 +1,367 @@ +package de.roko.genalogy.downloader.database; + +import 
org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.File; +import java.sql.*; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; + +public class DokumentExtractor { + + private static final String DB = "/Users/robertkoch/dev/arch/archion.db"; + + public void run() throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = "SELECT id, name, link FROM ort ORDER BY id"; + try (PreparedStatement stmt = conn.prepareStatement(sql); + ResultSet rs = stmt.executeQuery()) { + + while (rs.next()) { + int ortId = rs.getInt("id"); + String ortName = rs.getString("name"); + String ortLink = rs.getString("link"); + + System.out.println("\n📘 Lese Bücher für Ort: " + ortName); + List buecher = extractBuecher(ortId, ortLink); + saveBuecher(buecher, conn); + System.out.println("→ " + buecher.size() + " Bücher gespeichert."); + } + } + } + } + + public void run(int id) throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + System.out.println("📂 Aktive DB-Datei: " + new java.io.File(DB).getAbsolutePath()); + + try (Statement stmt = conn.createStatement()) { + ResultSet rs = stmt.executeQuery("PRAGMA table_info(buch)"); + System.out.println("📋 Spalten in 'buch':"); + while (rs.next()) { + System.out.println(" - " + rs.getString("name")); + } + + rs = stmt.executeQuery("PRAGMA index_list('buch')"); + System.out.println("📊 Indizes auf 'buch':"); + while (rs.next()) { + System.out.println(" - " + rs.getString("name") + ", unique: " + rs.getBoolean("unique")); + } + } + + String sql = "SELECT id, name, link FROM ort WHERE id = " + id; + try (PreparedStatement stmt = conn.prepareStatement(sql); + ResultSet rs = stmt.executeQuery()) { + + while (rs.next()) { + int ortId = rs.getInt("id"); + String ortName = rs.getString("name"); + String ortLink = rs.getString("link"); + + 
System.out.println("\n📘 Lese Bücher für Ort: " + ortName); + System.out.println("Link: " + ortLink); + List buecher = extractBuecher(ortId, ortLink); + saveBuecher(buecher, conn); + System.out.println("→ " + buecher.size() + " Bücher gespeichert."); + } + } + } + } + + private List extractBuecher(int ortId, String ortLink) { + List list = new ArrayList<>(); + try { + Document doc = Jsoup.connect(ortLink).get(); + Elements buchEintraege = doc.select("#archive-nav li.item"); + + for (Element li : buchEintraege) { + Element a = li.selectFirst("a"); + if (a == null) continue; + + + Element span = a.selectFirst("span"); + if (span == null) { // 🔧 NEU: Schutz gegen fehlendes + System.err.println("⚠️ Kein in: " + a); + continue; + } + + String titel = span.text().trim(); + String link = a.absUrl("href").trim(); + + String typ = null; + Element img = a.selectFirst("img"); + + if (img != null) { + String src = img.attr("src"); + int lastSlash = src.lastIndexOf('/'); + int dot = src.lastIndexOf('.'); + if (lastSlash != -1 && dot != -1 && dot > lastSlash) { + typ = src.substring(lastSlash + 1, dot); + } + } + + Buch buch = extractBuchDetails(ortId, titel, link, typ); + list.add(buch); + } + + } catch (Exception e) { + System.err.println("⚠️ Fehler beim Ort-Link " + ortLink + ": " + e.getMessage()); + } + return list; + } + + private Buch extractBuchDetails(int ortId, String titel, String link, String typ) { + String zeitraum = null, enthaelt = null, anmerkung = null, signatur = null, archivname = null, viewerLink = null; + + try { + Document detailDoc = Jsoup.connect(link).get(); + + Element viewerEl = detailDoc.selectFirst("#steckbrief a[href*=\"/viewer/\"]"); + if (viewerEl != null) { + viewerLink = viewerEl.absUrl("href"); + } + + Elements dl = detailDoc.select("#steckbrief dl"); + for (Element dt : dl.select("dt")) { + String label = dt.text().trim().toLowerCase(); + Element dd = dt.nextElementSibling(); + if (dd == null) continue; + String value = dd.text().trim(); + + 
switch (label) { + case "zeitraum" -> zeitraum = value; + case "enthält auch" -> enthaelt = value; + case "anmerkung" -> anmerkung = value; + case "signatur lokal" -> signatur = value; + case "archiv" -> archivname = value; + } + } + + } catch (Exception e) { + System.err.println("⚠️ Fehler beim Detail-Link " + link + ": " + e.getMessage()); + } + + return new Buch(ortId, titel, link, typ, zeitraum, enthaelt, anmerkung, signatur, archivname, viewerLink); + } + + private void saveBuecher(List list, Connection conn) throws SQLException { + + if (list.isEmpty()) return; + + List vorhandeneTitel = getGespeicherteTitel(list.get(0).ortId(), conn); // 🔧 NEU + List neu = list.stream() + .filter(b -> !vorhandeneTitel.contains(b.titel())) // 🔧 NEU: Duplikate überspringen + .toList(); + + System.out.println("🧮 " + (list.size() - neu.size()) + " Bücher existieren bereits und werden übersprungen."); // 🔧 NEU + System.out.println("✅ " + neu.size() + " neue Bücher werden gespeichert."); // 🔧 NEU + + if (neu.isEmpty()) return; + + + String sql = """ + INSERT INTO buch ( + ort_id, titel, typ, zeitraum, enthaelt, + anmerkung, signatur, archivname, link, viewer_link + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(ort_id, titel) + DO UPDATE SET + typ = excluded.typ, + zeitraum = excluded.zeitraum, + enthaelt = excluded.enthaelt, + anmerkung = excluded.anmerkung, + signatur = excluded.signatur, + archivname = excluded.archivname, + link = excluded.link, + viewer_link = excluded.viewer_link; + """; + + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + for (Buch b : list) { + stmt.setInt(1, b.ortId()); + stmt.setString(2, b.titel()); + stmt.setString(3, b.typ()); + stmt.setString(4, b.zeitraum()); + stmt.setString(5, b.enthaelt()); + stmt.setString(6, b.anmerkung()); + stmt.setString(7, b.signatur()); + stmt.setString(8, b.archivname()); + stmt.setString(9, b.link()); + stmt.setString(10, b.viewerLink()); + stmt.addBatch(); + } + stmt.executeBatch(); + } + } + + private List getGespeicherteTitel(int ortId, Connection conn) throws SQLException { + List vorhandeneTitel = new ArrayList<>(); + String sql = "SELECT titel FROM buch WHERE ort_id = ?"; + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + stmt.setInt(1, ortId); + try (ResultSet rs = stmt.executeQuery()) { + while (rs.next()) { + vorhandeneTitel.add(rs.getString("titel")); + } + } + } + return vorhandeneTitel; + } + + public void runForArchiv(String archivName) throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = """ + SELECT ort.id, ort.name, ort.link + FROM ort + JOIN kreis ON ort.kreis_id = kreis.id + JOIN archive ON kreis.archiv_id = archive.id + WHERE archive.name = ? 
+ ORDER BY ort.id + """; + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + stmt.setString(1, archivName); + runForOrtResultSet(stmt.executeQuery(), conn, archivName); + } + } + } + + + public void runForArchiv(int archivId) throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = """ + SELECT ort.id, ort.name, ort.link + FROM ort + JOIN kreis ON ort.kreis_id = kreis.id + WHERE kreis.archiv_id = ? + ORDER BY ort.id + """; + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + stmt.setInt(1, archivId); + runForOrtResultSet(stmt.executeQuery(), conn, "ID=" + archivId); + } + } + } + + public void runForBundesland(String bundesland) throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = """ + SELECT ort.id, ort.name, ort.link + FROM ort + JOIN kreis ON ort.kreis_id = kreis.id + JOIN archive ON kreis.archiv_id = archive.id + WHERE archive.bundesland = ? + ORDER BY ort.id + """; + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + stmt.setString(1, bundesland); + runForOrtResultSet(stmt.executeQuery(), conn, "Bundesland=" + bundesland); + } + } + } + + + // 🔧 Hilfsmethode zum Verarbeiten des ResultSets + private void runForOrtResultSet(ResultSet rs, Connection conn, String label) throws Exception { + int count = 0; + while (rs.next()) { + int ortId = rs.getInt("id"); + String ortName = rs.getString("name"); + String ortLink = rs.getString("link"); + + System.out.println("\n📘 Lese Bücher für Ort: " + ortName); + List buecher = extractBuecher(ortId, ortLink); + saveBuecher(buecher, conn); + System.out.println("→ " + buecher.size() + " Bücher gespeichert."); + count++; + } + + if (count == 0) { + System.out.println("⚠️ Keine Orte für Archiv '" + label + "' gefunden."); + } + } + + private File buildBildOrdnerPfad( + String baseDir, + String bundesland, + String archivname, + String kreisname, + String ort, + String buchTitel + ) { + 
// Hilfsfunktion für Dateisystem-sichere Namen + Function safe = s -> + s == null ? "unbekannt" : s.replaceAll("[^\\wäöüÄÖÜß\\-\\s]", "").trim(); + + List pfad = new ArrayList<>(); + pfad.add(baseDir); + pfad.add(safe.apply(bundesland)); + pfad.add(safe.apply(archivname)); + if (kreisname != null && !kreisname.isBlank()) { + pfad.add(safe.apply(kreisname)); + } + pfad.add(safe.apply(ort)); + pfad.add(safe.apply(buchTitel)); + + File ordner = new File(String.join(File.separator, pfad)); + if (!ordner.exists()) { + ordner.mkdirs(); + } + return ordner; + } + + public void erstelleBildOrdnerFuerAlleBuecher(String basisPfad) throws Exception { + try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = """ + SELECT + buch.id AS buch_id, + buch.titel AS buch_titel, + ort.name AS ort_name, + kreis.name AS kreis_name, + archive.name AS archiv_name, + archive.bundesland AS bundesland + FROM buch + JOIN ort ON buch.ort_id = ort.id + LEFT JOIN kreis ON ort.kreis_id = kreis.id + JOIN archive ON ort.archiv_id = archive.id + ORDER BY archive.bundesland, archive.name, kreis.name, ort.name, buch.titel + """; + + try (PreparedStatement stmt = conn.prepareStatement(sql); + ResultSet rs = stmt.executeQuery()) { + + int count = 0; + while (rs.next()) { + String buchTitel = rs.getString("buch_titel"); + String ort = rs.getString("ort_name"); + String kreis = rs.getString("kreis_name"); + String archiv = rs.getString("archiv_name"); + String bundesland = rs.getString("bundesland"); + + File ordner = buildBildOrdnerPfad(basisPfad, bundesland, archiv, kreis, ort, buchTitel); + System.out.println("📁 Ordner: " + ordner.getAbsolutePath()); + count++; + } + + System.out.println("✅ " + count + " Bildordner vorbereitet."); + } + } + } + + + public record Buch( + int ortId, + String titel, + String typ, + String zeitraum, + String enthaelt, + String anmerkung, + String signatur, + String archivname, + String link, + String viewerLink + ) {} +} diff --git 
a/src/main/java/de/roko/genalogy/downloader/database/KirchenkreisExtractor.java b/src/main/java/de/roko/genalogy/downloader/database/KirchenkreisExtractor.java index 3966cca..45d5355 100644 --- a/src/main/java/de/roko/genalogy/downloader/database/KirchenkreisExtractor.java +++ b/src/main/java/de/roko/genalogy/downloader/database/KirchenkreisExtractor.java @@ -15,7 +15,6 @@ public class KirchenkreisExtractor { public void readNwrite() throws Exception { try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { - String sql = "SELECT id, name, link FROM archive ORDER BY id"; try (PreparedStatement stmt = conn.prepareStatement(sql); ResultSet rs = stmt.executeQuery()) { @@ -24,33 +23,60 @@ public class KirchenkreisExtractor { int id = rs.getInt("id"); String name = rs.getString("name"); String link = rs.getString("link"); - System.out.println("\n🔍 Lade Kirchenkreise f\u00fcr Archiv: " + name); - List kreise = extractKirchenkreise(id, link); + System.out.println("\n🔍 Lade Struktur für Archiv: " + name); + + Document doc = Jsoup.connect(link).get(); + Elements items = doc.select("#archive-nav li.item a"); + + List kreise = new ArrayList<>(); + List orte = new ArrayList<>(); + + for (Element linkEl : items) { + String eintragName = linkEl.text().trim(); + String href = linkEl.absUrl("href").trim(); + if (eintragName.isEmpty() || href.isEmpty()) continue; + + if (isKirchenkreisebene(eintragName)) { + kreise.add(new Kirchenkreis(id, eintragName, href)); + } else { + orte.add(new Ort(id, null, eintragName, href)); + } + } + saveKirchenkreise(kreise, conn); + saveOrte(orte, conn); + System.out.println("→ " + kreise.size() + " Kirchenkreise gespeichert."); + System.out.println("→ " + orte.size() + " direkte Orte gespeichert."); } } } } - public static List extractKirchenkreise(int archivId, String url) { - List list = new ArrayList<>(); - try { - Document doc = Jsoup.connect(url).get(); - Elements items = doc.select("#archive-nav li.item a"); - - for (Element 
link : items) { - String name = link.text().trim(); - String href = link.absUrl("href").trim(); - if (!name.isEmpty() && !href.isEmpty()) { - list.add(new Kirchenkreis(archivId, name, href)); - } - } - } catch (Exception e) { - System.err.println("⚠️ Fehler bei URL " + url + ": " + e.getMessage()); - } - return list; + private boolean isKirchenkreisebene(String name) { + String n = name.toLowerCase(); + return n.contains("kirchenkreis") || + n.contains("dekanat") || + n.contains("juden") || + n.contains("mennoiten") || + n.contains("militärseelsorge") || + n.contains("reformierte kirche") || + n.contains("auslandsgemeinde") || + n.contains("thüringen") || + n.contains("israeliten") || + n.contains("krankenhausseelsorge") || + n.contains("kreis") || + n.contains("reformierter kirchenkreis") || + n.contains("sonderbestände") || + n.contains("allgemeine hilfsmittel") || + n.contains("allgemeines ortschaftsverzeichnis") || + n.contains("kirchenbezirk") || + n.contains("hinterpommern") || + n.contains("Kirchenbücher der Garnisonen und Militärgemeinden") || + n.contains("Regimentskirchenbücher") || + n.contains("Zivilregister") || + n.contains("militärkirchenbücher"); } public static void saveKirchenkreise(List list, Connection conn) throws SQLException { @@ -72,6 +98,30 @@ public class KirchenkreisExtractor { } } - public record Kirchenkreis(int archivId, String name, String link) {} -} + public static void saveOrte(List list, Connection conn) throws SQLException { + String sql = """ + INSERT INTO ort (archiv_id, kreis_id, name, link) + VALUES (?, ?, ?, ?) 
+ ON CONFLICT(archiv_id, name) + DO UPDATE SET link = excluded.link; + """; + try (PreparedStatement stmt = conn.prepareStatement(sql)) { + for (Ort o : list) { + stmt.setInt(1, o.archivId()); + if (o.kreisId() != null) + stmt.setInt(2, o.kreisId()); + else + stmt.setNull(2, Types.INTEGER); + stmt.setString(3, o.name()); + stmt.setString(4, o.link()); + stmt.addBatch(); + } + stmt.executeBatch(); + } + } + + public record Kirchenkreis(int archivId, String name, String link) {} + + public record Ort(int archivId, Integer kreisId, String name, String link) {} +} diff --git a/src/main/java/de/roko/genalogy/downloader/database/KreisOrtExtractor.java b/src/main/java/de/roko/genalogy/downloader/database/KreisOrtExtractor.java index e574e8c..4ece144 100644 --- a/src/main/java/de/roko/genalogy/downloader/database/KreisOrtExtractor.java +++ b/src/main/java/de/roko/genalogy/downloader/database/KreisOrtExtractor.java @@ -15,6 +15,7 @@ public class KreisOrtExtractor { public void readNwrite() throws Exception { try (Connection conn = DriverManager.getConnection("jdbc:sqlite:" + DB)) { + String sql = "SELECT id, archiv_id, name, link FROM kreis ORDER BY id"; try (PreparedStatement stmt = conn.prepareStatement(sql); ResultSet rs = stmt.executeQuery()) { @@ -25,45 +26,47 @@ public class KreisOrtExtractor { String name = rs.getString("name"); String link = rs.getString("link"); - System.out.println("\n🔍 Lade Orte für Kirchenkreis/Dekanat: " + name); - List orte = extractOrte(kreisId, archivId, link); + System.out.println("\n🔎 Lade Orte unter Kirchenkreis: " + name); + + List orte = extractOrte(archivId, kreisId, link); saveOrte(orte, conn); + System.out.println("→ " + orte.size() + " Orte gespeichert."); } } } } - private static List extractOrte(int kreisId, int archivId, String url) { - List list = new ArrayList<>(); + private List extractOrte(int archivId, int kreisId, String url) { + List orte = new ArrayList<>(); try { Document doc = Jsoup.connect(url).get(); - Elements items 
= doc.select(".list li a"); - for (Element link : items) { - String name = link.text().trim(); - String href = link.absUrl("href").trim(); + Elements items = doc.select("#archive-nav li.item a"); + + for (Element linkEl : items) { + String name = linkEl.text().trim(); + String href = linkEl.absUrl("href").trim(); if (!name.isEmpty() && !href.isEmpty()) { - list.add(new Ort(kreisId, archivId, name, href)); + orte.add(new Ort(archivId, kreisId, name, href)); } } } catch (Exception e) { System.err.println("⚠️ Fehler bei URL " + url + ": " + e.getMessage()); } - return list; + return orte; } - private static void saveOrte(List list, Connection conn) throws SQLException { + private void saveOrte(List list, Connection conn) throws SQLException { String sql = """ - INSERT INTO ort (kreis_id, archiv_id, name, link) - VALUES (?, ?, ?, ?) - ON CONFLICT(kreis_id, archiv_id, name) - DO UPDATE SET link = excluded.link; - """; - + INSERT INTO ort (archiv_id, kreis_id, name, link) + VALUES (?, ?, ?, ?) 
+ ON CONFLICT(archiv_id, name) + DO UPDATE SET link = excluded.link, kreis_id = excluded.kreis_id; + """; try (PreparedStatement stmt = conn.prepareStatement(sql)) { for (Ort o : list) { - stmt.setInt(1, o.kreisId()); - stmt.setInt(2, o.archivId()); + stmt.setInt(1, o.archivId()); + stmt.setInt(2, o.kreisId()); stmt.setString(3, o.name()); stmt.setString(4, o.link()); stmt.addBatch(); @@ -72,6 +75,5 @@ public class KreisOrtExtractor { } } - - public record Ort(int kreisId, int archivId, String name, String link) {} + public record Ort(int archivId, int kreisId, String name, String link) {} } diff --git a/src/main/java/de/roko/genalogy/downloader/viewer/ViewerBildParser.java b/src/main/java/de/roko/genalogy/downloader/viewer/ViewerBildParser.java new file mode 100644 index 0000000..20972e5 --- /dev/null +++ b/src/main/java/de/roko/genalogy/downloader/viewer/ViewerBildParser.java @@ -0,0 +1,71 @@ + +package de.roko.genalogy.downloader.viewer; + +import org.openqa.selenium.*; +import org.openqa.selenium.support.ui.ExpectedConditions; +import org.openqa.selenium.support.ui.WebDriverWait; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +public class ViewerBildParser { + + public record BildInfo(int seite, String bildUrl) {} + + public String resolveViewerRedirect(WebDriver driver, String viewerUrl) { + driver.get(viewerUrl); + + try { + Thread.sleep(1000); + + boolean keinZugang = driver.getPageSource().contains("Kein Zugang zum Viewer"); + + if (keinZugang) { + System.out.println("🔐 Kein Zugang – Weiterleitung wird ausgelöst ..."); + + WebElement loginButton = new WebDriverWait(driver, Duration.ofSeconds(5)) + .until(ExpectedConditions.elementToBeClickable( + By.cssSelector("a.btn.btn-primary[href*='/de/login']"))); + + loginButton.click(); + + new WebDriverWait(driver, Duration.ofSeconds(10)) + .until(d -> d.getCurrentUrl().contains("/viewer/churchRegister/")); + + String redirectedUrl = driver.getCurrentUrl(); + 
System.out.println("✅ Weitergeleitet zum Viewer: " + redirectedUrl); + return redirectedUrl; + } + + System.out.println("✅ Direktzugriff auf Viewer ohne Zwischenseite."); + return driver.getCurrentUrl(); + + } catch (Exception e) { + System.err.println("❌ Fehler bei der Weiterleitung: " + e.getMessage()); + return viewerUrl; + } + } + + public List extractBildUrlsWithSelenium(WebDriver driver) { + List bilder = new ArrayList<>(); + try { + List seiten = driver.findElements(By.cssSelector(".dvpages .dvpage img[data-src]")); + + int seiteNr = 1; + for (WebElement img : seiten) { + String relativeUrl = img.getAttribute("data-src"); + if (relativeUrl == null || relativeUrl.isBlank()) continue; + + String fullUrl = "https://www.archion.de" + relativeUrl.split("\\?")[0]; + bilder.add(new BildInfo(seiteNr++, fullUrl)); + } + + System.out.println("✅ " + bilder.size() + " Bild-URLs extrahiert."); + } catch (Exception e) { + System.err.println("❌ Fehler beim Extrahieren der Bilder: " + e.getMessage()); + } + + return bilder; + } +} diff --git a/src/test/java/de/roko/genalogy/downloader/database/DatenbankDebuggerTest.java b/src/test/java/de/roko/genalogy/downloader/database/DatenbankDebuggerTest.java new file mode 100644 index 0000000..edc9797 --- /dev/null +++ b/src/test/java/de/roko/genalogy/downloader/database/DatenbankDebuggerTest.java @@ -0,0 +1,17 @@ +package de.roko.genalogy.downloader.database; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class DatenbankDebuggerTest { + + + + @Test + void printTable() throws Exception { + DatenbankDebugger dbdebugger = new DatenbankDebugger(); + //dbdebugger.printTable("ort"); + dbdebugger.printTable("buch"); + } +} \ No newline at end of file diff --git a/src/test/java/de/roko/genalogy/downloader/database/DokumentExtractorTest.java b/src/test/java/de/roko/genalogy/downloader/database/DokumentExtractorTest.java new file mode 100644 index 0000000..e816721 --- /dev/null +++ 
b/src/test/java/de/roko/genalogy/downloader/database/DokumentExtractorTest.java
@@ -0,0 +1,88 @@
+package de.roko.genalogy.downloader.database;
+
+import de.roko.genalogy.downloader.archion.ArchionLoginHelper;
+import org.junit.jupiter.api.*;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+import java.io.File;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Browser-driven tests for {@link DokumentExtractor}.
+ *
+ * Every test gets its own Chrome session that is logged in via
+ * {@link ArchionLoginHelper} beforehand and closed afterwards. Credentials come
+ * from the environment variables ARCHION_USERNAME and ARCHION_PASSWORD so that
+ * no secret is stored in version control; tests are skipped when they are unset.
+ */
+class DokumentExtractorTest {
+
+    public String username = System.getenv("ARCHION_USERNAME");
+    public String password = System.getenv("ARCHION_PASSWORD");
+
+    public ChromeDriver driver;
+
+    /** Creates the folder {@code ~/Pictures/archion} once before all tests. */
+    @BeforeAll
+    static void setUp() {
+        String userHome = System.getProperty("user.home");
+        String downloadFolder = userHome + "/Pictures/archion";
+        new File(downloadFolder).mkdirs();
+    }
+
+    /** Starts Chrome and logs in; the test is skipped when no credentials are configured. */
+    @BeforeEach
+    void login() throws InterruptedException {
+        Assumptions.assumeTrue(username != null && password != null,
+                "ARCHION_USERNAME / ARCHION_PASSWORD not set");
+
+        ChromeOptions options = new ChromeOptions();
+        options.addArguments("--remote-allow-origins=*");
+
+        // Start the WebDriver.
+        driver = new ChromeDriver(options);
+
+        ArchionLoginHelper archionLoginHelper = new ArchionLoginHelper(driver);
+        archionLoginHelper.login(username, password);
+        //assertFalse(archionLoginHelper.isLoggedIn(), "Login nicht möglich");
+    }
+
+    /** Calls {@code DokumentExtractor.run()}. */
+    @Test
+    void run() throws Exception {
+        // No local try/finally: quit() below closes the browser after every test.
+        DokumentExtractor dokumentExtractor = new DokumentExtractor();
+        dokumentExtractor.run();
+    }
+
+    /** Calls runForArchiv for one archive name, then erstelleBildOrdnerFuerAlleBuecher. */
+    @Test
+    void runForArchiv() throws Exception {
+        //new DokumentExtractor().runForArchiv("Landeskirchenarchiv der Evangelischen Kirche Mitteldeutschland/Eisenach");
+        //new DokumentExtractor().runForArchiv("Landeskirchliches Archiv der Evangelisch-Lutherischen Kirche in Norddeutschland");
+        //new DokumentExtractor().runForArchiv("Archiv der Evangelischen Landeskirche Anhalts");
+        //new DokumentExtractor().runForArchiv("Landeskirchenarchiv der Evangelischen Kirche Mitteldeutschland/Magdeburg");
+        new DokumentExtractor().runForArchiv("Landeskirchliches Archiv der Evangelisch-Lutherischen Landeskirche Sachsens");
+        new DokumentExtractor().erstelleBildOrdnerFuerAlleBuecher("/Users/robertkoch/archion_bilder");
+    }
+
+    @Test
+    void erstelleBildOrdnerFuerAlleBuecher() throws Exception {
+        new DokumentExtractor().erstelleBildOrdnerFuerAlleBuecher("/Users/robertkoch/archion_bilder");
+    }
+
+    @Test
+    void runForBundesland() throws Exception {
+        new DokumentExtractor().runForBundesland("Thüringen");
+    }
+
+    /** Closes the browser if login() got as far as starting one. */
+    @AfterEach
+    void quit() {
+        if (driver != null) {
+            driver.quit();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/de/roko/genalogy/downloader/database/KreisOrtExtractorTest.java b/src/test/java/de/roko/genalogy/downloader/database/KreisOrtExtractorTest.java
new file mode 100644
index 0000000..6543f2c
--- /dev/null
+++ b/src/test/java/de/roko/genalogy/downloader/database/KreisOrtExtractorTest.java
@@ -0,0 +1,73 @@
+package de.roko.genalogy.downloader.database;
+
+import de.roko.genalogy.downloader.archion.ArchionLoginHelper;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+import java.io.File;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Browser-driven test scaffold for {@link KreisOrtExtractor}.
+ *
+ * Credentials come from the environment variables ARCHION_USERNAME and
+ * ARCHION_PASSWORD so that no secret is stored in version control; tests are
+ * skipped when they are unset.
+ */
+class KreisOrtExtractorTest {
+
+    public String username = System.getenv("ARCHION_USERNAME");
+    public String password = System.getenv("ARCHION_PASSWORD");
+
+    public ChromeDriver driver;
+
+    /** Creates the folder {@code ~/Pictures/archion} once before all tests. */
+    @BeforeAll
+    static void setUp() {
+        String userHome = System.getProperty("user.home");
+        String downloadFolder = userHome + "/Pictures/archion";
+        new File(downloadFolder).mkdirs();
+    }
+
+    /** Starts Chrome, logs in and fails the test if the helper does not report a session. */
+    @BeforeEach
+    void login() throws InterruptedException {
+        Assumptions.assumeTrue(username != null && password != null,
+                "ARCHION_USERNAME / ARCHION_PASSWORD not set");
+
+        ChromeOptions options = new ChromeOptions();
+        options.addArguments("--remote-allow-origins=*");
+
+        // Start the WebDriver.
+        driver = new ChromeDriver(options);
+
+        ArchionLoginHelper archionLoginHelper = new ArchionLoginHelper(driver);
+        archionLoginHelper.login(username, password);
+        // Was assertFalse, which failed exactly when the login had worked.
+        // NOTE(review): assumes isLoggedIn() returns true after a successful login - confirm.
+        assertTrue(archionLoginHelper.isLoggedIn(), "Login nicht möglich");
+    }
+
+    /** Currently only constructs the extractor; the read() call itself is commented out. */
+    @Test
+    void read() throws Exception {
+        KreisOrtExtractor kreisOrtExtractor = new KreisOrtExtractor();
+        //kreisOrtExtractor.read();
+    }
+
+    /** Closes the browser if login() got as far as starting one. */
+    @AfterEach
+    void quit() {
+        if (driver != null) {
+            driver.quit();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/de/roko/genalogy/downloader/viewer/ViewerBildParserTest.java b/src/test/java/de/roko/genalogy/downloader/viewer/ViewerBildParserTest.java
new file mode 100644
index 0000000..e317e23
--- /dev/null
+++ b/src/test/java/de/roko/genalogy/downloader/viewer/ViewerBildParserTest.java
@@ -0,0 +1,136 @@
+package de.roko.genalogy.downloader.viewer;
+
+import de.roko.genalogy.downloader.archion.ArchionLoginHelper;
+import net.lightbody.bmp.BrowserMobProxy;
+import net.lightbody.bmp.BrowserMobProxyServer;
+import net.lightbody.bmp.client.ClientUtil;
+import net.lightbody.bmp.core.har.HarEntry;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.openqa.selenium.*;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Opens a viewer page through a BrowserMob proxy and prints the image URLs
+ * found in the recorded HAR.
+ *
+ * Credentials come from the environment variables ARCHION_USERNAME and
+ * ARCHION_PASSWORD so that no secret is stored in version control.
+ */
+class ViewerBildParserTest {
+
+    public String username = System.getenv("ARCHION_USERNAME");
+    public String password = System.getenv("ARCHION_PASSWORD");
+
+    public ChromeDriver driver;
+
+    /** Creates the folder {@code ~/Pictures/archion} once before all tests. */
+    @BeforeAll
+    static void setUp() {
+        String userHome = System.getProperty("user.home");
+        String downloadFolder = userHome + "/Pictures/archion";
+        new File(downloadFolder).mkdirs();
+    }
+
+    /** Currently a no-op: the shared login is commented out and parse() starts its own browser. */
+    @BeforeEach
+    void login() throws InterruptedException {
+/*
+        ChromeOptions options = new ChromeOptions();
+        options.addArguments("--remote-allow-origins=*");
+
+        // Start the WebDriver.
+        driver = new ChromeDriver(options);
+
+        ArchionLoginHelper archionLoginHelper = new ArchionLoginHelper(driver);
+        archionLoginHelper.login(username, password);
+        //assertFalse(archionLoginHelper.isLoggedIn(), "Login nicht möglich");
+*/
+    }
+
+    /**
+     * Logs in, opens one viewer URL, clicks "next" up to ten times and prints every
+     * recorded request URL that contains "/si/" and ends with "/image.jpg".
+     *
+     * @throws InterruptedException if the thread is interrupted while pausing between pages
+     */
+    @Test
+    void parse() throws InterruptedException {
+        Assumptions.assumeTrue(username != null && password != null,
+                "ARCHION_USERNAME / ARCHION_PASSWORD not set");
+
+        String viewerUrl = "https://www.archion.de/de/viewer/churchRegister/287040?cHash=c61b3fc9f95353f6ba795fe0b90b3288";
+
+        // Start the proxy on a free port.
+        BrowserMobProxy proxy = new BrowserMobProxyServer();
+        proxy.start(0);
+
+        // Local browser; named differently from the (unused) field to avoid shadowing it.
+        WebDriver browser = null;
+        try {
+            // Hook the proxy into Selenium.
+            Proxy seleniumProxy = ClientUtil.createSeleniumProxy(proxy);
+            ChromeOptions options = new ChromeOptions();
+            options.setProxy(seleniumProxy);
+            options.addArguments("--start-maximized");
+
+            browser = new ChromeDriver(options);
+
+            // Start HAR recording.
+            proxy.newHar("archion-view");
+
+            // Open the viewer URL (must be logged in!).
+            ArchionLoginHelper archionLoginHelper = new ArchionLoginHelper(browser);
+            archionLoginHelper.login(username, password);
+            browser.get(viewerUrl);
+
+            // Wait and page through the viewer (optional).
+            for (int i = 0; i < 10; i++) {
+                // Sleep outside the try block so an interrupt propagates instead of being swallowed.
+                Thread.sleep(1500);
+                try {
+                    WebElement next = browser.findElement(By.cssSelector("a.dvnavnext"));
+                    if (!next.isDisplayed()) {
+                        break;
+                    }
+                    next.click();
+                } catch (WebDriverException e) {
+                    break; // end reached or error
+                }
+            }
+
+            // Read all image URLs from the recorded traffic.
+            List<String> imageUrls = new ArrayList<>();
+            for (HarEntry entry : proxy.getHar().getLog().getEntries()) {
+                String url = entry.getRequest().getUrl();
+                if (url.contains("/si/") && url.endsWith("/image.jpg")) {
+                    imageUrls.add(url);
+                }
+            }
+
+            // Output.
+            System.out.println("\n🔍 Gefundene Bild-URLs:");
+            for (int i = 0; i < imageUrls.size(); i++) {
+                System.out.printf("Bild %03d: %s%n", i + 1, imageUrls.get(i));
+            }
+        } finally {
+            // Always release the browser and the proxy, even when a step above throws.
+            try {
+                if (browser != null) {
+                    browser.quit();
+                }
+            } finally {
+                proxy.stop();
+            }
+        }
+    }
+}
\ No newline at end of file