diff --git a/pom.xml b/pom.xml
index 247c3aa..f34541e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -16,18 +16,29 @@
udk-pdf-scanner
+
+
+ central
+ https://repo.maven.apache.org/maven2
+
+
+
org.apache.pdfbox
pdfbox
- 2.0.29
+ 3.0.4
net.sourceforge.tess4j
tess4j
5.15.0
-
+
+
+
+
+
@@ -44,7 +55,7 @@
true
- ru.mcs.udk.PdfScanner
+ ru.mcs.udk.UdkPdfScanner
diff --git a/src/main/java/ru/mcs/udk/DocScanner.java b/src/main/java/ru/mcs/udk/DocScanner.java
new file mode 100644
index 0000000..9f49789
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DocScanner.java
@@ -0,0 +1,111 @@
+package ru.mcs.udk;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.*;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Locale;
+
+
+/**
+ * Command-line entry point that walks a directory tree and passes every PDF and DjVu
+ * file to the matching {@link UDKScanner}.
+ *
+ * <p>While the walk runs, {@code System.out} is redirected to {@code book-list.csv} in the
+ * working directory, so everything the scanners print to standard output lands in that file.
+ */
+public class DocScanner {
+    /** Report file that receives standard output for the duration of the walk. */
+    private static final String REPORT_FILE = "book-list.csv";
+
+    /**
+     * Scans the directory named by {@code args[0]}.
+     *
+     * @param args command-line arguments; the first one is the directory to scan
+     * @throws FileNotFoundException if the report file cannot be opened for writing
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        if (args.length == 0) {
+            System.out.println("Пожалуйста, укажите путь к папке.");
+            return;
+        }
+
+        Path startDir = Paths.get(args[0]);
+
+        // Files.isDirectory is already false for a missing path, so no separate exists() check.
+        if (!Files.isDirectory(startDir)) {
+            System.out.println("Указанный путь не существует или не является папкой.");
+            return;
+        }
+
+        PrintStream console = System.out;
+        PrintStream report = new PrintStream(new FileOutputStream(REPORT_FILE), true, StandardCharsets.UTF_8);
+        System.setOut(report);
+        try {
+            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+                    return scan(file);
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) {
+                    // An unreadable entry must not stop the walk; report it and move on.
+                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+        } catch (IOException e) {
+            e.printStackTrace();
+        } finally {
+            // Restore the console and close the report so the file handle is not leaked.
+            System.setOut(console);
+            report.close();
+        }
+    }
+
+    /**
+     * Runs the scanner that matches {@code file}, if any.
+     *
+     * @return {@code TERMINATE} when the scan was interrupted, {@code CONTINUE} otherwise
+     */
+    private static FileVisitResult scan(Path file) {
+        UDKScanner scanner = scannerFor(file);
+        if (scanner == null) {
+            return FileVisitResult.CONTINUE;
+        }
+        try {
+            scanner.getUDK(file.toFile());
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            return FileVisitResult.TERMINATE;
+        } catch (IOException | RuntimeException e) {
+            if (e.getCause() instanceof InterruptedException) {
+                Thread.currentThread().interrupt();
+                return FileVisitResult.TERMINATE;
+            }
+            // One broken document (or a missing external tool) must not abort the whole walk.
+            System.err.println("Ошибка обработки файла: " + file.toAbsolutePath() + " - " + e);
+        }
+        return FileVisitResult.CONTINUE;
+    }
+
+    /**
+     * Picks a scanner by file extension, ignoring case so that {@code .PDF} and {@code .DJVU} match too.
+     *
+     * @return a new scanner, or {@code null} when the extension is not supported
+     */
+    private static UDKScanner scannerFor(Path file) {
+        String name = file.getFileName().toString().toLowerCase(Locale.ROOT);
+        if (name.endsWith(".pdf")) {
+            return new UdkPdfScanner();
+        }
+        if (name.endsWith(".djvu")) {
+            return new UdkDjvuScanner();
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/DocumentUtils.java b/src/main/java/ru/mcs/udk/DocumentUtils.java
new file mode 100644
index 0000000..478d22f
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DocumentUtils.java
@@ -0,0 +1,79 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Shared helpers for the scanners: Tesseract OCR of page images, UDC code extraction
+ * from text and a file-name language heuristic.
+ */
+public class DocumentUtils {
+    /** System property that overrides the Tesseract data directory. */
+    private static final String TESSDATA_PROPERTY = "tessdata.path";
+    /** Data directory used when the {@code tessdata.path} system property is not set. */
+    private static final String DEFAULT_TESSDATA = "d:\\program\\Tesseract-OCR\\tessdata\\";
+    /** Cyrillic UDK marker in either case, optionally spaced out, then a run of digits and dots. */
+    private static final Pattern UDK_PATTERN = Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9.]+)");
+    /** Three or more consecutive Cyrillic characters anywhere in the string. */
+    private static final Pattern CYRILLIC_PATTERN = Pattern.compile(".*[\\p{IsCyrillic}]{3,}.*");
+
+    /** Manual smoke test: OCRs {@code book3.PNG} from the working directory and prints the text and the code found. */
+    public static void main(String[] args) {
+        try {
+            String text = newTesseract().doOCR(new File("book3.PNG"));
+            System.out.println(text);
+            String udk = findUDK(text);
+            System.out.println("UDK " + udk);
+        } catch (TesseractException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Recognises Russian text in an image.
+     *
+     * @param image page image to OCR
+     * @return the text returned by Tesseract
+     * @throws TesseractException if OCR fails
+     */
+    public static String getText(BufferedImage image) throws TesseractException {
+        return newTesseract().doOCR(image);
+    }
+
+    /**
+     * Extracts the first UDC code from {@code text}.
+     *
+     * @param text text to search; may be {@code null}
+     * @return the digits-and-dots run that follows the Cyrillic UDK marker, or {@code null} when there is none
+     */
+    public static String findUDK(String text) {
+        if (text == null) {
+            return null;
+        }
+        Matcher matcher = UDK_PATTERN.matcher(text);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
+    /**
+     * Tells whether {@code fileName} contains at least three consecutive Cyrillic characters.
+     *
+     * @param fileName name to test; may be {@code null}
+     * @return {@code true} on a match, {@code false} otherwise or for {@code null}
+     */
+    public static boolean isCyrillic(String fileName) {
+        return fileName != null && CYRILLIC_PATTERN.matcher(fileName).matches();
+    }
+
+    /** Creates a Tesseract engine configured for Russian; kept in one place so both callers agree. */
+    private static Tesseract newTesseract() {
+        Tesseract tesseract = new Tesseract();
+        tesseract.setDatapath(System.getProperty(TESSDATA_PROPERTY, DEFAULT_TESSDATA));
+        tesseract.setLanguage("rus");
+        return tesseract;
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/OCRUtils.java b/src/main/java/ru/mcs/udk/OCRUtils.java
deleted file mode 100644
index 2e2f099..0000000
--- a/src/main/java/ru/mcs/udk/OCRUtils.java
+++ /dev/null
@@ -1,32 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.Tesseract;
-import net.sourceforge.tess4j.TesseractException;
-
-import java.awt.image.BufferedImage;
-import java.io.File;
-
-public class OCRUtils {
- public static void main(String[] args) {
- Tesseract tesseract = new Tesseract();
- tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\");
- tesseract.setLanguage("rus");
-
- try {
- String text = tesseract.doOCR(new File("book3.PNG"));
- System.out.println(text);
- String udk = PdfScanner.findUDK(text);
- System.out.println("UDK " + udk);
- } catch (TesseractException e) {
- e.printStackTrace();
- }
- }
-
- public static String getText(BufferedImage image) throws TesseractException {
- Tesseract tesseract = new Tesseract();
- tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\");
- tesseract.setLanguage("rus");
-
- return tesseract.doOCR(image);
- }
-}
diff --git a/src/main/java/ru/mcs/udk/PdfScanner.java b/src/main/java/ru/mcs/udk/PdfScanner.java
deleted file mode 100644
index 28cb343..0000000
--- a/src/main/java/ru/mcs/udk/PdfScanner.java
+++ /dev/null
@@ -1,112 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.TesseractException;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.text.PDFTextStripper;
-
-import java.awt.image.BufferedImage;
-import java.io.*;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.*;
-import java.nio.file.attribute.BasicFileAttributes;
-import java.util.Objects;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class PdfScanner {
- public static void main(String[] args) throws FileNotFoundException {
- if (args.length == 0) {
- System.out.println("Пожалуйста, укажите путь к папке.");
- return;
- }
-
- String directoryPath = args[0];
- Path startDir = Paths.get(directoryPath);
-
- if (!Files.exists(startDir) || !Files.isDirectory(startDir)) {
- System.out.println("Указанный путь не существует или не является папкой.");
- return;
- }
-
- System.setOut(new PrintStream(new FileOutputStream("book-list.csv"), true, StandardCharsets.UTF_8));
- try {
- Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
- @Override
- public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
- if (file.toAbsolutePath().toString().matches(".*\\.pdf$")) {
- // Выводим путь и имя файла
- getUdk(file.toAbsolutePath());
- }
- return FileVisitResult.CONTINUE;
- }
-
- @Override
- public FileVisitResult visitFileFailed(Path file, IOException exc) {
- // Обрабатываем ошибки доступа к файлам
- System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
- return FileVisitResult.CONTINUE;
- }
- });
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- public static void getUdk(Path filePath) {
- File file = filePath.toFile();
- String languageBook = "";
- try (PDDocument document = PDDocument.load(file)) {
- PDFTextStripper stripper = new PDFTextStripper();
- // Устанавливаем диапазон страниц для анализа (первые три страницы)
- stripper.setStartPage(1);
- stripper.setEndPage(Math.min(3, document.getNumberOfPages()));
- String text = stripper.getText(document);
- String udk = findUDK(text);
-
- languageBook = isCyrillic(String.valueOf(filePath.getFileName())) ? "ru" : "en";
-
- if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
- udk = getUdkByImage(document);
- }
-
- System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
- } catch (IOException e) {
- System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse("", ""), languageBook, e.getMessage());
- } catch (TesseractException e) {
- e.printStackTrace();
- }
- }
-
- private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
- PDFRenderer renderer = new PDFRenderer(document);
-
- for (int pageIndex = 0; pageIndex < 5; pageIndex++) {
- BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
- String text = OCRUtils.getText(image);
- String udk = findUDK(text);
- if (udk != null) {
- return udk;
- }
- }
- return "";
- }
-
- public static String findUDK(String text) {
- // Регулярное выражение для поиска УДК
- Pattern pattern = Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9.]+)");
- Matcher matcher = pattern.matcher(text);
-
- if (matcher.find()) {
- return matcher.group(1);
- }
- return null;
- }
-
- public static boolean isCyrillic(String fileName) {
- String regex = ".*[\\p{IsCyrillic}]{3,}.*";
- Pattern pattern = Pattern.compile(regex);
- Matcher matcher = pattern.matcher(fileName);
- return matcher.matches();
- }
-}
diff --git a/src/main/java/ru/mcs/udk/UDKScanner.java b/src/main/java/ru/mcs/udk/UDKScanner.java
new file mode 100644
index 0000000..5b8e12b
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UDKScanner.java
@@ -0,0 +1,9 @@
+package ru.mcs.udk;
+
+import java.io.File;
+import java.io.IOException;
+/** Contract for scanners that look up the UDC (Russian: UDK) classification code of a single document file. */
+public interface UDKScanner {
+ /** Processes {@code file}; nothing is returned, so any result has to be reported by the implementation itself. */
+ void getUDK(File file) throws IOException, InterruptedException;
+}
diff --git a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
new file mode 100644
index 0000000..e4a1a10
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
@@ -0,0 +1,65 @@
+package ru.mcs.udk;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * {@link UDKScanner} for DjVu files. Renders the leading pages to PNG images in the working
+ * directory with the external {@code ddjvu} tool and prints one CSV row per document.
+ *
+ * <p>NOTE(review): the rendered images are not OCR'd here, so the code and language columns
+ * of the row stay empty.
+ */
+public class UdkDjvuScanner implements UDKScanner {
+    /** How many leading pages are rendered for each document. */
+    private static final int PAGES_TO_RENDER = 6;
+
+    /**
+     * Renders the first pages of {@code djvuFile} and prints {@code path;;;error} to standard output.
+     *
+     * @param djvuFile DjVu document to process
+     * @throws IllegalStateException if the thread is interrupted while waiting for {@code ddjvu}
+     */
+    @Override
+    public void getUDK(File djvuFile) {
+        String error = "";
+        try {
+            // ddjvu counts pages from 1, so the loop does as well.
+            for (int page = 1; page <= PAGES_TO_RENDER && error.isEmpty(); page++) {
+                int exitCode = renderPage(djvuFile, page);
+                if (exitCode != 0) {
+                    error = "ddjvu exit code " + exitCode + " (page " + page + ")";
+                }
+            }
+        } catch (IOException e) {
+            // Same convention as the PDF scanner: the failure goes into the last CSV column.
+            error = String.valueOf(e.getMessage());
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("Interrupted while converting " + djvuFile, e);
+        }
+
+        // Exactly one row per document, whatever the number of rendered pages.
+        System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), "", "", error);
+    }
+
+    /**
+     * Runs {@code ddjvu} for one page and waits for it to finish.
+     *
+     * @return the exit code of the {@code ddjvu} process
+     */
+    private static int renderPage(File djvuFile, int page) throws IOException, InterruptedException {
+        String outputFile = String.format("%s_page_%d.png", djvuFile.getName(), page);
+        // NOTE(review): confirm that the installed ddjvu build accepts png as an output format.
+        Process process = new ProcessBuilder("ddjvu",
+                "-format=png",
+                "-page=" + page,
+                djvuFile.getAbsolutePath(),
+                outputFile)
+                // Nobody reads the tool's output; discard it so a full pipe cannot stall the child.
+                .redirectErrorStream(true)
+                .redirectOutput(ProcessBuilder.Redirect.DISCARD)
+                .start();
+        return process.waitFor();
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/UdkPdfScanner.java b/src/main/java/ru/mcs/udk/UdkPdfScanner.java
new file mode 100644
index 0000000..713d511
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UdkPdfScanner.java
@@ -0,0 +1,77 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.Objects;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+import static ru.mcs.udk.DocumentUtils.isCyrillic;
+
+/**
+ * {@link UDKScanner} for PDF files. Looks for the UDC code in the text layer of the first
+ * pages and, for books with a Cyrillic file name, falls back to OCR of rendered pages.
+ * Prints one CSV row {@code path;udk;language;error} per document to standard output.
+ */
+public class UdkPdfScanner implements UDKScanner {
+    /** Leading pages whose text layer is searched. */
+    private static final int TEXT_PAGES = 3;
+    /** Leading pages that are rendered and OCR'd when the text layer has no code. */
+    private static final int OCR_PAGES = 5;
+    /** Resolution used when rendering pages for OCR. */
+    private static final int OCR_DPI = 300;
+
+    /**
+     * Scans {@code file} and prints its CSV row; load, render and OCR failures are reported
+     * in the last column instead of being thrown.
+     *
+     * @param file PDF document to scan
+     */
+    @Override
+    public void getUDK(File file) {
+        // Depends only on the file name, so it is known even when the document fails to load.
+        String languageBook = isCyrillic(file.getName()) ? "ru" : "en";
+        String udk = null;
+        String error = "";
+        try (PDDocument document = Loader.loadPDF(file)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            stripper.setStartPage(1);
+            stripper.setEndPage(Math.min(TEXT_PAGES, document.getNumberOfPages()));
+            udk = findUDK(stripper.getText(document));
+
+            if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
+                udk = getUdkByImage(document);
+            }
+        } catch (IOException | TesseractException e) {
+            // An OCR failure used to drop the row entirely; report it like an I/O failure.
+            error = String.valueOf(e.getMessage());
+        }
+        System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, error);
+    }
+
+    /**
+     * OCRs the leading pages until one yields a UDC code.
+     *
+     * @return the code found, or an empty string when no scanned page contains one
+     */
+    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
+        PDFRenderer renderer = new PDFRenderer(document);
+
+        // Never ask for a page the document does not have: short PDFs have fewer than OCR_PAGES.
+        int lastPage = Math.min(OCR_PAGES, document.getNumberOfPages());
+        for (int pageIndex = 0; pageIndex < lastPage; pageIndex++) {
+            BufferedImage image = renderer.renderImageWithDPI(pageIndex, OCR_DPI);
+            String udk = findUDK(DocumentUtils.getText(image));
+            if (udk != null) {
+                return udk;
+            }
+        }
+        return "";
+    }
+}