diff --git a/README.md b/README.md
index 6644308..d3757f9 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
udk-pdf-scanner
===============
-Search UDK in PDF and DJVU
\ No newline at end of file
+Программа для поиска номеров УДК в файлах PDF и DJVU.
+Для распознавания используется OCR Tesseract 5 https://github.com/tesseract-ocr/tesseract
+Для Windows 7 я использую https://github.com/UB-Mannheim/tesseract/wiki
+Для работы с djvu используется DjVuLibre https://djvu.sourceforge.net/index.html
diff --git a/pom.xml b/pom.xml
index f34541e..41040dd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -34,11 +34,6 @@
tess4j
5.15.0
-
-
-
-
-
@@ -55,7 +50,7 @@
true
- ru.mcs.udk.UdkPdfScanner
+ ru.mcs.udk.UdkScannerExecutor
diff --git a/src/main/java/ru/mcs/udk/DjvuScanner.java b/src/main/java/ru/mcs/udk/DjvuScanner.java
new file mode 100644
index 0000000..0df0bd4
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DjvuScanner.java
@@ -0,0 +1,108 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+
+/**
+ * Looks for a UDK number in the leading pages of a DjVu file.
+ *
+ * <p>Each page is rendered to a temporary TIFF by the external {@code ddjvu} tool and the
+ * resulting image is passed to {@code DocumentUtils.getText} for text recognition.
+ */
+public class DjvuScanner implements ScannerDocument {
+
+    /** Number of leading pages that are rendered and searched. */
+    private static final int PAGES_TO_SCAN = 6;
+
+    /**
+     * Scans the first pages of {@code djvuFile} and prints one row
+     * {@code path;udk;language;error} to standard output.
+     *
+     * <p>The search stops at the first page that yields a UDK; the language column is
+     * {@code "ru"} when a UDK was found and empty otherwise.
+     *
+     * @param djvuFile the DjVu document to scan
+     * @throws UncheckedIOException if {@code ddjvu} cannot be started or the page image cannot be read
+     * @throws IllegalStateException if the thread is interrupted while waiting for {@code ddjvu}
+     */
+    @Override
+    public void getUDK(File djvuFile) {
+        String udk = "";
+        // NOTE(review): page numbers 0..5 are kept from the previous implementation; ddjvu page
+        // specifications may be 1-based - confirm against the ddjvu manual.
+        for (int pageIndex = 0; pageIndex < PAGES_TO_SCAN; pageIndex++) {
+            String found = scanPage(djvuFile, pageIndex);
+            if (found != null && !found.isBlank()) {
+                // Stop here so that a later page without a UDK cannot overwrite the hit.
+                udk = found;
+                break;
+            }
+        }
+        System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), udk, udk.isEmpty() ? "" : "ru", "");
+    }
+
+    /**
+     * Renders one page with {@code ddjvu}, runs text recognition on it and searches the text for a UDK.
+     *
+     * @return the value returned by {@code findUDK}, or {@code null} when the page could not be
+     *         rendered, decoded or recognised
+     */
+    private static String scanPage(File djvuFile, int pageIndex) {
+        Path pageImage = null;
+        try {
+            pageImage = Files.createTempFile("udk-page-", ".tiff");
+            Process process = new ProcessBuilder("ddjvu",
+                    "-format=tiff",
+                    "-quality=90",
+                    "-page=" + pageIndex,
+                    djvuFile.getAbsolutePath(),
+                    pageImage.toString())
+                    // Discard ddjvu's output so a full pipe buffer can never block waitFor().
+                    .redirectErrorStream(true)
+                    .redirectOutput(ProcessBuilder.Redirect.DISCARD)
+                    .start();
+            if (process.waitFor() != 0) {
+                return null;
+            }
+            BufferedImage image = ImageIO.read(pageImage.toFile());
+            if (image == null) {
+                // ImageIO returns null when no registered reader can decode the file.
+                return null;
+            }
+            return findUDK(DocumentUtils.getText(image));
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag before propagating.
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("Interrupted while waiting for ddjvu", e);
+        } catch (TesseractException e) {
+            // A recognition failure on one page is not fatal: report it and let the caller try the next page.
+            System.err.printf("OCR failed for %s (page %d): %s%n", djvuFile.getPath(), pageIndex, e.getMessage());
+            return null;
+        } finally {
+            deleteQuietly(pageImage);
+        }
+    }
+
+    /** Removes the temporary page image; a failure to delete is deliberately ignored. */
+    private static void deleteQuietly(Path path) {
+        if (path == null) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(path);
+        } catch (IOException ignored) {
+            // Best effort only: a leftover temp file must not fail the scan.
+        }
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/DocScanner.java b/src/main/java/ru/mcs/udk/DocScanner.java
deleted file mode 100644
index 9f49789..0000000
--- a/src/main/java/ru/mcs/udk/DocScanner.java
+++ /dev/null
@@ -1,54 +0,0 @@
-package ru.mcs.udk;
-
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.*;
-import java.nio.file.attribute.BasicFileAttributes;
-
-
-public class DocScanner {
- public static void main(String[] args) throws FileNotFoundException {
- if (args.length == 0) {
- System.out.println("Пожалуйста, укажите путь к папке.");
- return;
- }
-
- String directoryPath = args[0];
- Path startDir = Paths.get(directoryPath);
-
- if (!Files.exists(startDir) || !Files.isDirectory(startDir)) {
- System.out.println("Указанный путь не существует или не является папкой.");
- return;
- }
-
- System.setOut(new PrintStream(new FileOutputStream("book-list.csv"), true, StandardCharsets.UTF_8));
- try {
- Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
- @Override
- public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
- if (file.toAbsolutePath().toString().matches(".*\\.pdf$")) {
- // Выводим путь и имя файла
- UdkPdfScanner pdfScanner = new UdkPdfScanner();
- pdfScanner.getUDK(file.toFile());
- } else if (file.toAbsolutePath().toString().matches(".*\\.djvu$")) {
- UdkDjvuScanner djvuScanner = new UdkDjvuScanner();
- djvuScanner.getUDK(file.toFile());
- }
- return FileVisitResult.CONTINUE;
- }
-
- @Override
- public FileVisitResult visitFileFailed(Path file, IOException exc) {
- // Обрабатываем ошибки доступа к файлам
- System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
- return FileVisitResult.CONTINUE;
- }
- });
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-}
diff --git a/src/main/java/ru/mcs/udk/PdfScanner.java b/src/main/java/ru/mcs/udk/PdfScanner.java
new file mode 100644
index 0000000..0a8d6e2
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/PdfScanner.java
@@ -0,0 +1,57 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.Objects;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+import static ru.mcs.udk.DocumentUtils.isCyrillic;
+
+public class PdfScanner implements ScannerDocument {
+
+ @Override
+ public void getUDK(File file) {
+ String languageBook = "";
+ try (PDDocument document = Loader.loadPDF(file)) {
+ PDFTextStripper stripper = new PDFTextStripper();
+ // Устанавливаем диапазон страниц для анализа (первые три страницы)
+ stripper.setStartPage(1);
+ stripper.setEndPage(Math.min(3, document.getNumberOfPages()));
+ String text = stripper.getText(document);
+ String udk = findUDK(text);
+
+ languageBook = isCyrillic(file.getName()) ? "ru" : "en";
+
+ if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
+ udk = getUdkByImage(document);
+ }
+
+ System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
+ } catch (IOException e) {
+ System.out.printf("%s;%s;%s;%s\n", file.getPath(), "", languageBook, e.getMessage());
+ } catch (TesseractException e) {
+ e.printStackTrace();
+ }
+ }
+
+ private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
+ PDFRenderer renderer = new PDFRenderer(document);
+
+ for (int pageIndex = 0; pageIndex < 5; pageIndex++) {
+ BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
+ String text = DocumentUtils.getText(image);
+ String udk = findUDK(text);
+ if (udk != null) {
+ return udk;
+ }
+ }
+ return "";
+ }
+}
diff --git a/src/main/java/ru/mcs/udk/ScannerDocument.java b/src/main/java/ru/mcs/udk/ScannerDocument.java
new file mode 100644
index 0000000..cedcb40
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/ScannerDocument.java
@@ -0,0 +1,9 @@
+package ru.mcs.udk;
+
+import java.io.File;
+import java.io.IOException;
+/** Contract for a scanner that processes one document file in search of its UDK number. */
+public interface ScannerDocument {
+ /** Scans {@code file} for a UDK; the signature allows for I/O failure and interruption. */
+ void getUDK(File file) throws IOException, InterruptedException;
+}
diff --git a/src/main/java/ru/mcs/udk/UDKScanner.java b/src/main/java/ru/mcs/udk/UDKScanner.java
deleted file mode 100644
index 5b8e12b..0000000
--- a/src/main/java/ru/mcs/udk/UDKScanner.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package ru.mcs.udk;
-
-import java.io.File;
-import java.io.IOException;
-
-public interface UDKScanner {
-
- void getUDK(File file) throws IOException, InterruptedException;
-}
diff --git a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
deleted file mode 100644
index a72a940..0000000
--- a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.TesseractException;
-
-import javax.imageio.ImageIO;
-import java.awt.image.BufferedImage;
-import java.io.File;
-import java.io.IOException;
-
-import static ru.mcs.udk.DocumentUtils.findUDK;
-
-public class UdkDjvuScanner implements UDKScanner {
- @Override
- public void getUDK(File djvuFile) {
- String udk = "";
- for (int pageIndex = 0; pageIndex < 6; pageIndex++) {
- String outputFile = String.format("temp/page_%d.tiff", pageIndex);
- Process process;
- try {
- process = new ProcessBuilder("ddjvu",
- "-format=tiff",
- "-quality=90",
- "-page=" + pageIndex,
- djvuFile.getAbsolutePath(),
- outputFile).start();
- int exitCode = process.waitFor();
- if (exitCode == 0) {
- BufferedImage image = ImageIO.read(new File(outputFile));
- String text = DocumentUtils.getText(image);
- udk = findUDK(text);
- }
- } catch (InterruptedException | IOException e) {
- throw new RuntimeException(e);
- } catch (TesseractException e) {
- e.printStackTrace();
- }
-
- }
- System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), udk, "", "");
- }
-}
diff --git a/src/main/java/ru/mcs/udk/UdkPdfScanner.java b/src/main/java/ru/mcs/udk/UdkPdfScanner.java
deleted file mode 100644
index a682312..0000000
--- a/src/main/java/ru/mcs/udk/UdkPdfScanner.java
+++ /dev/null
@@ -1,57 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.TesseractException;
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.text.PDFTextStripper;
-
-import java.awt.image.BufferedImage;
-import java.io.File;
-import java.io.IOException;
-import java.util.Objects;
-
-import static ru.mcs.udk.DocumentUtils.findUDK;
-import static ru.mcs.udk.DocumentUtils.isCyrillic;
-
-public class UdkPdfScanner implements UDKScanner {
-
- @Override
- public void getUDK(File file) {
- String languageBook = "";
- try (PDDocument document = Loader.loadPDF(file)) {
- PDFTextStripper stripper = new PDFTextStripper();
- // Устанавливаем диапазон страниц для анализа (первые три страницы)
- stripper.setStartPage(1);
- stripper.setEndPage(Math.min(3, document.getNumberOfPages()));
- String text = stripper.getText(document);
- String udk = findUDK(text);
-
- languageBook = isCyrillic(file.getName()) ? "ru" : "en";
-
- if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
- udk = getUdkByImage(document);
- }
-
- System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
- } catch (IOException e) {
- System.out.printf("%s;%s;%s;%s\n", file.getPath(), "", languageBook, e.getMessage());
- } catch (TesseractException e) {
- e.printStackTrace();
- }
- }
-
- private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
- PDFRenderer renderer = new PDFRenderer(document);
-
- for (int pageIndex = 0; pageIndex < 5; pageIndex++) {
- BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
- String text = DocumentUtils.getText(image);
- String udk = findUDK(text);
- if (udk != null) {
- return udk;
- }
- }
- return "";
- }
-}
diff --git a/src/main/java/ru/mcs/udk/UdkScannerExecutor.java b/src/main/java/ru/mcs/udk/UdkScannerExecutor.java
new file mode 100644
index 0000000..223236a
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UdkScannerExecutor.java
@@ -0,0 +1,104 @@
+package ru.mcs.udk;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.*;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Locale;
+
+/**
+ * Command-line entry point: walks a directory tree and runs the matching scanner on every
+ * PDF and DjVu file, with standard output redirected to {@code book-list.csv}.
+ */
+public class UdkScannerExecutor {
+
+    /** File, relative to the working directory, that standard output is redirected to. */
+    private static final String OUTPUT_FILE = "book-list.csv";
+
+    /** The scanners keep no per-file state, so one instance of each is shared by all files. */
+    private static final ScannerDocument PDF_SCANNER = new PdfScanner();
+    private static final ScannerDocument DJVU_SCANNER = new DjvuScanner();
+
+    /**
+     * Scans the directory given as the first argument.
+     *
+     * @param args {@code args[0]} is the directory to walk
+     * @throws FileNotFoundException if the output file cannot be opened for writing
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        if (args.length == 0) {
+            System.out.println("Пожалуйста, укажите путь к папке.");
+            return;
+        }
+
+        Path startDir = Paths.get(args[0]);
+        // Files.isDirectory is false for a missing path, so it covers the existence check too.
+        if (!Files.isDirectory(startDir)) {
+            System.out.println("Указанный путь не существует или не является папкой.");
+            return;
+        }
+
+        PrintStream csv = new PrintStream(new FileOutputStream(OUTPUT_FILE), true, StandardCharsets.UTF_8);
+        // The scanners print their rows to System.out, so point it at the CSV file.
+        System.setOut(csv);
+        try (csv) {
+            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+                    return scan(file);
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) {
+                    // Report entries that cannot be accessed and keep walking.
+                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Runs the scanner that matches {@code file}, if any, and isolates its failures so that one
+     * bad document does not abort the whole walk.
+     *
+     * @return {@code TERMINATE} when the thread was interrupted, {@code CONTINUE} otherwise
+     */
+    private static FileVisitResult scan(Path file) {
+        ScannerDocument scanner = scannerFor(file);
+        if (scanner == null) {
+            return FileVisitResult.CONTINUE;
+        }
+        try {
+            scanner.getUDK(file.toFile());
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag and stop walking.
+            Thread.currentThread().interrupt();
+            return FileVisitResult.TERMINATE;
+        } catch (IOException | RuntimeException e) {
+            System.err.println("Ошибка обработки файла: " + file.toAbsolutePath() + " - " + e.getMessage());
+        }
+        return FileVisitResult.CONTINUE;
+    }
+
+    /**
+     * Chooses a scanner from the file extension, ignoring case.
+     *
+     * @return the scanner for {@code .pdf} or {@code .djvu} files, or {@code null} for any other file
+     */
+    private static ScannerDocument scannerFor(Path file) {
+        String name = file.getFileName().toString().toLowerCase(Locale.ROOT);
+        if (name.endsWith(".pdf")) {
+            return PDF_SCANNER;
+        }
+        if (name.endsWith(".djvu")) {
+            return DJVU_SCANNER;
+        }
+        return null;
+    }
+}