diff --git a/README.md b/README.md
index 6644308..d3757f9 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,7 @@
 udk-pdf-scanner
 ===============
 
-Search UDK in PDF and DJVU
\ No newline at end of file
+Программа для поиска номеров УДК в PDF- и DJVU-файлах.
+Для распознавания используется OCR Tesseract 5 https://github.com/tesseract-ocr/tesseract
+Для Windows 7 я использую https://github.com/UB-Mannheim/tesseract/wiki
+Для работы с djvu используется DjVuLibre https://djvu.sourceforge.net/index.html
diff --git a/pom.xml b/pom.xml
index f34541e..41040dd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -34,11 +34,6 @@
 tess4j
 5.15.0
-
-
-
-
-
@@ -55,7 +50,7 @@
 true
- ru.mcs.udk.UdkPdfScanner
+ ru.mcs.udk.UdkScannerExecutor
diff --git a/src/main/java/ru/mcs/udk/DjvuScanner.java b/src/main/java/ru/mcs/udk/DjvuScanner.java
new file mode 100644
index 0000000..0df0bd4
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DjvuScanner.java
@@ -0,0 +1,80 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+
+/**
+ * Searches the leading pages of a DjVu file for a UDK number: each page is rendered to TIFF
+ * with the external {@code ddjvu} tool and the rendered image is passed to OCR.
+ */
+public class DjvuScanner implements ScannerDocument {
+    /** Exclusive upper bound of the page indexes handed to {@code ddjvu}. */
+    private static final int PAGES_TO_SCAN = 6;
+
+    /**
+     * Scans the leading pages and prints one {@code path;udk;language;error} row to
+     * {@code System.out}; the first page that yields a UDK wins.
+     *
+     * @param djvuFile DjVu file to scan
+     * @throws RuntimeException if {@code ddjvu} cannot be run or the wait for it is interrupted
+     */
+    @Override
+    public void getUDK(File djvuFile) {
+        // Rendered pages are written under temp/, so make sure the directory exists.
+        new File("temp").mkdirs();
+        String udk = "";
+        for (int pageIndex = 0; pageIndex < PAGES_TO_SCAN; pageIndex++) {
+            String found = scanPage(djvuFile, pageIndex);
+            if (found != null && !found.isBlank()) {
+                // Keep the first hit: later pages must not overwrite it or reset it to null.
+                udk = found;
+                break;
+            }
+        }
+        System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), udk, (udk != null && !udk.isEmpty()) ? "ru" : "", "");
+    }
+
+    /**
+     * Renders one page with {@code ddjvu} and runs OCR on it; returns what {@code findUDK}
+     * reports, or {@code null} when the page could not be rendered, decoded or recognized.
+     */
+    private static String scanPage(File djvuFile, int pageIndex) {
+        String outputFile = String.format("temp/page_%d.tiff", pageIndex);
+        try {
+            Process process = new ProcessBuilder("ddjvu",
+                    "-format=tiff",
+                    "-quality=90",
+                    "-page=" + pageIndex,
+                    djvuFile.getAbsolutePath(),
+                    outputFile)
+                    // Discard the tool's output so a full pipe buffer can never block waitFor().
+                    .redirectErrorStream(true)
+                    .redirectOutput(ProcessBuilder.Redirect.DISCARD)
+                    .start();
+            if (process.waitFor() != 0) {
+                return null;
+            }
+            BufferedImage image = ImageIO.read(new File(outputFile));
+            if (image == null) {
+                // ImageIO found no reader able to decode the rendered file.
+                return null;
+            }
+            return findUDK(DocumentUtils.getText(image));
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag before propagating.
+            Thread.currentThread().interrupt();
+            throw new RuntimeException(e);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        } catch (TesseractException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/DocScanner.java b/src/main/java/ru/mcs/udk/DocScanner.java
deleted file mode 100644
index 9f49789..0000000
--- a/src/main/java/ru/mcs/udk/DocScanner.java
+++ /dev/null
@@ -1,54 +0,0 @@
-package ru.mcs.udk;
-
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.*;
-import java.nio.file.attribute.BasicFileAttributes;
-
-
-public class DocScanner {
-    public static void main(String[] args) throws FileNotFoundException {
-        if (args.length == 0) {
-            System.out.println("Пожалуйста, укажите путь к папке.");
-            return;
-        }
-
-        String directoryPath = args[0];
-        Path startDir = Paths.get(directoryPath);
-
-        if (!Files.exists(startDir) || !Files.isDirectory(startDir)) {
-            System.out.println("Указанный путь не существует или не является папкой.");
-            return;
-        }
-
-        System.setOut(new PrintStream(new FileOutputStream("book-list.csv"), true, StandardCharsets.UTF_8));
-        try {
-            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
-                @Override
-                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
-                    if (file.toAbsolutePath().toString().matches(".*\\.pdf$")) {
-                        // Выводим путь и имя файла
-                        UdkPdfScanner pdfScanner = new UdkPdfScanner();
-                        pdfScanner.getUDK(file.toFile());
-                    } else if (file.toAbsolutePath().toString().matches(".*\\.djvu$")) {
-                        UdkDjvuScanner djvuScanner = new UdkDjvuScanner();
-                        djvuScanner.getUDK(file.toFile());
-                    }
-                    return FileVisitResult.CONTINUE;
-                }
-
-                @Override
-                public FileVisitResult visitFileFailed(Path file, IOException exc) {
-                    // Обрабатываем ошибки доступа к файлам
-                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
-                    return FileVisitResult.CONTINUE;
-                }
-            });
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-}
diff --git a/src/main/java/ru/mcs/udk/PdfScanner.java b/src/main/java/ru/mcs/udk/PdfScanner.java
new file mode 100644
index 0000000..0a8d6e2
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/PdfScanner.java
@@ -0,0 +1,76 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.Objects;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+import static ru.mcs.udk.DocumentUtils.isCyrillic;
+
+/**
+ * Looks for a UDK number in a PDF: first in the text of the leading pages, then, for files
+ * with a Cyrillic name, by OCR of the rendered pages.
+ */
+public class PdfScanner implements ScannerDocument {
+    /** Last page (1-based, inclusive) whose text is extracted. */
+    private static final int TEXT_PAGES = 3;
+    /** Number of leading pages rendered for OCR. */
+    private static final int OCR_PAGES = 5;
+    /** Resolution used when rendering pages for OCR. */
+    private static final int OCR_DPI = 300;
+
+    /**
+     * Prints one {@code path;udk;language;error} row to {@code System.out} for the given PDF.
+     *
+     * @param file PDF file to scan
+     */
+    @Override
+    public void getUDK(File file) {
+        String languageBook = "";
+        try (PDDocument document = Loader.loadPDF(file)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            // Restrict text extraction to the first pages of the document.
+            stripper.setStartPage(1);
+            stripper.setEndPage(Math.min(TEXT_PAGES, document.getNumberOfPages()));
+            String text = stripper.getText(document);
+            String udk = findUDK(text);
+
+            languageBook = isCyrillic(file.getName()) ? "ru" : "en";
+
+            if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
+                udk = getUdkByImage(document);
+            }
+
+            System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
+        } catch (IOException | TesseractException e) {
+            // Every scanned file gets a row, including the ones whose OCR failed.
+            System.out.printf("%s;%s;%s;%s\n", file.getPath(), "", languageBook, e.getMessage());
+        }
+    }
+
+    /**
+     * Runs OCR over the leading pages and returns the first UDK found, or an empty string.
+     */
+    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
+        PDFRenderer renderer = new PDFRenderer(document);
+        // Never ask the renderer for a page the document does not have.
+        int pageCount = Math.min(OCR_PAGES, document.getNumberOfPages());
+
+        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
+            BufferedImage image = renderer.renderImageWithDPI(pageIndex, OCR_DPI);
+            String text = DocumentUtils.getText(image);
+            String udk = findUDK(text);
+            if (udk != null && !udk.isBlank()) {
+                return udk;
+            }
+        }
+        return "";
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/ScannerDocument.java b/src/main/java/ru/mcs/udk/ScannerDocument.java
new file mode 100644
index 0000000..cedcb40
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/ScannerDocument.java
@@ -0,0 +1,17 @@
+package ru.mcs.udk;
+
+import java.io.File;
+import java.io.IOException;
+
+/** Scanner that looks for a UDK number in a single document file. */
+public interface ScannerDocument {
+
+    /**
+     * Scans the given file for a UDK number.
+     *
+     * @param file document to scan
+     * @throws IOException          declared for implementations that fail on I/O
+     * @throws InterruptedException declared for implementations that wait on external work
+     */
+    void getUDK(File file) throws IOException, InterruptedException;
+}
diff --git a/src/main/java/ru/mcs/udk/UDKScanner.java b/src/main/java/ru/mcs/udk/UDKScanner.java
deleted file mode 100644
index 5b8e12b..0000000
--- a/src/main/java/ru/mcs/udk/UDKScanner.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package ru.mcs.udk;
-
-import java.io.File;
-import java.io.IOException;
-
-public interface UDKScanner {
-
-    void getUDK(File file) throws IOException, InterruptedException;
-}
diff --git a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
deleted file mode 100644
index a72a940..0000000
--- a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
+++ /dev/null
@@ -1,41 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.TesseractException;
-
-import javax.imageio.ImageIO;
-import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; - -import static ru.mcs.udk.DocumentUtils.findUDK; - -public class UdkDjvuScanner implements UDKScanner { - @Override - public void getUDK(File djvuFile) { - String udk = ""; - for (int pageIndex = 0; pageIndex < 6; pageIndex++) { - String outputFile = String.format("temp/page_%d.tiff", pageIndex); - Process process; - try { - process = new ProcessBuilder("ddjvu", - "-format=tiff", - "-quality=90", - "-page=" + pageIndex, - djvuFile.getAbsolutePath(), - outputFile).start(); - int exitCode = process.waitFor(); - if (exitCode == 0) { - BufferedImage image = ImageIO.read(new File(outputFile)); - String text = DocumentUtils.getText(image); - udk = findUDK(text); - } - } catch (InterruptedException | IOException e) { - throw new RuntimeException(e); - } catch (TesseractException e) { - e.printStackTrace(); - } - - } - System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), udk, "", ""); - } -} diff --git a/src/main/java/ru/mcs/udk/UdkPdfScanner.java b/src/main/java/ru/mcs/udk/UdkPdfScanner.java deleted file mode 100644 index a682312..0000000 --- a/src/main/java/ru/mcs/udk/UdkPdfScanner.java +++ /dev/null @@ -1,57 +0,0 @@ -package ru.mcs.udk; - -import net.sourceforge.tess4j.TesseractException; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.rendering.PDFRenderer; -import org.apache.pdfbox.text.PDFTextStripper; - -import java.awt.image.BufferedImage; -import java.io.File; -import java.io.IOException; -import java.util.Objects; - -import static ru.mcs.udk.DocumentUtils.findUDK; -import static ru.mcs.udk.DocumentUtils.isCyrillic; - -public class UdkPdfScanner implements UDKScanner { - - @Override - public void getUDK(File file) { - String languageBook = ""; - try (PDDocument document = Loader.loadPDF(file)) { - PDFTextStripper stripper = new PDFTextStripper(); - // Устанавливаем диапазон страниц для анализа (первые три страницы) 
-            stripper.setStartPage(1);
-            stripper.setEndPage(Math.min(3, document.getNumberOfPages()));
-            String text = stripper.getText(document);
-            String udk = findUDK(text);
-
-            languageBook = isCyrillic(file.getName()) ? "ru" : "en";
-
-            if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
-                udk = getUdkByImage(document);
-            }
-
-            System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
-        } catch (IOException e) {
-            System.out.printf("%s;%s;%s;%s\n", file.getPath(), "", languageBook, e.getMessage());
-        } catch (TesseractException e) {
-            e.printStackTrace();
-        }
-    }
-
-    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
-        PDFRenderer renderer = new PDFRenderer(document);
-
-        for (int pageIndex = 0; pageIndex < 5; pageIndex++) {
-            BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
-            String text = DocumentUtils.getText(image);
-            String udk = findUDK(text);
-            if (udk != null) {
-                return udk;
-            }
-        }
-        return "";
-    }
-}
diff --git a/src/main/java/ru/mcs/udk/UdkScannerExecutor.java b/src/main/java/ru/mcs/udk/UdkScannerExecutor.java
new file mode 100644
index 0000000..223236a
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UdkScannerExecutor.java
@@ -0,0 +1,77 @@
+package ru.mcs.udk;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.*;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Locale;
+
+/**
+ * Command-line entry point: walks a directory tree and hands every PDF and DjVu file to the
+ * matching scanner, with {@code System.out} redirected to {@code book-list.csv}.
+ */
+public class UdkScannerExecutor {
+    /**
+     * Runs the scan.
+     *
+     * @param args {@code args[0]} is the directory to walk
+     * @throws FileNotFoundException if {@code book-list.csv} cannot be opened for writing
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        if (args.length == 0) {
+            System.out.println("Пожалуйста, укажите путь к папке.");
+            return;
+        }
+
+        String directoryPath = args[0];
+        Path startDir = Paths.get(directoryPath);
+
+        if (!Files.exists(startDir) || !Files.isDirectory(startDir)) {
+            System.out.println("Указанный путь не существует или не является папкой.");
+            return;
+        }
+
+        // The scanners print their rows to System.out, so it is pointed at the CSV file.
+        PrintStream csv = new PrintStream(new FileOutputStream("book-list.csv"), true, StandardCharsets.UTF_8);
+        System.setOut(csv);
+        try (csv) {
+            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+                    scan(file);
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) {
+                    // Report the inaccessible file and keep walking.
+                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Passes one file to the scanner that matches its extension; other files are ignored.
+     * The extension is compared case-insensitively so that {@code .PDF} and {@code .DjVu} match too.
+     */
+    private static void scan(Path file) {
+        String name = file.toString().toLowerCase(Locale.ROOT);
+        try {
+            if (name.endsWith(".pdf")) {
+                new PdfScanner().getUDK(file.toFile());
+            } else if (name.endsWith(".djvu")) {
+                new DjvuScanner().getUDK(file.toFile());
+            }
+        } catch (RuntimeException e) {
+            // A failure on one file must not abort the walk over the remaining files.
+            System.err.println("Ошибка обработки файла: " + file.toAbsolutePath() + " - " + e.getMessage());
+        }
+    }
+}