diff --git a/pom.xml b/pom.xml
index 247c3aa..f34541e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -16,18 +16,29 @@ udk-pdf-scanner + + + central + https://repo.maven.apache.org/maven2 + + + org.apache.pdfbox pdfbox - 2.0.29 + 3.0.4 net.sourceforge.tess4j tess4j 5.15.0 - + + + + +
@@ -44,7 +55,7 @@ true - ru.mcs.udk.PdfScanner + ru.mcs.udk.DocScanner
diff --git a/src/main/java/ru/mcs/udk/DocScanner.java b/src/main/java/ru/mcs/udk/DocScanner.java
new file mode 100644
index 0000000..9f49789
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DocScanner.java
@@ -0,0 +1,95 @@
+package ru.mcs.udk;
+
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.*;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.Locale;
+
+/**
+ * Command-line entry point: walks a directory tree and writes one CSV row per
+ * PDF or DjVu file to {@code book-list.csv} in the working directory.
+ */
+public class DocScanner {
+    /** Name of the CSV report created in the current working directory. */
+    private static final String REPORT_FILE = "book-list.csv";
+
+    /**
+     * Scans the directory passed as the first command-line argument.
+     *
+     * @param args {@code args[0]} is the root directory to scan
+     * @throws FileNotFoundException if the report file cannot be created
+     */
+    public static void main(String[] args) throws FileNotFoundException {
+        if (args.length == 0) {
+            System.out.println("Пожалуйста, укажите путь к папке.");
+            return;
+        }
+
+        Path startDir = Paths.get(args[0]);
+        // Files.isDirectory is already false for a path that does not exist.
+        if (!Files.isDirectory(startDir)) {
+            System.out.println("Указанный путь не существует или не является папкой.");
+            return;
+        }
+
+        // The scanners print their rows to System.out, so it is redirected to the
+        // report for the duration of the walk, then restored and the file closed.
+        PrintStream console = System.out;
+        PrintStream report = new PrintStream(new FileOutputStream(REPORT_FILE), true, StandardCharsets.UTF_8);
+        try (report) {
+            System.setOut(report);
+            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
+                @Override
+                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
+                    UDKScanner scanner = scannerFor(file);
+                    if (scanner == null) {
+                        return FileVisitResult.CONTINUE;
+                    }
+                    try {
+                        scanner.getUDK(file.toFile());
+                    } catch (InterruptedException e) {
+                        // Keep the interrupt flag set and stop the walk.
+                        Thread.currentThread().interrupt();
+                        return FileVisitResult.TERMINATE;
+                    } catch (IOException | RuntimeException e) {
+                        // One unreadable book must not abort the whole scan.
+                        System.err.println("Ошибка обработки файла: " + file.toAbsolutePath() + " - " + e.getMessage());
+                    }
+                    return FileVisitResult.CONTINUE;
+                }
+
+                @Override
+                public FileVisitResult visitFileFailed(Path file, IOException exc) {
+                    // Report the access error and continue with the next file.
+                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
+                    return FileVisitResult.CONTINUE;
+                }
+            });
+        } catch (IOException e) {
+            e.printStackTrace();
+        } finally {
+            System.setOut(console);
+        }
+    }
+
+    /**
+     * Picks the scanner for a file by its extension, ignoring case.
+     *
+     * @param file file met during the walk
+     * @return a scanner for {@code .pdf} / {@code .djvu} files, or {@code null} for any other file
+     */
+    private static UDKScanner scannerFor(Path file) {
+        String name = file.getFileName().toString().toLowerCase(Locale.ROOT);
+        if (name.endsWith(".pdf")) {
+            return new UdkPdfScanner();
+        }
+        if (name.endsWith(".djvu")) {
+            return new UdkDjvuScanner();
+        }
+        return null;
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/DocumentUtils.java b/src/main/java/ru/mcs/udk/DocumentUtils.java
new file mode 100644
index 0000000..478d22f
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/DocumentUtils.java
@@ -0,0 +1,82 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.Tesseract;
+import net.sourceforge.tess4j.TesseractException;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * OCR and text-matching helpers shared by the document scanners.
+ */
+public class DocumentUtils {
+    /** Tesseract data directory used when the {@code tessdata.path} system property is not set. */
+    private static final String DEFAULT_TESSDATA_PATH = "d:\\program\\Tesseract-OCR\\tessdata\\";
+
+    /** "УДК" in any case, optionally spaced, followed by a code that starts with a digit. */
+    private static final Pattern UDK_PATTERN = Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9][0-9.]*)");
+
+    /** Three or more consecutive Cyrillic characters. */
+    private static final Pattern CYRILLIC_RUN = Pattern.compile("\\p{IsCyrillic}{3,}");
+
+    /**
+     * Manual smoke test: runs OCR on {@code book3.PNG} and prints the text and the UDK found in it.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        try {
+            String text = newTesseract().doOCR(new File("book3.PNG"));
+            System.out.println(text);
+            String udk = findUDK(text);
+            System.out.println("UDK " + udk);
+        } catch (TesseractException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Runs Tesseract OCR with the {@code rus} language on an image.
+     *
+     * @param image image to recognize
+     * @return the recognized text
+     * @throws TesseractException if OCR fails
+     */
+    public static String getText(BufferedImage image) throws TesseractException {
+        return newTesseract().doOCR(image);
+    }
+
+    /**
+     * Finds the first UDK code in a text.
+     *
+     * @param text text to search; may be {@code null}
+     * @return the digits-and-dots code that follows "УДК", or {@code null} if there is none
+     */
+    public static String findUDK(String text) {
+        if (text == null) {
+            return null;
+        }
+        Matcher matcher = UDK_PATTERN.matcher(text);
+        return matcher.find() ? matcher.group(1) : null;
+    }
+
+    /**
+     * Tells whether a file name contains a run of at least three Cyrillic characters.
+     *
+     * @param fileName name to test; may be {@code null}
+     * @return {@code true} if such a run is present
+     */
+    public static boolean isCyrillic(String fileName) {
+        return fileName != null && CYRILLIC_RUN.matcher(fileName).find();
+    }
+
+    /** Creates a Tesseract engine configured for Russian text. */
+    private static Tesseract newTesseract() {
+        Tesseract tesseract = new Tesseract();
+        tesseract.setDatapath(System.getProperty("tessdata.path", DEFAULT_TESSDATA_PATH));
+        tesseract.setLanguage("rus");
+        return tesseract;
+    }
+}
diff --git 
a/src/main/java/ru/mcs/udk/OCRUtils.java b/src/main/java/ru/mcs/udk/OCRUtils.java
deleted file mode 100644
index 2e2f099..0000000
--- a/src/main/java/ru/mcs/udk/OCRUtils.java
+++ /dev/null
@@ -1,32 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.Tesseract;
-import net.sourceforge.tess4j.TesseractException;
-
-import java.awt.image.BufferedImage;
-import java.io.File;
-
-public class OCRUtils {
-    public static void main(String[] args) {
-        Tesseract tesseract = new Tesseract();
-        tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\");
-        tesseract.setLanguage("rus");
-
-        try {
-            String text = tesseract.doOCR(new File("book3.PNG"));
-            System.out.println(text);
-            String udk = PdfScanner.findUDK(text);
-            System.out.println("UDK " + udk);
-        } catch (TesseractException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public static String getText(BufferedImage image) throws TesseractException {
-        Tesseract tesseract = new Tesseract();
-        tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\");
-        tesseract.setLanguage("rus");
-
-        return tesseract.doOCR(image);
-    }
-}
diff --git a/src/main/java/ru/mcs/udk/PdfScanner.java b/src/main/java/ru/mcs/udk/PdfScanner.java
deleted file mode 100644
index 28cb343..0000000
--- a/src/main/java/ru/mcs/udk/PdfScanner.java
+++ /dev/null
@@ -1,112 +0,0 @@
-package ru.mcs.udk;
-
-import net.sourceforge.tess4j.TesseractException;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.text.PDFTextStripper;
-
-import java.awt.image.BufferedImage;
-import java.io.*;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.*;
-import java.nio.file.attribute.BasicFileAttributes;
-import java.util.Objects;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class PdfScanner {
-    public static void main(String[] args) throws FileNotFoundException {
-        if (args.length == 0) {
-            System.out.println("Пожалуйста, укажите путь к папке.");
-            return;
-        }
-
-        String directoryPath = args[0];
-        Path startDir = Paths.get(directoryPath);
-
-        if (!Files.exists(startDir) || !Files.isDirectory(startDir)) {
-            System.out.println("Указанный путь не существует или не является папкой.");
-            return;
-        }
-
-        System.setOut(new PrintStream(new FileOutputStream("book-list.csv"), true, StandardCharsets.UTF_8));
-        try {
-            Files.walkFileTree(startDir, new SimpleFileVisitor<>() {
-                @Override
-                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
-                    if (file.toAbsolutePath().toString().matches(".*\\.pdf$")) {
-                        // Выводим путь и имя файла
-                        getUdk(file.toAbsolutePath());
-                    }
-                    return FileVisitResult.CONTINUE;
-                }
-
-                @Override
-                public FileVisitResult visitFileFailed(Path file, IOException exc) {
-                    // Обрабатываем ошибки доступа к файлам
-                    System.err.println("Ошибка доступа к файлу: " + file.toAbsolutePath() + " - " + exc.getMessage());
-                    return FileVisitResult.CONTINUE;
-                }
-            });
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public static void getUdk(Path filePath) {
-        File file = filePath.toFile();
-        String languageBook = "";
-        try (PDDocument document = PDDocument.load(file)) {
-            PDFTextStripper stripper = new PDFTextStripper();
-            // Устанавливаем диапазон страниц для анализа (первые три страницы)
-            stripper.setStartPage(1);
-            stripper.setEndPage(Math.min(3, document.getNumberOfPages()));
-            String text = stripper.getText(document);
-            String udk = findUDK(text);
-
-            languageBook = isCyrillic(String.valueOf(filePath.getFileName())) ? "ru" : "en";
-
-            if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
-                udk = getUdkByImage(document);
-            }
-
-            System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
-        } catch (IOException e) {
-            System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse("", ""), languageBook, e.getMessage());
-        } catch (TesseractException e) {
-            e.printStackTrace();
-        }
-    }
-
-    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
-        PDFRenderer renderer = new PDFRenderer(document);
-
-        for (int pageIndex = 0; pageIndex < 5; pageIndex++) {
-            BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300);
-            String text = OCRUtils.getText(image);
-            String udk = findUDK(text);
-            if (udk != null) {
-                return udk;
-            }
-        }
-        return "";
-    }
-
-    public static String findUDK(String text) {
-        // Регулярное выражение для поиска УДК
-        Pattern pattern = Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9.]+)");
-        Matcher matcher = pattern.matcher(text);
-
-        if (matcher.find()) {
-            return matcher.group(1);
-        }
-        return null;
-    }
-
-    public static boolean isCyrillic(String fileName) {
-        String regex = ".*[\\p{IsCyrillic}]{3,}.*";
-        Pattern pattern = Pattern.compile(regex);
-        Matcher matcher = pattern.matcher(fileName);
-        return matcher.matches();
-    }
-}
diff --git a/src/main/java/ru/mcs/udk/UDKScanner.java b/src/main/java/ru/mcs/udk/UDKScanner.java
new file mode 100644
index 0000000..5b8e12b
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UDKScanner.java
@@ -0,0 +1,9 @@
+package ru.mcs.udk;
+
+import java.io.File;
+import java.io.IOException;
+/** Contract shared by the per-format scanners: process one document file and report its UDK code. */
+public interface UDKScanner {
+    /** Scans {@code file}; the signature lets an implementation throw on an I/O error or on interruption. */
+    void getUDK(File file) throws IOException, InterruptedException;
+}
diff --git a/src/main/java/ru/mcs/udk/UdkDjvuScanner.java b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
new file mode 100644
index 0000000..e4a1a10
--- /dev/null
+++ 
b/src/main/java/ru/mcs/udk/UdkDjvuScanner.java
@@ -0,0 +1,107 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+import static ru.mcs.udk.DocumentUtils.isCyrillic;
+
+/**
+ * Looks for the UDK code of a DjVu file by rendering its first pages with the external
+ * {@code ddjvu} tool and running OCR on them, then prints one CSV row for the file.
+ */
+public class UdkDjvuScanner implements UDKScanner {
+    /** Number of leading pages that are rendered and recognized. */
+    private static final int MAX_PAGES = 5;
+
+    /**
+     * Prints {@code path;udk;language;error} for the file to {@code System.out}.
+     *
+     * @param djvuFile DjVu file to scan
+     */
+    @Override
+    public void getUDK(File djvuFile) {
+        String languageBook = isCyrillic(djvuFile.getName()) ? "ru" : "en";
+        String udk = "";
+        String error = "";
+        try {
+            udk = getUdkByImage(djvuFile);
+        } catch (InterruptedException e) {
+            // Keep the interrupt flag set for the caller.
+            Thread.currentThread().interrupt();
+            error = "interrupted";
+        } catch (IOException | TesseractException e) {
+            // Report the failure in the row instead of aborting the whole scan.
+            error = String.valueOf(e.getMessage());
+        }
+        System.out.printf("%s;%s;%s;%s\n", djvuFile.getPath(), udk, languageBook, error);
+    }
+
+    /**
+     * Renders pages one by one and returns the first UDK code found by OCR.
+     *
+     * @param djvuFile DjVu file to render
+     * @return the UDK code, or an empty string if none was found
+     * @throws IOException if ddjvu cannot be started, fails on the first page, or a page image cannot be read
+     * @throws InterruptedException if the thread is interrupted while waiting for ddjvu
+     * @throws TesseractException if OCR fails
+     */
+    private static String getUdkByImage(File djvuFile)
+            throws IOException, InterruptedException, TesseractException {
+        // ddjvu numbers pages from 1.
+        for (int page = 1; page <= MAX_PAGES; page++) {
+            // Render into a temporary file so nothing is left in the working directory.
+            Path pageImage = Files.createTempFile("udk-djvu-page-", ".tiff");
+            try {
+                int exitCode = renderPage(djvuFile, page, pageImage);
+                if (exitCode != 0) {
+                    if (page == 1) {
+                        throw new IOException("ddjvu exited with code " + exitCode);
+                    }
+                    // Most likely the document has fewer pages than MAX_PAGES.
+                    return "";
+                }
+                BufferedImage image = ImageIO.read(pageImage.toFile());
+                if (image == null) {
+                    continue;
+                }
+                String udk = findUDK(DocumentUtils.getText(image));
+                if (udk != null) {
+                    return udk;
+                }
+            } finally {
+                Files.deleteIfExists(pageImage);
+            }
+        }
+        return "";
+    }
+
+    /**
+     * Runs {@code ddjvu} to render one page as TIFF, a format that both ddjvu and ImageIO support.
+     *
+     * @return the exit code of ddjvu
+     */
+    private static int renderPage(File djvuFile, int page, Path target) throws IOException, InterruptedException {
+        Process process = new ProcessBuilder("ddjvu",
+                "-format=tiff",
+                "-page=" + page,
+                djvuFile.getAbsolutePath(),
+                target.toString())
+                // Discard the tool's output so a full pipe can never block it.
+                .redirectErrorStream(true)
+                .redirectOutput(ProcessBuilder.Redirect.DISCARD)
+                .start();
+        try {
+            return process.waitFor();
+        } catch (InterruptedException e) {
+            process.destroy();
+            throw e;
+        }
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/UdkPdfScanner.java b/src/main/java/ru/mcs/udk/UdkPdfScanner.java
new file mode 100644
index 0000000..713d511
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/UdkPdfScanner.java
@@ -0,0 +1,79 @@
+package ru.mcs.udk;
+
+import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.util.Objects;
+
+import static ru.mcs.udk.DocumentUtils.findUDK;
+import static ru.mcs.udk.DocumentUtils.isCyrillic;
+
+/**
+ * Looks for the UDK code of a PDF file, first in its text layer and then, for files with a
+ * Cyrillic name, by OCR of the rendered pages; prints one CSV row for the file.
+ */
+public class UdkPdfScanner implements UDKScanner {
+    /** Number of leading pages whose text layer is searched. */
+    private static final int TEXT_PAGES = 3;
+
+    /** Number of leading pages rendered for OCR. */
+    private static final int OCR_PAGES = 5;
+
+    /** Resolution used when rendering a page for OCR. */
+    private static final int OCR_DPI = 300;
+
+    /**
+     * Prints {@code path;udk;language;error} for the file to {@code System.out}.
+     *
+     * @param file PDF file to scan
+     */
+    @Override
+    public void getUDK(File file) {
+        // Derived from the name only, so it is known even when the file cannot be opened.
+        String languageBook = isCyrillic(file.getName()) ? "ru" : "en";
+        try (PDDocument document = Loader.loadPDF(file)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            stripper.setStartPage(1);
+            stripper.setEndPage(Math.min(TEXT_PAGES, document.getNumberOfPages()));
+            String udk = findUDK(stripper.getText(document));
+
+            // OCR is only a fallback, and only for books with a Cyrillic name.
+            if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) {
+                udk = getUdkByImage(document);
+            }
+
+            System.out.printf("%s;%s;%s;%s\n", file.getPath(), Objects.requireNonNullElse(udk, ""), languageBook, "");
+        } catch (IOException | TesseractException e) {
+            // Every scanned file gets a row; the failure goes into the last column.
+            System.out.printf("%s;%s;%s;%s\n", file.getPath(), "", languageBook, e.getMessage());
+        }
+    }
+
+    /**
+     * Renders the first pages and returns the first UDK code found by OCR.
+     *
+     * @param document open PDF document
+     * @return the UDK code, or an empty string if none was found
+     * @throws IOException if a page cannot be rendered
+     * @throws TesseractException if OCR fails
+     */
+    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
+        PDFRenderer renderer = new PDFRenderer(document);
+        // Never ask for a page the document does not have.
+        int pageCount = Math.min(OCR_PAGES, document.getNumberOfPages());
+        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
+            BufferedImage image = renderer.renderImageWithDPI(pageIndex, OCR_DPI);
+            String udk = findUDK(DocumentUtils.getText(image));
+            if (udk != null) {
+                return udk;
+            }
+        }
+        return "";
+    }
+}