diff --git a/book2.PNG b/book2.PNG new file mode 100644 index 0000000..2606a0a --- /dev/null +++ b/book2.PNG Binary files differ diff --git a/book3.PNG b/book3.PNG new file mode 100644 index 0000000..327544f --- /dev/null +++ b/book3.PNG Binary files differ diff --git a/src/main/java/ru/mcs/udk/OCRTest.java b/src/main/java/ru/mcs/udk/OCRTest.java deleted file mode 100644 index 5ed4175..0000000 --- a/src/main/java/ru/mcs/udk/OCRTest.java +++ /dev/null @@ -1,23 +0,0 @@ -package ru.mcs.udk; - -import net.sourceforge.tess4j.Tesseract; -import net.sourceforge.tess4j.TesseractException; - -import java.io.File; - -public class OCRTest { - public static void main(String[] args) { - Tesseract tesseract = new Tesseract(); - tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\"); - tesseract.setLanguage("rus"); - - try { - String text = tesseract.doOCR(new File("book1.PNG")); - System.out.println(text); - String udk = PdfScanner.findUDK(text); - System.out.println("UDK " + udk); - } catch (TesseractException e) { - e.printStackTrace(); - } - } -} diff --git a/src/main/java/ru/mcs/udk/OCRUtils.java b/src/main/java/ru/mcs/udk/OCRUtils.java new file mode 100644 index 0000000..2e2f099 --- /dev/null +++ b/src/main/java/ru/mcs/udk/OCRUtils.java @@ -0,0 +1,32 @@ +package ru.mcs.udk; + +import net.sourceforge.tess4j.Tesseract; +import net.sourceforge.tess4j.TesseractException; + +import java.awt.image.BufferedImage; +import java.io.File; + +public class OCRUtils { + public static void main(String[] args) { + Tesseract tesseract = new Tesseract(); + tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\"); + tesseract.setLanguage("rus"); + + try { + String text = tesseract.doOCR(new File("book3.PNG")); + System.out.println(text); + String udk = PdfScanner.findUDK(text); + System.out.println("UDK " + udk); + } catch (TesseractException e) { + e.printStackTrace(); + } + } + + public static String getText(BufferedImage image) throws TesseractException { + Tesseract tesseract = new Tesseract(); + tesseract.setDatapath("d:\\program\\Tesseract-OCR\\tessdata\\"); + tesseract.setLanguage("rus"); + + return tesseract.doOCR(image); + } +} diff --git a/src/main/java/ru/mcs/udk/PdfScanner.java b/src/main/java/ru/mcs/udk/PdfScanner.java index 76d684e..28cb343 100644 --- a/src/main/java/ru/mcs/udk/PdfScanner.java +++ b/src/main/java/ru/mcs/udk/PdfScanner.java @@ -1,8 +1,11 @@ package ru.mcs.udk; +import net.sourceforge.tess4j.TesseractException; import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.text.PDFTextStripper; +import java.awt.image.BufferedImage; import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.*; @@ -58,18 +61,37 @@ // Устанавливаем диапазон страниц для анализа (первые три страницы) stripper.setStartPage(1); stripper.setEndPage(Math.min(3, document.getNumberOfPages())); - String text = stripper.getText(document); String udk = findUDK(text); languageBook = isCyrillic(String.valueOf(filePath.getFileName())) ? "ru" : "en"; - System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse(udk, ""), languageBook,""); + if ((udk == null || udk.isBlank()) && languageBook.equals("ru")) { + udk = getUdkByImage(document); + } + + System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse(udk, ""), languageBook, ""); } catch (IOException e) { System.out.printf("%s;%s;%s;%s\n", filePath.toAbsolutePath(), Objects.requireNonNullElse("", ""), languageBook, e.getMessage()); + } catch (TesseractException e) { + e.printStackTrace(); } } + private static String getUdkByImage(PDDocument document) throws IOException, TesseractException { + PDFRenderer renderer = new PDFRenderer(document); + + for (int pageIndex = 0; pageIndex < 5; pageIndex++) { + BufferedImage image = renderer.renderImageWithDPI(pageIndex, 300); + String text = OCRUtils.getText(image); + String udk = findUDK(text); + if (udk != null) { + return udk; + } + } + return ""; + } + public static String findUDK(String text) { // Регулярное выражение для поиска УДК Pattern pattern = Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9.]+)");