diff --git a/src/main/java/ru/mcs/udk/scanner/DocumentScanner.java b/src/main/java/ru/mcs/udk/scanner/DocumentScanner.java
new file mode 100644
index 0000000..d14bb29
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/scanner/DocumentScanner.java
@@ -0,0 +1,19 @@
+package ru.mcs.udk.scanner;
+
+import ru.mcs.udk.wrapper.DocumentInfo;
+
+import java.io.File;
+
+/**
+ * Looks for a UDK code in a single document file.
+ */
+public interface DocumentScanner {
+
+    /**
+     * Scans the given file and reports the outcome.
+     *
+     * @param file document to scan
+     * @return the scan result, as populated by the implementation
+     */
+    DocumentInfo getUDK(File file);
+}
diff --git a/src/main/java/ru/mcs/udk/scanner/impl/DJVUScanner.java b/src/main/java/ru/mcs/udk/scanner/impl/DJVUScanner.java
new file mode 100644
index 0000000..9b4caf9
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/scanner/impl/DJVUScanner.java
@@ -0,0 +1,131 @@
+package ru.mcs.udk.scanner.impl;
+
+import net.sourceforge.tess4j.TesseractException;
+import ru.mcs.udk.scanner.DocumentScanner;
+import ru.mcs.udk.utils.DocumentUtils;
+import ru.mcs.udk.wrapper.DocumentFormat;
+import ru.mcs.udk.wrapper.DocumentInfo;
+
+import javax.imageio.ImageIO;
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+
+import static ru.mcs.udk.utils.DocumentUtils.findUDK;
+
+/**
+ * Scanner for DjVu files: converts the leading pages to TIFF images with the external
+ * {@code ddjvu} command and runs each image through {@code DocumentUtils.getText} and
+ * {@code findUDK} until a UDK is found.
+ */
+public class DJVUScanner implements DocumentScanner {
+
+    /** Pages 1 to MAX_PAGES are converted and searched. */
+    private static final int MAX_PAGES = 6;
+
+    /** Output name template passed to {@code ddjvu -eachpage}; {@code %d} is the page number. */
+    private static final String PAGE_FILE_TEMPLATE = "page_djvu_%d.tiff";
+
+    /**
+     * Searches the leading pages of a DjVu file for a UDK.
+     *
+     * @param djvuFile DjVu document to scan
+     * @return result tagged {@code DJVU} with the elapsed time in milliseconds; the UDK and the
+     *         error message are empty strings unless a UDK was found or a failure occurred
+     */
+    @Override
+    public DocumentInfo getUDK(File djvuFile) {
+        // Remember when the search started so the elapsed time can be reported.
+        long startTime = System.currentTimeMillis();
+        DocumentInfo documentInfo = new DocumentInfo();
+        documentInfo.setError("");
+        documentInfo.setUdk("");
+        documentInfo.setDocumentFormat(DocumentFormat.DJVU);
+
+        File workDir = null;
+        Process process = null;
+        try {
+            // One scratch directory per scan: fixed names in the working directory collide
+            // between concurrent scans and let pages left by an earlier document be picked up.
+            workDir = Files.createTempDirectory("udk_djvu_").toFile();
+            process = new ProcessBuilder("ddjvu",
+                    "-format=tiff",
+                    "-quality=90",
+                    "-page=1-" + MAX_PAGES,
+                    "-eachpage",
+                    djvuFile.getAbsolutePath(),
+                    new File(workDir, PAGE_FILE_TEMPLATE).getAbsolutePath())
+                    // Nothing reads the tool's output; discard it so a full pipe cannot stall it.
+                    .redirectErrorStream(true)
+                    .redirectOutput(ProcessBuilder.Redirect.DISCARD)
+                    .start();
+            int exitCode = process.waitFor();
+            if (exitCode == 0) {
+                String udk = findUdkInPages(workDir);
+                if (!udk.isEmpty()) {
+                    documentInfo.setUdk(udk);
+                    documentInfo.setLanguage("ru");
+                }
+            } else {
+                // Report a failed conversion instead of treating it as "no UDK found".
+                documentInfo.setError("ddjvu exited with code " + exitCode);
+            }
+        } catch (InterruptedException e) {
+            // Restore the interrupt flag for the caller and stop the converter.
+            Thread.currentThread().interrupt();
+            if (process != null) {
+                process.destroy();
+            }
+            documentInfo.setError(describe(e));
+        } catch (IOException | TesseractException e) {
+            documentInfo.setError(describe(e));
+        } finally {
+            deleteQuietly(workDir);
+        }
+        long endTime = System.currentTimeMillis();
+        documentInfo.setTime(endTime - startTime);
+        return documentInfo;
+    }
+
+    /** Recognises the converted pages in order and returns the first UDK found, or "". */
+    private static String findUdkInPages(File workDir) throws IOException, TesseractException {
+        for (int page = 1; page <= MAX_PAGES; page++) {
+            String name = PAGE_FILE_TEMPLATE.replace("%d", Integer.toString(page));
+            File pageFile = new File(workDir, name);
+            if (!pageFile.isFile()) {
+                // No image was produced for this page, e.g. the document is shorter.
+                continue;
+            }
+            BufferedImage image = ImageIO.read(pageFile);
+            if (image == null) {
+                // ImageIO found no reader for the file; there is nothing to recognise.
+                continue;
+            }
+            String udk = findUDK(DocumentUtils.getText(image));
+            if (udk != null && !udk.isEmpty()) {
+                return udk;
+            }
+        }
+        return "";
+    }
+
+    /** Best-effort removal of the scratch directory and the page images in it. */
+    private static void deleteQuietly(File dir) {
+        if (dir == null) {
+            return;
+        }
+        File[] children = dir.listFiles();
+        if (children != null) {
+            for (File child : children) {
+                child.delete();
+            }
+        }
+        dir.delete();
+    }
+
+    /** Exception messages may be null; fall back to the exception's string form then. */
+    private static String describe(Exception e) {
+        return e.getMessage() != null ? e.getMessage() : e.toString();
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/scanner/impl/PDFScanner.java b/src/main/java/ru/mcs/udk/scanner/impl/PDFScanner.java
new file mode 100644
index 0000000..9f0e248
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/scanner/impl/PDFScanner.java
@@ -0,0 +1,110 @@
+package ru.mcs.udk.scanner.impl;
+
+import net.sourceforge.tess4j.TesseractException;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import ru.mcs.udk.wrapper.DocumentFormat;
+import ru.mcs.udk.wrapper.DocumentInfo;
+import ru.mcs.udk.scanner.DocumentScanner;
+import ru.mcs.udk.utils.DocumentUtils;
+
+import java.awt.image.BufferedImage;
+import java.io.File;
+import java.io.IOException;
+import java.text.DecimalFormat;
+
+import static ru.mcs.udk.utils.DocumentUtils.findUDK;
+import static ru.mcs.udk.utils.DocumentUtils.isCyrillic;
+
+/**
+ * Scanner for PDF files: searches the extracted text of the leading pages first and, for
+ * documents tagged "ru", falls back to recognising rendered page images.
+ */
+public class PDFScanner implements DocumentScanner {
+
+    /** Number of leading pages whose extracted text is searched. */
+    private static final int TEXT_PAGES = 3;
+
+    /** Number of leading pages rendered for the image-based fallback. */
+    private static final int IMAGE_PAGES = 6;
+
+    /** Resolution, in DPI, used when rendering pages for recognition. */
+    private static final int RENDER_DPI = 300;
+
+    /**
+     * Searches a PDF file for a UDK.
+     *
+     * @param file PDF document to scan
+     * @return result tagged {@code PDF} with the formatted file size and the elapsed time in
+     *         milliseconds; the language is "ru" when {@code isCyrillic} accepts the file name
+     *         and "en" otherwise; the UDK and the error message are empty strings unless a UDK
+     *         was found or a failure occurred
+     */
+    @Override
+    public DocumentInfo getUDK(File file) {
+        // Remember when the search started so the elapsed time can be reported.
+        long startTime = System.currentTimeMillis();
+        DocumentInfo documentInfo = new DocumentInfo();
+        documentInfo.setError("");
+        documentInfo.setUdk("");
+        documentInfo.setDocumentFormat(DocumentFormat.PDF);
+        documentInfo.setFileSize(getSizeFile(file));
+        try (PDDocument document = Loader.loadPDF(file)) {
+            PDFTextStripper stripper = new PDFTextStripper();
+            // Restrict text extraction to the leading pages.
+            stripper.setStartPage(1);
+            stripper.setEndPage(Math.min(TEXT_PAGES, document.getNumberOfPages()));
+            String udk = findUDK(stripper.getText(document));
+            // Keep "no UDK" an empty string, matching the initial value, rather than null.
+            documentInfo.setUdk(udk == null ? "" : udk);
+
+            String language = isCyrillic(file.getName()) ? "ru" : "en";
+            documentInfo.setLanguage(language);
+
+            if ((udk == null || udk.isBlank()) && "ru".equals(language)) {
+                documentInfo.setUdk(getUdkByImage(document));
+            }
+        } catch (IOException | TesseractException e) {
+            documentInfo.setError(describe(e));
+        }
+        long endTime = System.currentTimeMillis();
+        documentInfo.setTime(endTime - startTime);
+        return documentInfo;
+    }
+
+    /** Formats the file length in megabytes with up to two fraction digits plus " Mb". */
+    private String getSizeFile(File file) {
+        double sizeInMB = (double) file.length() / (1024 * 1024); // size in megabytes
+        DecimalFormat df = new DecimalFormat("#.##"); // up to two digits after the separator
+
+        return df.format(sizeInMB) + " Mb";
+    }
+
+    /**
+     * Renders the leading pages one by one and returns the first UDK recognised on them.
+     *
+     * @return the first non-blank UDK, or an empty string when none was found
+     */
+    private static String getUdkByImage(PDDocument document) throws IOException, TesseractException {
+        PDFRenderer renderer = new PDFRenderer(document);
+        // Never request a page index the document does not have.
+        int pageCount = Math.min(IMAGE_PAGES, document.getNumberOfPages());
+
+        for (int pageIndex = 0; pageIndex < pageCount; pageIndex++) {
+            BufferedImage image = renderer.renderImageWithDPI(pageIndex, RENDER_DPI);
+            String udk = findUDK(DocumentUtils.getText(image));
+            // A null or blank result means "not on this page": keep looking.
+            if (udk != null && !udk.isBlank()) {
+                return udk;
+            }
+        }
+        return "";
+    }
+
+    /** Exception messages may be null; fall back to the exception's string form then. */
+    private static String describe(Exception e) {
+        return e.getMessage() != null ? e.getMessage() : e.toString();
+    }
+}
diff --git a/src/main/java/ru/mcs/udk/wrapper/DocumentFormat.java b/src/main/java/ru/mcs/udk/wrapper/DocumentFormat.java
new file mode 100644
index 0000000..0d88202
--- /dev/null
+++ b/src/main/java/ru/mcs/udk/wrapper/DocumentFormat.java
@@ -0,0 +1,9 @@
+package ru.mcs.udk.wrapper;
+
+/**
+ * Document formats a scan result can be tagged with.
+ */
+public enum DocumentFormat {
+    PDF,
+    DJVU
+}
diff --git a/target/classes/ru/mcs/udk/UDKSearcher.class b/target/classes/ru/mcs/udk/UDKSearcher.class
new file mode 100644
index 0000000..42a3fcf
--- /dev/null
+++ b/target/classes/ru/mcs/udk/UDKSearcher.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/factory/DocumentScannerFactory.class b/target/classes/ru/mcs/udk/factory/DocumentScannerFactory.class
new file mode 100644
index 0000000..9520b02
--- /dev/null
+++ b/target/classes/ru/mcs/udk/factory/DocumentScannerFactory.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/scanner/DocumentScanner.class b/target/classes/ru/mcs/udk/scanner/DocumentScanner.class
new file mode 100644
index 0000000..bf899c6
--- /dev/null
+++ b/target/classes/ru/mcs/udk/scanner/DocumentScanner.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/scanner/impl/DJVUScanner.class b/target/classes/ru/mcs/udk/scanner/impl/DJVUScanner.class
new file mode 100644
index 0000000..12754ba
--- /dev/null
+++ b/target/classes/ru/mcs/udk/scanner/impl/DJVUScanner.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/scanner/impl/PDFScanner.class b/target/classes/ru/mcs/udk/scanner/impl/PDFScanner.class
new file mode 100644
index 0000000..80bdba1
--- /dev/null
+++ b/target/classes/ru/mcs/udk/scanner/impl/PDFScanner.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/utils/DocumentUtils.class b/target/classes/ru/mcs/udk/utils/DocumentUtils.class
new file mode 100644
index 0000000..d27049a
--- /dev/null
+++ b/target/classes/ru/mcs/udk/utils/DocumentUtils.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/wrapper/DocumentFormat.class b/target/classes/ru/mcs/udk/wrapper/DocumentFormat.class
new file mode 100644
index 0000000..c6f8429
--- /dev/null
+++ b/target/classes/ru/mcs/udk/wrapper/DocumentFormat.class
Binary files differ
diff --git a/target/classes/ru/mcs/udk/wrapper/DocumentInfo.class b/target/classes/ru/mcs/udk/wrapper/DocumentInfo.class
new file mode 100644
index 0000000..c47a99d
--- /dev/null
+++ b/target/classes/ru/mcs/udk/wrapper/DocumentInfo.class
Binary files differ