// Source: udk-pdf-scanner/src/main/java/ru/mcs/udk/document/impl/DJVUScanner.java
package ru.mcs.udk.document.impl;

import net.sourceforge.tess4j.TesseractException;
import ru.mcs.udk.wrapper.DocumentInfo;
import ru.mcs.udk.document.DocumentScanner;
import ru.mcs.udk.utils.DocumentUtils;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;

import static ru.mcs.udk.utils.DocumentUtils.findUDK;

/**
 * {@link DocumentScanner} implementation for DjVu files.
 *
 * <p>Each of the first {@value #MAX_PAGES} pages is rendered to a TIFF image with the external
 * {@code ddjvu} command, the image is passed to {@link DocumentUtils#getText}, and the resulting
 * text is searched with {@code DocumentUtils.findUDK}. Scanning stops at the first page that
 * yields a non-empty UDK value.
 */
public class DJVUScanner implements DocumentScanner {
    /** Number of leading pages that are rendered and searched. */
    private static final int MAX_PAGES = 6;

    /** Directory (relative to the working directory) that receives the rendered pages. */
    private static final String TEMP_DIR = "temp";

    /**
     * Searches the first pages of a DjVu file for a UDK value.
     *
     * @param djvuFile the DjVu file to scan
     * @return a {@link DocumentInfo} whose UDK is the value found, or {@code ""} when none was
     *         found; its language is set to {@code "ru"} on success, its error holds the
     *         description of the last failure ({@code ""} when there was none), and its time is
     *         the elapsed wall-clock time in milliseconds
     */
    @Override
    public DocumentInfo getUDK(File djvuFile) {
        // Record when the search started so the elapsed time can be reported.
        long startTime = System.currentTimeMillis();
        DocumentInfo documentInfo = new DocumentInfo();
        documentInfo.setError("");
        documentInfo.setUdk("");

        // Make sure the output directory exists before ddjvu is asked to write into it.
        File tempDir = new File(TEMP_DIR);
        if (!tempDir.isDirectory() && !tempDir.mkdirs() && !tempDir.isDirectory()) {
            documentInfo.setError("Cannot create temporary directory: " + tempDir.getAbsolutePath());
            documentInfo.setTime(System.currentTimeMillis() - startTime);
            return documentInfo;
        }

        for (int pageIndex = 1; pageIndex <= MAX_PAGES; pageIndex++) {
            String outputFile = String.format("temp/page_%d.tiff", pageIndex);
            try {
                String udk = scanPage(djvuFile, pageIndex, outputFile);
                // Only a real hit replaces the "" default; a null/empty result is skipped.
                if (udk != null && !udk.isEmpty()) {
                    documentInfo.setUdk(udk);
                    documentInfo.setLanguage("ru");
                    break;
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag for the caller and stop scanning further pages.
                Thread.currentThread().interrupt();
                documentInfo.setError(describe(e));
                break;
            } catch (IOException | TesseractException e) {
                // Best effort: remember the failure and try the next page.
                documentInfo.setError(describe(e));
            } finally {
                // The rendered page is only an intermediate artifact; a failed delete is harmless.
                new File(outputFile).delete();
            }
        }

        long endTime = System.currentTimeMillis();
        documentInfo.setTime(endTime - startTime);
        return documentInfo;
    }

    /**
     * Renders one page with {@code ddjvu} and searches its recognized text.
     *
     * @return the result of {@code findUDK} for the page, or {@code null} when {@code ddjvu}
     *         exited with a non-zero code
     */
    private static String scanPage(File djvuFile, int pageIndex, String outputFile)
            throws IOException, InterruptedException, TesseractException {
        // NOTE(review): the child's stdout/stderr pipes are not drained; this assumes ddjvu
        // writes little enough output not to fill them — confirm.
        Process process = new ProcessBuilder("ddjvu",
                "-format=tiff",
                "-quality=90",
                "-page=" + pageIndex,
                djvuFile.getAbsolutePath(),
                outputFile).start();
        int exitCode;
        try {
            exitCode = process.waitFor();
        } catch (InterruptedException e) {
            // Do not leave the renderer running once nobody is waiting for it.
            process.destroy();
            throw e;
        }
        if (exitCode != 0) {
            return null;
        }

        // ImageIO.read returns null when no registered reader can decode the file.
        BufferedImage image = ImageIO.read(new File(outputFile));
        if (image == null) {
            throw new IOException("Cannot decode rendered page image: " + outputFile);
        }
        String text = DocumentUtils.getText(image);
        return findUDK(text);
    }

    /** Returns the exception message, falling back to {@code toString()} when it is null. */
    private static String describe(Exception e) {
        String message = e.getMessage();
        return message != null ? message : e.toString();
    }
}