// Source: udk-pdf-scanner/src/main/java/ru/mcs/udk/utils/DocumentUtils.java
package ru.mcs.udk.utils;

import net.sourceforge.tess4j.Tesseract;
import net.sourceforge.tess4j.TesseractException;

import java.awt.image.BufferedImage;
import java.io.File;
import java.text.DecimalFormat;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for the UDK PDF scanner: access to a shared {@link Tesseract}
 * OCR engine, extraction of the "УДК" code from recognized text, and small
 * file-name / file-size utilities.
 */
public class DocumentUtils {
    /**
     * The letters "УДК" in either case, with optional whitespace between and
     * after them, followed by a code made of digits and dots (capture group 1).
     */
    private static final Pattern UDK_PATTERN =
            Pattern.compile("[Уу]\\s*[Дд]\\s*[Кк]\\s*([0-9.]+)");

    /** A run of at least three consecutive Cyrillic characters. */
    private static final Pattern CYRILLIC_RUN_PATTERN =
            Pattern.compile("[\\p{IsCyrillic}]{3,}");

    /** Private monitor guarding the one-time creation of {@link #tesseract}. */
    private static final Object TESSERACT_INIT_LOCK = new Object();

    /** Lazily created shared OCR engine; {@code null} until first requested. */
    private static volatile Tesseract tesseract;

    /**
     * Returns the shared {@link Tesseract} instance, creating and configuring
     * it on first use.
     *
     * <p>The instance is written to the volatile field only after it has been
     * fully configured, so a thread that observes a non-null field never
     * receives a half-initialized engine.
     *
     * <p>NOTE(review): every caller shares this one instance; confirm that
     * concurrent {@code doOCR} calls are safe for the tess4j version in use.
     *
     * @return the configured, shared OCR engine (never {@code null})
     */
    public static Tesseract getTesseract() {
        Tesseract localInstance = tesseract;
        if (localInstance == null) {
            synchronized (TESSERACT_INIT_LOCK) {
                localInstance = tesseract;
                if (localInstance == null) {
                    localInstance = new Tesseract();
                    // NOTE(review): relative path with Windows-style separators;
                    // confirm it resolves correctly on every deployment target.
                    localInstance.setDatapath("program\\tesseract\\tessdata\\");
                    localInstance.setLanguage("rus");
                    localInstance.setVariable("user_defined_dpi", "300");
                    // Publish last: other threads read the field without locking.
                    tesseract = localInstance;
                }
            }
        }
        return localInstance;
    }

    /**
     * Manual smoke test: runs OCR on {@code book3.PNG}, prints the recognized
     * text and the UDK code found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Tesseract ocrEngine = getTesseract();

        try {
            String text = ocrEngine.doOCR(new File("book3.PNG"));
            System.out.println(text);
            String udk = findUDK(text);
            System.out.println("UDK " + udk);
        } catch (TesseractException e) {
            System.err.println("Error: " + e.getMessage());
        }
    }

    /**
     * Runs OCR on the given image using the shared engine.
     *
     * @param image the image to recognize
     * @return the text returned by {@code Tesseract.doOCR}
     * @throws TesseractException if the OCR call fails
     */
    public static String getText(BufferedImage image) throws TesseractException {
        return getTesseract().doOCR(image);
    }

    /**
     * Finds the first UDK code in the text: the letters "УДК" (any case,
     * optionally separated by whitespace) followed by digits and dots.
     *
     * @param text the text to search; may be {@code null}
     * @return the digits-and-dots code that follows "УДК", or an empty string
     *         if {@code text} is {@code null} or contains no match
     */
    public static String findUDK(String text) {
        if (text == null) {
            return "";
        }
        Matcher matcher = UDK_PATTERN.matcher(text);
        return matcher.find() ? matcher.group(1) : "";
    }

    /**
     * Tells whether the name contains at least three consecutive Cyrillic
     * characters.
     *
     * <p>Uses {@code find()} rather than wrapping the pattern in {@code .*} and
     * calling {@code matches()}, because {@code .} does not match line
     * terminators and such a check would wrongly fail on input containing one.
     *
     * @param fileName the name to inspect; may be {@code null}
     * @return {@code true} if a run of three or more Cyrillic characters is
     *         present, {@code false} otherwise (including for {@code null})
     */
    public static boolean isCyrillic(String fileName) {
        if (fileName == null) {
            return false;
        }
        return CYRILLIC_RUN_PATTERN.matcher(fileName).find();
    }

    /**
     * Formats the length of a file in megabytes (bytes / 1024 / 1024) with at
     * most two fraction digits, followed by {@code " Mb"}.
     *
     * <p>The decimal separator follows the JVM default locale, as with any
     * {@code DecimalFormat} created from a pattern alone.
     *
     * @param file the file whose length is reported
     * @return the formatted size, e.g. {@code "1.5 Mb"}
     */
    public static String getSizeFile(File file) {
        double sizeInMB = (double) file.length() / (1024 * 1024); // size in megabytes
        // DecimalFormat is not thread-safe, so a fresh instance is created per call.
        DecimalFormat df = new DecimalFormat("#.##"); // at most two fraction digits

        return df.format(sizeInMB) + " Mb";
    }
}