<?php
namespace boru\boruai\OCR\Methods;

use Imagick;
use ImagickException;

/**
 * OCR of a PDF document using Imagick (page rendering) and the `tesseract`
 * command-line tool (text recognition).
 *
 * Each page is rendered to a PNG in a per-instance scratch directory, then
 * passed to `tesseract`. Recognised text is cached per page, keyed by the
 * zero-based page index, and can be saved to / restored from a JSON file.
 */
class Tesseract {
    /** @var string Path of the PDF to OCR. */
    private $pdfPath;

    /** @var string Scratch directory for rendered page images (created on first use). */
    private $imageDir;

    /** @var mixed Cached OCR text keyed by zero-based page index (normally an array). */
    private $pageData = [];

    /**
     * @var bool True when $pageData is known to describe the whole document
     *           (loaded by the caller or produced by a full ocr() run), as
     *           opposed to holding only pages requested through ocrPage().
     */
    private $pageDataComplete = false;

    /** @var array<int, string> Rendered page image paths keyed by zero-based page index. */
    private $images = [];

    /**
     * @param string     $pdfPath  Path of the PDF document.
     * @param mixed|null $pageData Optional previously computed OCR data; when
     *                             truthy it is loaded as the cached result.
     */
    public function __construct($pdfPath,$pageData=null) {
        $this->pdfPath = $pdfPath;
        // Only the path is chosen here; the directory itself is created lazily
        // by pageImages(), so instances that only serve cached data never
        // touch the filesystem.
        $this->imageDir = sys_get_temp_dir() . '/ocr_pages_' . uniqid('', true);
        if ($pageData) {
            $this->loadPageData($pageData);
        }
    }

    /**
     * Remove the scratch images when the instance goes away.
     */
    public function __destruct() {
        $this->cleanup();
    }

    /**
     * Run OCR (or reuse cached data) and write the result to a JSON file
     * containing the keys "filename" and "pages".
     *
     * @param string $outfile Destination path.
     * @return bool True when the file was written, false otherwise (a message
     *              is echoed on failure).
     * @throws ImagickException
     */
    public function save($outfile) {
        $data = $this->ocr();
        if (!is_array($data)) {
            echo "Failed to perform OCR, nothing saved.\n";
            return false;
        }

        $object = [
            "filename" => $this->pdfPath,
            "pages"    => $data,
        ];
        $json = json_encode($object, JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES|JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // json_encode() returns false e.g. on malformed UTF-8 in the text;
            // writing that out would silently produce an empty file.
            echo "Failed to encode OCR data (" . json_last_error_msg() . "), nothing saved.\n";
            return false;
        }
        if (file_put_contents($outfile, $json) === false) {
            echo "Failed to write OCR data to {$outfile}, nothing saved.\n";
            return false;
        }
        return true;
    }

    /**
     * Perform OCR on the PDF document.
     *
     * Returns the cached data untouched when it is known to be complete and
     * $force is false. Otherwise every page image is processed; pages that
     * were already OCR'd individually via ocrPage() are reused unless $force
     * is set.
     *
     * @param bool $force Force re-OCR even if cached data exists
     * @return string|array<string|int, string>
     * @throws ImagickException
     */
    public function ocr($force=false) {
        if ($this->pageDataComplete && !$force) {
            return $this->pageData;
        }

        $cached = (!$force && is_array($this->pageData)) ? $this->pageData : [];
        $pages = [];
        foreach ($this->pageImages() as $i => $imagePath) {
            $pages[$i] = isset($cached[$i]) ? $cached[$i] : $this->ocrPageImage($imagePath);
        }

        $this->pageData = $pages;
        // An empty result is not treated as complete, so a later call will
        // try again instead of returning an empty cache.
        $this->pageDataComplete = !empty($pages);
        return $this->pageData;
    }

    /**
     * Replace the cached OCR data.
     *
     * Non-empty data is treated as the complete result for the document, so
     * ocr() will return it without rendering the PDF.
     *
     * @param mixed $pageData OCR text keyed by zero-based page index.
     */
    public function loadPageData($pageData=[]) {
        $this->pageData = $pageData;
        $this->pageDataComplete = !empty($pageData);
    }

    /**
     * Perform OCR on a specific page of the PDF document.
     *
     * @param mixed $pageNumber One-based page number.
     * @return string Cached or freshly recognised text, or
     *                "[OCR FAILED - PAGE MISSING]" when the page does not exist.
     * @throws ImagickException
     */
    public function ocrPage($pageNumber) {
        $index = $pageNumber - 1;
        if (isset($this->pageData[$index])) {
            return $this->pageData[$index];
        }
        $images = $this->pageImages();
        if (!isset($images[$index])) {
            return "[OCR FAILED - PAGE MISSING]";
        }
        // Caches only this page; $pageDataComplete is left unchanged so a
        // later ocr() still processes the remaining pages.
        $this->pageData[$index] = $this->ocrPageImage($images[$index]);
        return $this->pageData[$index];
    }

    /**
     * Render every page of the PDF to a PNG (300x300 resolution) in the
     * scratch directory. The result is memoised for later calls.
     *
     * @return array<int, string> Image paths keyed by zero-based page index.
     * @throws ImagickException
     * @throws \RuntimeException When the scratch directory cannot be created.
     */
    public function pageImages() {
        if (!empty($this->images)) {
            return $this->images;
        }
        $this->ensureImageDir();

        $imagick = new Imagick();
        // Resolution must be set before reading so the PDF is rasterised at it.
        $imagick->setResolution(300, 300);
        $imagick->readImage($this->pdfPath);

        // Collect into a local list and publish it only once every page has
        // been written; if Imagick throws midway, no partial list is memoised.
        $rendered = [];
        foreach ($imagick as $i => $page) {
            $path = "{$this->imageDir}/page_{$i}.png";
            $page->setImageFormat('png');
            $page->writeImage($path);
            $rendered[$i] = $path;
        }
        $imagick->clear();
        $imagick->destroy();

        $this->images = $rendered;
        return $this->images;
    }

    /**
     * Run the `tesseract` CLI (English language) on one image.
     *
     * @param string $imagePath Path of the page image.
     * @return string Recognised text, or "[OCR FAILED]" when no text could be
     *                obtained.
     */
    protected function ocrPageImage($imagePath) {
        $temp = tempnam(sys_get_temp_dir(), 'ocr');
        if ($temp === false) {
            return "[OCR FAILED]";
        }
        // The second argument is tesseract's output base; the text is read
        // back from "<base>.txt" below.
        $cmd = "tesseract " . escapeshellarg($imagePath) . " " . escapeshellarg($temp) . " -l eng";
        shell_exec($cmd . " 2>&1");
        $text = @file_get_contents($temp . ".txt");
        @unlink($temp);
        @unlink($temp . ".txt");
        // Explicit comparison: a bare truthiness test would also reject the
        // legitimate text "0".
        if ($text === false || $text === '') {
            return "[OCR FAILED]";
        }
        return $text;
    }

    /**
     * Delete the rendered page images and the scratch directory.
     */
    protected function cleanup() {
        if (!is_dir($this->imageDir)) {
            return;
        }
        // glob() may return false on error; treat that as "no files".
        $files = glob($this->imageDir . '/*.png');
        foreach ($files ?: [] as $file) {
            @unlink($file);
        }
        @rmdir($this->imageDir);
        // The memoised paths no longer exist; force a re-render if needed again.
        $this->images = [];
    }

    /**
     * Create the scratch directory if it does not exist yet.
     *
     * @throws \RuntimeException When the directory cannot be created.
     */
    private function ensureImageDir() {
        if (is_dir($this->imageDir)) {
            return;
        }
        if (!@mkdir($this->imageDir, 0777, true) && !is_dir($this->imageDir)) {
            throw new \RuntimeException("Failed to create temporary directory: {$this->imageDir}");
        }
    }

    /**
     * Restore an instance from a JSON file containing "filename" and "pages".
     *
     * @param string $tFile Path of the JSON file.
     * @return self
     * @throws \InvalidArgumentException When the file does not exist.
     * @throws \RuntimeException When the file cannot be read, is not valid
     *                           JSON, or lacks the expected keys.
     */
    public static function load($tFile) {
        if (!file_exists($tFile)) {
            throw new \InvalidArgumentException("File not found: $tFile");
        }
        $raw = file_get_contents($tFile);
        if ($raw === false) {
            throw new \RuntimeException("Failed to read file: $tFile");
        }
        $data = json_decode($raw, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \RuntimeException("Failed to parse JSON: " . json_last_error_msg());
        }
        if (!is_array($data) || !array_key_exists('filename', $data) || !array_key_exists('pages', $data)) {
            throw new \RuntimeException("Invalid OCR data file (expected \"filename\" and \"pages\"): $tFile");
        }
        return new self($data['filename'], $data['pages']);
    }

    /**
     * Wrap per-page OCR text in a delimited, human-readable frame:
     *
     *   [BEGIN DOCUMENT OCR OUTPUT]
     *
     *   [page 1]
     *   ...
     *   [end of page 1]
     *
     *   [page 2]
     *   ...
     *   [end of page 2]
     *
     *   [END DOCUMENT OCR OUTPUT]
     *
     * @param array<int, string> $arrayOfPages Page text keyed by zero-based
     *                                         page index; labels are key + 1.
     * @return string
     */
    public static function frame($arrayOfPages) {
        $output = "[BEGIN DOCUMENT OCR OUTPUT]\n";

        foreach($arrayOfPages as $i=>$pageData) {
            $label = $i + 1;
            $output .= "\n[page " . $label . "]\n";
            $output .= $pageData . "\n";
            $output .= "[end of page " . $label . "]\n";
        }

        $output .= "\n[END DOCUMENT OCR OUTPUT]";
        return $output;
    }
}