<?php
namespace boru\boruai\OCR\Methods;

use boru\boruai\OCR\Contract\PageImageProviderInterface;
use boru\boruai\OCR\Tesseract\ImagickPdfPageImageProvider;
use boru\boruai\OCR\Tesseract\MuPdfPdfPageImageProvider;
use boru\boruai\OCR\Tesseract\VipsPdfPageImageProvider;
use ImagickException;

/**
 * OCR driver that runs the `tesseract` command-line tool over page images
 * produced from a PDF by a PageImageProviderInterface implementation.
 *
 * Results are cached per page (0-based index) in memory and can be persisted
 * to / restored from a JSON file via save() and load().
 */
class Tesseract
{

    /** @var bool Global default for TSV mode; used when an instance has no explicit setting. */
    public static $useTsv = false;

    /** @var string Path of the PDF passed to the constructor. */
    private $pdfPath;

    /** @var string Temporary directory handed to the image provider. */
    private $imageDir;

    /** @var bool|null Per-instance TSV mode; null until resolved by tsvMode(). */
    private $tsvMode;

    /** @var bool When true, ocr() returns a single framed string instead of the page array. */
    private $frameOutput = true;

    /** @var bool When true, output() echoes diagnostic messages. */
    private $verbose = false;

    /** @var array<int,string> Cached OCR result per page, keyed by 0-based page index. */
    private $pageData = [];

    /** @var PageImageProviderInterface */
    private $imageProvider;

    /** @var array Default command-line options appended to every tesseract invocation. */
    private $tesseractOpts = [
        "-l"   => "eng",
        "--oem" => "1",
        "--psm" => "4",
        "-c"   => "preserve_interword_spaces=1",
    ];

    /**
     * Enable or disable diagnostic output.
     *
     * @param bool $verbose
     */
    public function setVerbose($verbose = true)
    {
        $this->verbose = $verbose;
    }

    /**
     * Echo a diagnostic value followed by a newline, but only in verbose mode.
     * Arrays and objects are rendered with print_r().
     *
     * @param mixed $variable
     */
    public function output($variable)
    {
        if (!$this->verbose) {
            return;
        }
        if (is_array($variable) || is_object($variable)) {
            $rendered = print_r($variable, true);
        } else {
            $rendered = (string)$variable;
        }
        echo $rendered . "\n";
    }

    /**
     * @param string $pdfPath Path to the PDF to OCR.
     * @param array  $options Either an associative options array (see setOptions(); the
     *                        tile/image/vips keys below are forwarded to the image provider,
     *                        and 'imageProvider' may carry a ready-made provider instance),
     *                        or a plain list of cached page data (backwards compatibility).
     */
    public function __construct($pdfPath, $options = [])
    {
        $this->pdfPath = $pdfPath;
        $this->imageDir = sys_get_temp_dir() . '/ocr_pages_' . uniqid();
        mkdir($this->imageDir, 0777, true);

        if (!is_array($options)) {
            $options = [];
        }

        // Configure local options (tsvMode, frameOutput, tesseract options) first
        $this->setOptions($options);
        // Force a tsvMode setting: an explicit one is kept, otherwise the global default is used.
        $this->tsvMode();

        // Build the subset of options forwarded to the image provider.
        // A key that is absent (or null) in $options falls back to the default listed here.
        $providerDefaults = [
            'tileDpi'            => 800,
            'tilePixelThreshold' => 0,
            'tileCols'           => 2,
            'tileRows'           => 2,
            'tileOverlap'        => 20,
            'mutoolBandHeight'   => 0,
            'imageDpi'           => 400,
            'vipsAutoTile'       => true,
            'vipsTileSize'       => 1024,
        ];
        $imageProviderOptions = ['verbose' => $this->verbose];
        foreach ($providerDefaults as $name => $default) {
            $imageProviderOptions[$name] = isset($options[$name]) ? $options[$name] : $default;
        }

        // A caller-supplied provider always wins.
        if (isset($options['imageProvider']) && $options['imageProvider'] instanceof PageImageProviderInterface) {
            $this->imageProvider = $options['imageProvider'];
            return;
        }

        // Otherwise pick the first available backend: VIPS, then MuPDF, then Imagick.
        if (class_exists(VipsPdfPageImageProvider::class) && VipsPdfPageImageProvider::isAvailable()) {
            $this->output("Using VIPS + MuPDF for PDF rendering & tiling.");
            $this->imageProvider = new VipsPdfPageImageProvider($pdfPath, $this->imageDir, $imageProviderOptions);
        } elseif (class_exists(MuPdfPdfPageImageProvider::class) && MuPdfPdfPageImageProvider::isAvailable()) {
            $this->output("Using MuPDF for PDF rendering.");
            $this->imageProvider = new MuPdfPdfPageImageProvider($pdfPath, $this->imageDir, $imageProviderOptions);
        } else {
            $this->output("Using Imagick for PDF rendering.");
            $this->imageProvider = new ImagickPdfPageImageProvider($pdfPath, $this->imageDir, $imageProviderOptions);
        }
    }

    /**
     * Delegates to cleanup() when the object is destroyed.
     */
    public function __destruct()
    {
        $this->cleanup();
    }

    /**
     * Apply instance options.
     *
     * Recognised keys: 'verbose', 'frameOutput', 'pageData', 'tsvMode' (or 'tsv'),
     * and 'opts' (array merged into the tesseract command-line options).
     * A non-associative list is treated as page data for backwards compatibility.
     *
     * @param array $arrayOptions
     */
    public function setOptions($arrayOptions = [])
    {
        if (!is_array($arrayOptions)) {
            return;
        }
        if (count($arrayOptions) == 0) {
            return;
        }
        // Verbosity is applied first so the dump below honours it.
        if (isset($arrayOptions['verbose'])) {
            $this->setVerbose($arrayOptions['verbose'] ? true : false);
        }

        $this->output("Setting Tesseract options:");
        $this->output($arrayOptions);

        // If non-associative array, assume it's pageData (backwards compatibility)
        if (!empty($arrayOptions) && array_keys($arrayOptions) === range(0, count($arrayOptions) - 1)) {
            $this->loadPageData($arrayOptions);
            return;
        }

        if (isset($arrayOptions['frameOutput'])) {
            $this->frameOutput = $arrayOptions['frameOutput'] ? true : false;
        }

        if (isset($arrayOptions['pageData'])) {
            $this->loadPageData($arrayOptions['pageData']);
        }

        // 'tsvMode' takes precedence over the shorter 'tsv' alias.
        if (isset($arrayOptions['tsvMode'])) {
            $this->tsvMode = $arrayOptions['tsvMode'] ? true : false;
        } elseif (isset($arrayOptions["tsv"])) {
            $this->tsvMode = $arrayOptions["tsv"] ? true : false;
        }

        if (isset($arrayOptions['opts']) && is_array($arrayOptions['opts'])) {
            $this->tesseractOpts = array_merge($this->tesseractOpts, $arrayOptions['opts']);
        }
    }

    /**
     * Merge extra tesseract command-line options into the current set.
     *
     * @param array $opts  Options keyed by flag (e.g. "-l" => "eng"); numeric keys emit the value only.
     * @param bool  $reset When true, discard the current options before merging.
     */
    public function setCommandOpts($opts = [], $reset = false)
    {
        if ($reset) {
            $this->tesseractOpts = [];
        }
        // Ignore non-array input instead of corrupting the option set.
        if (!is_array($opts)) {
            return;
        }
        $this->tesseractOpts = array_merge($this->tesseractOpts, $opts);
    }

    /**
     * Run OCR (or reuse cached page data) and write the result as JSON to $outfile.
     *
     * The JSON object has the keys "filename" (the PDF path) and "pages".
     *
     * @param string $outfile Destination path.
     * @return bool True when the file was written, false when OCR, encoding or writing failed.
     */
    public function save($outfile)
    {
        // Ensure we get the raw page array, not the framed string; always restore the
        // caller's setting, even if ocr() throws.
        $prevFrameOutput = $this->frameOutput;
        $this->frameOutput = false;
        try {
            $data = $this->ocr();
        } finally {
            $this->frameOutput = $prevFrameOutput;
        }

        if (!is_array($data)) {
            $this->output("Failed to perform OCR, nothing saved.");
            return false;
        }

        $object = [
            "filename" => $this->pdfPath,
            "pages"    => $data,
        ];
        $json = json_encode($object, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // e.g. OCR text containing invalid UTF-8
            $this->output("Failed to encode OCR data as JSON: " . json_last_error_msg());
            return false;
        }
        if (file_put_contents($outfile, $json) === false) {
            $this->output("Failed to write OCR data to " . $outfile);
            return false;
        }
        return true;
    }

    /**
     * Perform OCR on the PDF document.
     *
     * Cached page data is returned as-is unless $force is set.
     *
     * @param bool $force Force re-OCR even if cached data exists
     * @return array<int,string>|string Framed string when frameOutput is on, otherwise the page array.
     * @throws ImagickException
     */
    public function ocr($force = false)
    {
        if ($force || empty($this->pageData)) {
            $images = $this->pageImages();
            $this->pageData = [];
            foreach ($images as $i => $imageEntry) {
                $this->pageData[$i] = $this->ocrImageEntry($imageEntry);
            }
        }

        return $this->frameOutput ? self::frame($this->pageData) : $this->pageData;
    }

    /**
     * Replace the cached page data.
     *
     * @param array<int,string> $pageData OCR text keyed by 0-based page index.
     */
    public function loadPageData($pageData = [])
    {
        $this->pageData = $pageData;
    }

    /**
     * Resolve and return the TSV mode for this instance, falling back to the
     * global self::$useTsv the first time it is requested without an explicit setting.
     *
     * @return bool
     */
    public function tsvMode()
    {
        if ($this->tsvMode === null) {
            $this->tsvMode = self::$useTsv;
        }
        return $this->tsvMode;
    }

    /**
     * Perform OCR on a specific page of the PDF document (1-based).
     *
     * @param int $pageNumber
     * @return string Cached or freshly produced text, or "[OCR FAILED - PAGE MISSING]"
     *                when the provider has no image for that page.
     * @throws ImagickException
     */
    public function ocrPage($pageNumber)
    {
        $idx = $pageNumber - 1;
        if (isset($this->pageData[$idx])) {
            return $this->pageData[$idx];
        }

        $images = $this->pageImages();
        if (!isset($images[$idx])) {
            return "[OCR FAILED - PAGE MISSING]";
        }

        $this->pageData[$idx] = $this->ocrImageEntry($images[$idx]);

        return $this->pageData[$idx];
    }

    /**
     * OCR one entry returned by the image provider: an array is a tiled page,
     * anything else is the path of a single full-page image.
     *
     * @param string|array $imageEntry
     * @return string
     */
    private function ocrImageEntry($imageEntry)
    {
        return is_array($imageEntry)
            ? $this->ocrTiledPage($imageEntry)
            : $this->ocrPageImage($imageEntry);
    }

    /**
     * Thin wrapper: delegate to image provider.
     *
     * @return array<int, string|array<int,array{path:string,offset_x:int,offset_y:int}>>
     * @throws ImagickException
     */
    protected function pageImages()
    {
        return $this->imageProvider->getPageImages();
    }

    /**
     * OCR a single image file (page or tile).
     *
     * tesseract is given a temp file name as output base and writes "<base>.txt"
     * (or "<base>.tsv" in TSV mode); that file is read back and both files removed.
     *
     * @param string $imagePath
     * @return string Recognised text, or "[OCR FAILED]" when no output could be read.
     */
    protected function ocrPageImage($imagePath)
    {
        $temp = tempnam(sys_get_temp_dir(), 'ocr');
        if ($temp === false) {
            return "[OCR FAILED]";
        }
        $tempRead = $temp . ($this->tsvMode() ? ".tsv" : ".txt");

        $cmd = "tesseract " . escapeshellarg($imagePath) . " " . escapeshellarg($temp) . " " . $this->makeOptsString();
        $this->output("[CMD] > " . $cmd);

        shell_exec($cmd . " 2>&1");

        $text = @file_get_contents($tempRead);
        @unlink($temp);
        @unlink($tempRead);

        // Only a missing or empty output file counts as failure; a page whose text
        // is literally "0" is a valid result.
        if ($text === false || $text === '') {
            return "[OCR FAILED]";
        }
        return $text;
    }

    /**
     * OCR a page composed of multiple tiles and stitch the results.
     *
     * Plain-text mode concatenates the tile texts with newlines. TSV mode keeps
     * the first tile's header row and shifts each row's 'left' / 'top' columns
     * by the tile's offset so coordinates are relative to the full page.
     *
     * @param array<int,array{path:string,offset_x:int,offset_y:int}> $tiles
     * @return string
     */
    protected function ocrTiledPage(array $tiles)
    {
        if (!$this->tsvMode()) {
            // Simple concatenation for plain text mode
            $parts = [];
            foreach ($tiles as $tile) {
                $parts[] = $this->ocrPageImage($tile['path']);
            }
            return implode("\n", $parts);
        }

        // TSV mode stitching
        $globalHeader = null;
        $globalLines  = [];
        // Column positions are resolved per call from this page's header row
        // (not cached across calls/instances).
        $leftIdx = false;
        $topIdx  = false;

        foreach ($tiles as $tile) {
            $tileTSV = $this->ocrPageImage($tile['path']);
            if (!$tileTSV || strpos($tileTSV, "[OCR FAILED]") === 0) {
                continue;
            }

            $lines = preg_split("/\r\n|\r|\n/", trim($tileTSV));
            if (empty($lines)) {
                continue;
            }

            $header = array_shift($lines);
            if ($globalHeader === null) {
                $globalHeader = $header;
                $headerCols = explode("\t", $header);
                $leftIdx = array_search('left', $headerCols, true);
                $topIdx  = array_search('top', $headerCols, true);
            }

            $offsetX = isset($tile['offset_x']) ? (int)$tile['offset_x'] : 0;
            $offsetY = isset($tile['offset_y']) ? (int)$tile['offset_y'] : 0;

            foreach ($lines as $line) {
                if ($line === '') {
                    continue;
                }

                // Without both coordinate columns there is nothing to shift.
                if ($leftIdx === false || $topIdx === false) {
                    $globalLines[] = $line;
                    continue;
                }

                $cols = explode("\t", $line);

                // Shift coordinates by tile offsets
                if (isset($cols[$leftIdx])) {
                    $cols[$leftIdx] = (string)((int)$cols[$leftIdx] + $offsetX);
                }
                if (isset($cols[$topIdx])) {
                    $cols[$topIdx] = (string)((int)$cols[$topIdx] + $offsetY);
                }

                $globalLines[] = implode("\t", $cols);
            }
        }

        if ($globalHeader === null) {
            return "[OCR FAILED]";
        }

        return $globalHeader . "\n" . implode("\n", $globalLines) . "\n";
    }

    /**
     * Build the shell-escaped option string appended to the tesseract command.
     *
     * String keys are emitted as flags followed by their (non-empty) value;
     * numeric keys emit the value only. In TSV mode " tsv" is appended last.
     *
     * @return string
     */
    protected function makeOptsString()
    {
        $opts = "";
        foreach ($this->tesseractOpts as $key => $value) {
            if (!is_numeric($key)) {
                $opts .= " " . escapeshellarg($key);
            }
            if (!is_null($value) && $value !== "") {
                $opts .= " " . escapeshellarg($value);
            }
        }
        if ($this->tsvMode()) {
            $opts .= " tsv";
        }
        return $opts;
    }

    /**
     * Ask the image provider (if any) to clean up after itself.
     */
    protected function cleanup()
    {
        if ($this->imageProvider) {
            $this->imageProvider->cleanup();
        }
    }

    /**
     * Create an instance from a JSON file previously written by save().
     *
     * @param string $tFile Path to the JSON file.
     * @return self Instance for the stored filename with the stored pages preloaded.
     * @throws \InvalidArgumentException When the file does not exist.
     * @throws \RuntimeException When the file cannot be read, is not valid JSON,
     *                           or lacks the "filename" key.
     */
    public static function load($tFile)
    {
        if (!file_exists($tFile)) {
            throw new \InvalidArgumentException("File not found: $tFile");
        }
        $raw = file_get_contents($tFile);
        if ($raw === false) {
            throw new \RuntimeException("Failed to read file: $tFile");
        }
        $data = json_decode($raw, true);
        if (json_last_error() !== JSON_ERROR_NONE) {
            throw new \RuntimeException("Failed to parse JSON: " . json_last_error_msg());
        }
        if (!is_array($data) || !array_key_exists('filename', $data)) {
            throw new \RuntimeException("Invalid OCR data file (missing filename): $tFile");
        }

        $pages = (isset($data['pages']) && is_array($data['pages'])) ? $data['pages'] : [];

        // Pass the pages under the explicit 'pageData' key so that sparse page sets
        // (non-sequential indexes) are not mistaken for an options array and dropped.
        return new self($data['filename'], ['pageData' => $pages]);
    }

    /**
     * Wrap page texts in begin/end markers:
     *
     *   [BEGIN DOCUMENT OCR OUTPUT]
     *
     *   [page 1]
     *   ...
     *   [end of page 1]
     *
     *   [END DOCUMENT OCR OUTPUT]
     *
     * @param array<int,string>|string|null $arrayOfPages Page texts keyed by 0-based index;
     *        a non-empty scalar is treated as a single page, an empty value as no pages.
     * @return string
     */
    public static function frame($arrayOfPages)
    {
        if (!is_array($arrayOfPages)) {
            $arrayOfPages = empty($arrayOfPages) ? [] : [$arrayOfPages];
        }

        $output = "[BEGIN DOCUMENT OCR OUTPUT]\n";
        foreach ($arrayOfPages as $i => $pageData) {
            $pageNo = $i + 1;
            $output .= "\n[page " . $pageNo . "]\n";
            $output .= $pageData . "\n";
            $output .= "[end of page " . $pageNo . "]\n";
        }

        $output .= "\n[END DOCUMENT OCR OUTPUT]";
        return $output;
    }
}
