<?php

namespace boru\ocr\Pipeline;

use boru\ocr\Page\PageImageProviderInterface;
use boru\ocr\Page\PageImageProviderFactory;

use boru\ocr\Tesseract\TesseractCliRunner;
use boru\ocr\Tesseract\TesseractOptions;
use boru\ocr\Tesseract\TesseractTextOcr;
use boru\ocr\Tesseract\TesseractTsvOcr;

use boru\ocr\Tesseract\Tsv\TsvParser;

use boru\ocr\Layout\TsvLayoutBuilder;
use boru\ocr\Evidence\TsvEvidenceIndex;
use boru\ocr\Confidence\ConfidenceCalculator;
use boru\ocr\Layout\LayoutOptions;
use boru\ocr\Traits\OcrLogTrait;

/**
 * Orchestrates the deterministic (non-AI) OCR flow:
 * - Render pages via provider (or choose best provider)
 * - Run Tesseract text OCR
 * - Run Tesseract TSV OCR
 * - Build baseline layout text from TSV
 * - Provide EvidenceIndex
 *
 * Sources that the router resolves as 'spreadsheet' or 'word' skip OCR entirely:
 * their text pages are returned as both text and baseline pages, together with an
 * empty evidence index.
 *
 * AI integration (planner/final agent) should be layered above this.
 */
class OcrPipeline
{
    /**
     * Logger handed to the layout options and provider options when they have none.
     * NOTE(review): annotated as OCRLogger here but documented as callable in the
     * constructor config — confirm which one OcrLogTrait expects.
     *
     * @var OCRLogger|null
     */
    protected $logger;
    use OcrLogTrait;

    /** @var TesseractCliRunner */
    protected $runner;

    /** @var TesseractOptions */
    protected $tesseractOptions;

    /** @var array */
    protected $providerOptions = array();

    /** @var LayoutOptions */
    protected $layoutOptions;

    /** @var string|null */
    protected $imageDir = null;

    /** @var string */
    protected $tesseractBinary = 'tesseract';

    /**
     * @param array $config
     *  - logger (callable|null)
     *  - tesseractBinary (string) default 'tesseract'
     *  - tesseractOptions (TesseractOptions|array) instance used as-is, array passed to TesseractOptions
     *  - providerOptions (array) passed to provider
     *  - layoutOptions (LayoutOptions|array) instance used as-is, array passed to LayoutOptions::create()
     *  - imageDir (string|null) where providers should write images
     */
    public function __construct(array $config = array())
    {
        $this->logger = isset($config['logger']) ? $config['logger'] : null;

        if (isset($config['tesseractBinary'])) {
            $this->tesseractBinary = (string)$config['tesseractBinary'];
        }

        $tesseractOptions = isset($config['tesseractOptions']) ? $config['tesseractOptions'] : null;
        if ($tesseractOptions instanceof TesseractOptions) {
            $this->tesseractOptions = $tesseractOptions;
        } elseif (is_array($tesseractOptions)) {
            $this->tesseractOptions = new TesseractOptions($tesseractOptions);
        } else {
            $this->tesseractOptions = new TesseractOptions(array());
        }

        if (isset($config['providerOptions']) && is_array($config['providerOptions'])) {
            $this->providerOptions = $config['providerOptions'];
        }

        $layoutOptions = isset($config['layoutOptions']) ? $config['layoutOptions'] : null;
        if ($layoutOptions instanceof LayoutOptions) {
            $this->layoutOptions = $layoutOptions;
        } elseif (is_array($layoutOptions)) {
            $this->layoutOptions = LayoutOptions::create($layoutOptions);
        } else {
            $this->layoutOptions = LayoutOptions::create();
        }

        // Share the pipeline logger with the layout options unless they already carry one.
        if ($this->logger !== null && empty($this->layoutOptions->logger)) {
            $this->layoutOptions->logger = $this->logger;
        }

        if (isset($config['imageDir'])) {
            $this->imageDir = $config['imageDir'];
        }

        $this->runner = new TesseractCliRunner($this->tesseractBinary);
    }

    /**
     * @return TesseractOptions the options object used for both text and TSV OCR runs
     */
    public function getTesseractOptions()
    {
        return $this->tesseractOptions;
    }

    /**
     * Run OCR for a file.
     *
     * When no provider is given, the source is resolved through SourceRouter; sources
     * resolved as 'spreadsheet' or 'word' return their text pages directly without OCR.
     * The provider's cleanup() is always attempted once the OCR stage has been entered,
     * whether it succeeds or throws.
     *
     * @param string $sourceFile
     * @param PageImageProviderInterface|null $provider
     * @return OcrPipelineResultBundle
     * @throws \Exception when the source file is missing or no usable provider is available
     */
    public function run($sourceFile, $provider = null)
    {
        $sourceFile = (string)$sourceFile;
        if ($sourceFile === '' || !file_exists($sourceFile)) {
            throw new \Exception("OcrPipeline: source file not found: " . $sourceFile);
        }

        // Choose provider if not provided
        if ($provider === null) {
            $this->logInfo("Resolving source file type and page image provider...");
            if ($this->logger !== null && empty($this->providerOptions['logger'])) {
                $this->providerOptions['logger'] = $this->logger;
            }
            $resolved = \boru\ocr\Source\SourceRouter::resolve(
                $sourceFile,
                $this->imageDir,
                $this->providerOptions
            );
            $sourceType = isset($resolved['type']) ? $resolved['type'] : null;

            // Short-circuit text-native sources (spreadsheets, Word documents): no OCR needed.
            if ($sourceType === 'spreadsheet' || $sourceType === 'word') {
                return $this->buildTextOnlyBundle(
                    $sourceFile,
                    $sourceType,
                    $resolved['textProvider']->getTextPages()
                );
            }

            $provider = isset($resolved['provider']) ? $resolved['provider'] : null;
        }

        // Fail with a catchable exception instead of a fatal "call on null" further down.
        if (!is_object($provider)) {
            throw new \Exception("OcrPipeline: no page image provider available for: " . $sourceFile);
        }

        // Build engines
        $textOcr = new TesseractTextOcr($provider, $this->runner);

        $parser = new TsvParser();
        $tsvOcr = new TesseractTsvOcr($provider, $this->runner, $parser);

        $layoutBuilder = new TsvLayoutBuilder($this->layoutOptions);

        $result = new OcrResult($sourceFile);

        // Orchestrate (always cleanup provider)
        try {
            $this->logLog("Rendering PDF pages / preparing images...");
            $pages = $provider->getPages(); // triggers render

            // Text OCR
            $this->logLog("Running Tesseract text OCR...");
            $result->textPages = $textOcr->ocrDocument($this->tesseractOptions);

            // TSV OCR
            $this->logLog("Running Tesseract TSV OCR...");
            $result->tsvPages = $tsvOcr->ocrDocument($this->tesseractOptions);

            // Baseline from TSV
            $this->logLog("Building baseline layout text from TSV...");
            $baselinePages = array();
            foreach ($result->tsvPages as $tsvPage) {
                $baselinePages[] = $layoutBuilder->buildPageText($tsvPage);

                // Capture per-page layout diagnostics + table candidate hints (if enabled)
                $this->captureLayoutDiagnostics($result, $layoutBuilder->getLastDiagnostics(), $tsvPage);
            }
            $result->baselinePages = $baselinePages;

            // Metrics (basic, can expand later)
            $result->metrics = array(
                'pageCount' => is_array($pages) ? count($pages) : 0,
                'hasTextPages' => is_array($result->textPages) && count($result->textPages) > 0,
                'hasTsvPages' => is_array($result->tsvPages) && count($result->tsvPages) > 0,
            );

            // Confidence report (TSV-based). Text-native sources never reach this point,
            // so there is no "no confidence available" variant here.
            $this->logLog("Computing confidence report from TSV...");
            $calc = new ConfidenceCalculator();
            $result->confidence = $calc->fromTsvPages($result->tsvPages);

            $result->debug = array(
                'providerClass' => get_class($provider),
                'providerOptions' => $this->providerOptions,
                'layoutOptions' => $this->layoutOptions,
                'tesseractBinary' => $this->tesseractBinary,
                'tesseractOptions' => array(
                    'lang' => $this->tesseractOptions->lang,
                    'psm' => $this->tesseractOptions->psm,
                    'oem' => $this->tesseractOptions->oem,
                    'config' => $this->tesseractOptions->config,
                ),
            );

            // Bundle evidence index for convenience
            $evidence = new TsvEvidenceIndex($result->tsvPages);

            return new OcrPipelineResultBundle($result, $evidence);
        } finally {
            $this->logLog("Cleaning up page image provider...");

            // Saving images is best-effort and handles its own failures, so that a
            // problem there can never prevent the cleanup below from running.
            $this->savePageImages($provider);

            try {
                $provider->cleanup();
            } catch (\Exception $e) {
                // swallow cleanup exceptions; caller can inspect logs
                $this->logLog("Provider cleanup failed: " . $e->getMessage());
            }
        }
    }

    /**
     * Builds the result bundle for sources whose text is available without OCR.
     *
     * The given pages are used for both the text pages and the baseline pages, and the
     * bundle carries an empty evidence index.
     *
     * @param string $sourceFile
     * @param string $type source type recorded in the metrics ('spreadsheet' or 'word')
     * @param mixed $pages text pages as returned by the text provider
     * @return OcrPipelineResultBundle
     */
    private function buildTextOnlyBundle($sourceFile, $type, $pages)
    {
        $result = new OcrResult($sourceFile);

        $result->textPages = $pages;
        $result->baselinePages = $pages; // baseline == text for text-native sources
        $result->metrics = array(
            'type' => $type,
            // Guarded: count() on a non-countable value raises a TypeError on PHP 8.
            'pageCount' => (is_array($pages) || $pages instanceof \Countable) ? count($pages) : 0,
        );

        return new OcrPipelineResultBundle($result, new TsvEvidenceIndex(array()));
    }

    /**
     * Records one page's layout diagnostics and table candidates on the result.
     *
     * Does nothing when the diagnostics value is empty.
     *
     * @param OcrResult $result result object that receives the diagnostics
     * @param mixed $diag value returned by the layout builder for the last page built
     * @param object $tsvPage TSV page whose pageNumber is attached to candidates lacking one
     * @return void
     */
    private function captureLayoutDiagnostics($result, $diag, $tsvPage)
    {
        if (!$diag) {
            return;
        }

        $pageNumber = (int)$tsvPage->pageNumber;

        $this->logLog("Capturing TSV layout diagnostics for page " . $pageNumber);
        $this->logDebug(json_encode($diag, JSON_PRETTY_PRINT|JSON_UNESCAPED_UNICODE|JSON_UNESCAPED_SLASHES));
        $result->layoutDiagnosticsPages[] = $diag;

        if (!isset($diag['metrics']['tableCandidates']) || !is_array($diag['metrics']['tableCandidates'])) {
            return;
        }

        foreach ($diag['metrics']['tableCandidates'] as $cand) {
            // attach page number if missing (only array candidates can take the key)
            if (is_array($cand) && !isset($cand['page'])) {
                $cand['page'] = $pageNumber;
            }
            $result->tableCandidates[] = $cand;
        }
    }

    /**
     * Copies the provider's page images into the directory named by the saveImages option.
     *
     * Only runs when saveImages is a non-empty string; any other value (false, null, ...)
     * disables saving. Failures are logged and never propagated.
     *
     * @param object $provider page image provider whose getPages() lists the images
     * @return void
     */
    private function savePageImages($provider)
    {
        $targetDir = $this->tesseractOptions->saveImages;
        if (!is_string($targetDir) || $targetDir === '') {
            return;
        }
        $prefix = rtrim($targetDir, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR;

        try {
            $pages = $provider->getPages();
            if (!is_array($pages) && !($pages instanceof \Traversable)) {
                return;
            }

            foreach ($pages as $pageIndex => $pageEntry) {
                // single image
                if (is_string($pageEntry)) {
                    $this->copyPageImage(
                        $pageEntry,
                        $prefix . 'page_' . ($pageIndex + 1) . '.' . pathinfo($pageEntry, PATHINFO_EXTENSION)
                    );
                    continue;
                }

                // tiled images
                if (is_array($pageEntry)) {
                    foreach ($pageEntry as $tileIndex => $tile) {
                        if (!isset($tile['path'])) {
                            continue;
                        }
                        $this->copyPageImage(
                            $tile['path'],
                            $prefix . 'page_' . ($pageIndex + 1) . '_tile_' . ($tileIndex + 1) . '.'
                                . pathinfo($tile['path'], PATHINFO_EXTENSION)
                        );
                    }
                }
            }
        } catch (\Exception $e) {
            $this->logLog("Saving page images failed: " . $e->getMessage());
        }
    }

    /**
     * Copies a single image file and logs when the copy does not succeed.
     *
     * @param string $sourcePath
     * @param string $destPath
     * @return void
     */
    private function copyPageImage($sourcePath, $destPath)
    {
        if (!is_file($sourcePath)) {
            $this->logLog("Page image not found, skipping save: " . $sourcePath);
            return;
        }
        if (!copy($sourcePath, $destPath)) {
            $this->logLog("Failed to save page image to: " . $destPath);
        }
    }
}
