<?php

namespace boru\ocr\Pipeline;

use boru\ocr\Page\PageImageProviderInterface;
use boru\ocr\Page\PageImageProviderFactory;

use boru\ocr\Tesseract\TesseractCliRunner;
use boru\ocr\Tesseract\TesseractOptions;
use boru\ocr\Tesseract\TesseractTextOcr;
use boru\ocr\Tesseract\TesseractTsvOcr;

use boru\ocr\Tesseract\Tsv\TsvParser;

use boru\ocr\Layout\TsvLayoutBuilder;
use boru\ocr\Evidence\TsvEvidenceIndex;
use boru\ocr\Confidence\ConfidenceCalculator;

/**
 * Orchestrates the deterministic (non-AI) OCR flow:
 * - Render pages via provider (or choose best provider)
 * - Run Tesseract text OCR
 * - Run Tesseract TSV OCR
 * - Build baseline layout text from TSV
 * - Provide EvidenceIndex
 *
 * AI integration (planner/final agent) should be layered above this.
 */
class OcrPipeline
{
    /** @var callable|null Optional progress sink; invoked with a single string message. */
    protected $logger;

    /** @var TesseractCliRunner Runner built from the configured tesseract binary. */
    protected $runner;

    /** @var TesseractOptions Options handed to both the text and the TSV OCR passes. */
    protected $tesseractOptions;

    /** @var array Options forwarded to the source router / page image provider. */
    protected $providerOptions = array();

    /** @var array Options forwarded to TsvLayoutBuilder. */
    protected $layoutOptions = array();

    /** @var string|null Directory handed to the source router for rendered page images. */
    protected $imageDir = null;

    /** @var string Tesseract executable name or path. */
    protected $tesseractBinary = 'tesseract';

    /**
     * Build a pipeline from a loosely-typed config array. Unknown keys are ignored
     * and array-valued options of the wrong type fall back to their defaults.
     *
     * @param array $config
     *  - logger (callable|null)
     *  - tesseractBinary (string) default 'tesseract'
     *  - tesseractOptions (array) passed to TesseractOptions
     *  - providerOptions (array) passed to provider
     *  - layoutOptions (array) passed to TsvLayoutBuilder
     *  - imageDir (string|null) where providers should write images
     */
    public function __construct(array $config = array())
    {
        $this->logger = isset($config['logger']) ? $config['logger'] : null;

        if (isset($config['tesseractBinary'])) {
            $this->tesseractBinary = (string)$config['tesseractBinary'];
        }

        $tessOpts = isset($config['tesseractOptions']) && is_array($config['tesseractOptions'])
            ? $config['tesseractOptions']
            : array();

        $this->tesseractOptions = new TesseractOptions($tessOpts);

        if (isset($config['providerOptions']) && is_array($config['providerOptions'])) {
            $this->providerOptions = $config['providerOptions'];
        }

        if (isset($config['layoutOptions']) && is_array($config['layoutOptions'])) {
            $this->layoutOptions = $config['layoutOptions'];
        }

        // Share the pipeline logger with the layout builder unless the caller
        // configured a dedicated one in layoutOptions.
        if ($this->logger && is_callable($this->logger)) {
            if (!isset($this->layoutOptions["logger"])) {
                $this->layoutOptions["logger"] = $this->logger;
            }
        }

        if (isset($config['imageDir'])) {
            $this->imageDir = $config['imageDir'];
        }

        $this->runner = new TesseractCliRunner($this->tesseractBinary);
    }

    /**
     * Run OCR for a file.
     *
     * When no provider is supplied the source is routed through SourceRouter.
     * Sources resolved as 'spreadsheet' skip Tesseract entirely and are returned
     * straight from their text provider. Every other source goes through text OCR,
     * TSV OCR, baseline layout reconstruction and the TSV-based confidence report.
     *
     * The provider's cleanup() is always invoked once the OCR path has been entered,
     * including for a provider supplied by the caller.
     *
     * @param string $sourceFile
     * @param PageImageProviderInterface|null $provider
     * @return OcrPipelineResultBundle
     * @throws \Exception When the source file does not exist or no provider can be resolved.
     */
    public function run($sourceFile, $provider = null)
    {
        $sourceFile = (string)$sourceFile;
        if ($sourceFile === '' || !file_exists($sourceFile)) {
            throw new \Exception("OcrPipeline: source file not found: " . $sourceFile);
        }

        // Choose provider if not provided
        if ($provider === null) {
            $resolved = \boru\ocr\Source\SourceRouter::resolve(
                $sourceFile,
                $this->imageDir,
                $this->providerOptions
            );

            if (isset($resolved['type']) && $resolved['type'] === 'spreadsheet') {
                return $this->buildSpreadsheetBundle($sourceFile, $resolved['textProvider']);
            }

            $provider = isset($resolved['provider']) ? $resolved['provider'] : null;
            if (!is_object($provider)) {
                // Fail with a clear message instead of a fatal "member function on null" further down.
                throw new \Exception("OcrPipeline: no page image provider could be resolved for: " . $sourceFile);
            }
        }

        // Build engines
        $textOcr = new TesseractTextOcr($provider, $this->runner);
        $tsvOcr = new TesseractTsvOcr($provider, $this->runner, new TsvParser());
        $layoutBuilder = new TsvLayoutBuilder($this->layoutOptions);

        $result = new OcrResult($sourceFile);

        // Orchestrate (always cleanup provider)
        try {
            $this->log("Rendering PDF pages / preparing images...");
            $pages = $provider->getPages(); // triggers render

            $this->log("Running Tesseract text OCR...");
            $result->textPages = $textOcr->ocrDocument($this->tesseractOptions);

            $this->log("Running Tesseract TSV OCR...");
            $result->tsvPages = $tsvOcr->ocrDocument($this->tesseractOptions);

            $this->log("Building baseline layout text from TSV...");
            $this->applyBaselineLayout($result, $layoutBuilder);

            // Metrics (basic, can expand later)
            $result->metrics = array(
                'pageCount' => is_array($pages) ? count($pages) : 0,
                'hasTextPages' => is_array($result->textPages) && count($result->textPages) > 0,
                'hasTsvPages' => is_array($result->tsvPages) && count($result->tsvPages) > 0,
            );

            // Confidence report (TSV-based)
            $this->log("Computing confidence report from TSV...");
            $calc = new ConfidenceCalculator();
            $result->confidence = $calc->fromTsvPages($result->tsvPages);

            $result->debug = array(
                'providerClass' => is_object($provider) ? get_class($provider) : null,
                'providerOptions' => $this->providerOptions,
                'layoutOptions' => $this->layoutOptions,
                'tesseractBinary' => $this->tesseractBinary,
                'tesseractOptions' => array(
                    'lang' => $this->tesseractOptions->lang,
                    'psm' => $this->tesseractOptions->psm,
                    'oem' => $this->tesseractOptions->oem,
                    'config' => $this->tesseractOptions->config,
                ),
            );

            // Bundle evidence index for convenience
            $evidence = new TsvEvidenceIndex($result->tsvPages);

            return new OcrPipelineResultBundle($result, $evidence);
        } finally {
            $this->log("Cleaning up page image provider...");
            try {
                $provider->cleanup();
            } catch (\Exception $e) {
                // swallow cleanup exceptions; caller can inspect logs
                $this->log("Provider cleanup failed: " . $e->getMessage());
            }
        }
    }

    /**
     * Build the result bundle for a spreadsheet source without running Tesseract.
     *
     * The text pages double as baseline pages, the evidence index is empty, and the
     * confidence report is marked 'none' because no OCR/TSV data exists to score.
     *
     * @param string $sourceFile
     * @param object $textProvider Resolver-supplied object exposing getTextPages().
     * @return OcrPipelineResultBundle
     */
    private function buildSpreadsheetBundle($sourceFile, $textProvider)
    {
        $result = new OcrResult($sourceFile);

        $pages = $textProvider->getTextPages();

        $result->textPages = $pages;
        $result->baselinePages = $pages; // baseline == text for spreadsheets
        $result->metrics = array(
            'type' => 'spreadsheet',
            'pageCount' => count($pages),
        );

        $confidence = new \boru\ocr\Confidence\ConfidenceReport();
        $confidence->mode = 'none';
        $confidence->notes[] = 'No OCR/TSV confidence available for this source type.';
        $result->confidence = $confidence;

        return new OcrPipelineResultBundle($result, new TsvEvidenceIndex(array()));
    }

    /**
     * Rebuild baseline text for every TSV page on the result and collect the layout
     * builder's per-page diagnostics and table candidates alongside it.
     *
     * @param OcrResult $result Result whose tsvPages are already populated; updated in place.
     * @param TsvLayoutBuilder $layoutBuilder
     * @return void
     */
    private function applyBaselineLayout($result, $layoutBuilder)
    {
        $baselinePages = array();

        foreach ($result->tsvPages as $tsvPage) {
            $baselinePages[] = $layoutBuilder->buildPageText($tsvPage);

            // Diagnostics describe the page that was just built, so read them
            // before the next buildPageText() call.
            $diag = $layoutBuilder->getLastDiagnostics();
            if (!$diag) {
                continue;
            }

            $result->layoutDiagnosticsPages[] = $diag;

            if (!isset($diag['metrics']['tableCandidates']) || !is_array($diag['metrics']['tableCandidates'])) {
                continue;
            }

            foreach ($diag['metrics']['tableCandidates'] as $cand) {
                // attach page number if missing
                if (!isset($cand['page'])) {
                    $cand['page'] = (int)$tsvPage->pageNumber;
                }
                $result->tableCandidates[] = $cand;
            }
        }

        $result->baselinePages = $baselinePages;
    }

    /**
     * Forward a message to the configured logger; a no-op when no callable logger is set.
     *
     * @param mixed $message Cast to string before being handed to the logger.
     * @return void
     */
    protected function log($message)
    {
        if ($this->logger !== null && is_callable($this->logger)) {
            call_user_func($this->logger, (string)$message);
        }
    }
}
