<?php

namespace boru\ocr\Tesseract;

use boru\ocr\Page\PageImageProviderInterface;

/**
 * Produces plain-text OCR output for each page supplied by a page provider.
 *
 * Every page entry returned by the provider is either a string (passed
 * straight to the runner as a single image) or an array of tile descriptors
 * carrying a 'path' key. Tiles are OCR'd one by one and their lines merged
 * into a single page string with duplicate lines removed.
 */
class TesseractTextOcr
{
    /** @var PageImageProviderInterface Source of the page entries to OCR. */
    protected $pageProvider;

    /** @var TesseractCliRunner Runner invoked once per image / tile. */
    protected $runner;

    /**
     * @param PageImageProviderInterface $pageProvider Supplies the page entries via getPages().
     * @param TesseractCliRunner         $runner       Invoked as run($path, $opts, 'txt') per image.
     */
    public function __construct(PageImageProviderInterface $pageProvider, TesseractCliRunner $runner)
    {
        $this->pageProvider = $pageProvider;
        $this->runner = $runner;
    }

    /**
     * Returns per-page text (array), not framed.
     *
     * The result has exactly one element per entry returned by the provider,
     * in the same order:
     *  - string entry: the runner output for that image, unmodified;
     *  - array entry:  the trimmed, non-empty, de-duplicated lines of all
     *                  tiles joined with "\n";
     *  - anything else: an empty string, so page positions stay aligned.
     *
     * @param TesseractOptions $opts Options forwarded unchanged to every runner call.
     * @return array Array of page strings
     */
    public function ocrDocument(TesseractOptions $opts)
    {
        $pages = $this->pageProvider->getPages();
        $outPages = array();

        foreach ($pages as $pageEntry) {
            // single image
            if (is_string($pageEntry)) {
                $outPages[] = $this->runner->run($pageEntry, $opts, 'txt');
                continue;
            }

            // tiled images -> join tile OCR outputs (simple concat)
            if (is_array($pageEntry)) {
                $lines = array();

                foreach ($pageEntry as $tile) {
                    // Tile descriptors without a path cannot be OCR'd; skip them.
                    if (!isset($tile['path'])) {
                        continue;
                    }

                    $txt = $this->runner->run($tile['path'], $opts, 'txt');

                    foreach ($this->splitNonEmptyLines($txt) as $ln) {
                        $lines[] = $ln;
                    }
                }

                // Dedupe overlapping tile output
                $lines = $this->dedupeTextLines($lines);

                // Store the joined text as this page's result and move on, so
                // the empty-string fallback below does not replace it.
                $outPages[] = implode("\n", $lines);
                continue;
            }

            // Unsupported entry type: keep a placeholder so indexes still
            // line up with the provider's pages.
            $outPages[] = '';
        }

        return $outPages;
    }

    /**
     * Remove duplicate lines caused by overlapping tiles.
     *
     * Lines are compared case-insensitively with runs of whitespace collapsed
     * to a single space. The first occurrence is kept in its original form;
     * later matches and lines that normalize to an empty string are dropped.
     *
     * @param string[] $lines
     * @return string[]
     */
    protected function dedupeTextLines(array $lines)
    {
        $seen = array();
        $out  = array();

        foreach ($lines as $ln) {
            // normalize aggressively - text OCR is noisy
            $trimmed = trim($ln);
            $collapsed = preg_replace('/\s+/u', ' ', $trimmed);

            if ($collapsed === null) {
                // preg_replace() yields null when the /u pattern is applied to
                // invalid UTF-8; retry byte-wise so the line is not lost.
                $collapsed = (string) preg_replace('/\s+/', ' ', $trimmed);
            }

            $key = mb_strtolower($collapsed);

            if ($key === '') {
                continue;
            }
            if (isset($seen[$key])) {
                continue;
            }

            $seen[$key] = true;
            $out[] = $ln;
        }

        return $out;
    }

    /**
     * Split OCR output into trimmed lines, discarding empty ones.
     *
     * @param string $txt Raw text as returned by the runner.
     * @return string[]
     */
    private function splitNonEmptyLines($txt)
    {
        $txt = (string) $txt;

        $rawLines = preg_split('/\R/u', $txt);
        if ($rawLines === false) {
            // The /u modifier rejects invalid UTF-8; fall back to a byte-wise
            // split on the common line endings instead of dropping the text.
            $rawLines = preg_split('/\r\n|\r|\n/', $txt);
        }
        if ($rawLines === false) {
            return array();
        }

        $lines = array();
        foreach ($rawLines as $ln) {
            $ln = trim($ln);
            if ($ln !== '') {
                $lines[] = $ln;
            }
        }

        return $lines;
    }

}
