<?php

namespace boru\ocr\Tesseract;

use boru\ocr\Page\PageImageProviderInterface;

/**
 * Produces plain-text OCR output for every page exposed by a page image provider,
 * delegating the actual recognition of each image to a Tesseract CLI runner.
 */
class TesseractTextOcr
{
    /** @var PageImageProviderInterface Supplies the page entries to process. */
    protected $pageProvider;

    /** @var TesseractCliRunner Invoked once per page image or per tile. */
    protected $runner;

    /**
     * @param PageImageProviderInterface $pageProvider Source of page entries.
     * @param TesseractCliRunner         $runner       Runner used for each image.
     */
    public function __construct(PageImageProviderInterface $pageProvider, TesseractCliRunner $runner)
    {
        $this->pageProvider = $pageProvider;
        $this->runner = $runner;
    }

    /**
     * Returns per-page text (array), not framed.
     *
     * Every entry returned by the provider yields exactly one element in the
     * result, in the same order:
     *  - a string entry is handed to the runner and its output is stored unmodified;
     *  - an array entry is treated as a list of tiles whose outputs are joined
     *    with a single newline;
     *  - any other entry yields an empty string, so result positions stay
     *    aligned with the provider's entries.
     *
     * @param TesseractOptions $opts Options forwarded to every runner invocation.
     * @return array Array of page strings
     */
    public function ocrDocument(TesseractOptions $opts)
    {
        $outPages = array();

        foreach ($this->pageProvider->getPages() as $pageEntry) {
            if (is_string($pageEntry)) {
                // single image
                $outPages[] = $this->runner->run($pageEntry, $opts, 'txt');
            } elseif (is_array($pageEntry)) {
                // tiled images -> join tile OCR outputs (simple concat)
                $outPages[] = $this->ocrTiles($pageEntry, $opts);
            } else {
                // unsupported entry type: keep the slot, but with no text
                $outPages[] = '';
            }
        }

        return $outPages;
    }

    /**
     * OCRs each tile of one page and joins the non-empty results with "\n".
     *
     * Tiles without a 'path' element are skipped. Trailing whitespace is
     * stripped from each tile's output *before* the emptiness check, so a tile
     * that produced only whitespace contributes nothing (previously it still
     * added a bare newline separator to the page text).
     *
     * NOTE(review): rtrim()'s default character list does not include the form
     * feed character; if the runner returns Tesseract's raw page separator it
     * will survive here - confirm against TesseractCliRunner.
     *
     * @param array            $tiles Tile entries, each expected to carry a 'path' element.
     * @param TesseractOptions $opts  Options forwarded to the runner.
     * @return string Joined text of the page, '' when no tile produced text.
     */
    private function ocrTiles(array $tiles, TesseractOptions $opts)
    {
        $parts = array();

        foreach ($tiles as $tile) {
            if (!isset($tile['path'])) {
                continue;
            }

            $tileText = rtrim($this->runner->run($tile['path'], $opts, 'txt'));
            if ($tileText !== '') {
                $parts[] = $tileText;
            }
        }

        return implode("\n", $parts);
    }
}
