<?php

namespace boru\ocr\Tesseract;

use boru\ocr\Page\PageImageProviderInterface;
use boru\ocr\Tesseract\Tsv\TsvParser;
use boru\ocr\Tesseract\Tsv\TsvPage;

/**
 * Runs OCR over every page supplied by the page provider and returns the
 * parsed TSV rows, one TsvPage per page.
 *
 * A page entry from the provider is handled according to its PHP type:
 *  - string: passed to the runner as a single image path;
 *  - array:  treated as a list of tiles, each an array with a 'path' key and
 *            optional 'offset_x' / 'offset_y' keys; the tiles are OCR'd
 *            separately and merged into one page;
 *  - anything else: produces a page with no rows.
 */
class TesseractTsvOcr
{
    /**
     * Amount added per tile to block_num / par_num / line_num so that ids
     * coming from different tiles of the same page never coincide.
     */
    const TILE_ID_STRIDE = 100000;

    /** Value of the row "level" field that dedupeRows() treats as a word row. */
    const WORD_LEVEL = 5;

    /**
     * Largest per-field difference (left, top, width, height) at which two
     * word boxes with the same text are still considered the same word.
     */
    const DEDUPE_TOLERANCE_PX = 1;

    /** @var PageImageProviderInterface */
    protected $pageProvider;

    /** @var TesseractCliRunner */
    protected $runner;

    /** @var TsvParser */
    protected $parser;

    /**
     * @param PageImageProviderInterface $pageProvider Source of the page entries to OCR.
     * @param TesseractCliRunner         $runner       Executes OCR on one image and returns raw output.
     * @param TsvParser                  $parser       Turns raw runner output into row objects.
     */
    public function __construct(PageImageProviderInterface $pageProvider, TesseractCliRunner $runner, TsvParser $parser)
    {
        $this->pageProvider = $pageProvider;
        $this->runner = $runner;
        $this->parser = $parser;
    }

    /**
     * OCR every page of the document.
     *
     * Pages are numbered from 1 in iteration order, independently of the keys
     * returned by the provider. Every entry yields exactly one TsvPage, even
     * when it is of an unsupported type (that page simply has no rows).
     *
     * @param TesseractOptions $opts Options forwarded unchanged to the runner.
     * @return TsvPage[]
     */
    public function ocrDocument(TesseractOptions $opts)
    {
        $out = array();
        $pageNumber = 0;

        foreach ($this->pageProvider->getPages() as $pageEntry) {
            $pageNumber++;

            if (is_string($pageEntry)) {
                $rows = $this->ocrImageRows($pageEntry, $opts);
            } elseif (is_array($pageEntry)) {
                $rows = $this->ocrTiledRows($pageEntry, $opts);
            } else {
                // Unsupported entry type: keep the page slot so numbering stays aligned.
                $rows = array();
            }

            $out[] = new TsvPage($pageNumber, $rows);
        }

        return $out;
    }

    /**
     * Run the OCR runner in 'tsv' mode on one image and parse its output.
     *
     * @param string           $path Image path handed to the runner.
     * @param TesseractOptions $opts Options forwarded to the runner.
     * @return \boru\ocr\Tesseract\Tsv\TsvRow[]
     */
    private function ocrImageRows($path, TesseractOptions $opts)
    {
        $raw = $this->runner->run($path, $opts, 'tsv');

        return $this->parser->parseRows($raw);
    }

    /**
     * OCR each tile of a tiled page and merge the results into one row list.
     *
     * Each parsed row is modified in place: applyOffset() is called with the
     * tile's offsets (presumably translating tile coordinates into page
     * coordinates - confirm against TsvRow), and its block/par/line ids are
     * shifted into a range unique to the tile. The merged list is then passed
     * through dedupeRows().
     *
     * @param array            $tiles List of tile descriptors; entries without a 'path' key are skipped.
     * @param TesseractOptions $opts  Options forwarded to the runner.
     * @return \boru\ocr\Tesseract\Tsv\TsvRow[]
     */
    private function ocrTiledRows(array $tiles, TesseractOptions $opts)
    {
        $merged = array();
        $tileIdx = 0;

        foreach ($tiles as $tile) {
            // Counted before the skip check, so a skipped tile still consumes an id range.
            $tileIdx++;

            if (!isset($tile['path'])) {
                continue;
            }

            $dx = isset($tile['offset_x']) ? (int)$tile['offset_x'] : 0;
            $dy = isset($tile['offset_y']) ? (int)$tile['offset_y'] : 0;

            // Make block/par/line namespaces unique per tile so code that groups
            // rows by those ids (the original author names byLine(), which is not
            // visible here) cannot merge rows from different tiles.
            $base = $tileIdx * self::TILE_ID_STRIDE;

            foreach ($this->ocrImageRows($tile['path'], $opts) as $row) {
                $row->applyOffset($dx, $dy);

                // The same base is added unconditionally to every row of the tile,
                // so relative ordering inside the tile is preserved.
                $row->block_num += $base;
                $row->par_num   += $base;
                $row->line_num  += $base;

                $merged[] = $row;
            }
        }

        // Remove duplicates caused by overlap / repeated OCR
        return $this->dedupeRows($merged);
    }

    /**
     * Dedupe rows (primarily word rows) by (text + approx bbox). Keep highest conf.
     *
     * Rules, in input order:
     *  - rows whose level is not WORD_LEVEL pass through untouched (the same
     *    object instance appearing twice is emitted once);
     *  - word rows whose trimmed text is empty are dropped;
     *  - a word row duplicates an already kept row when the trimmed text is
     *    identical and left/top/width/height each differ by at most
     *    DEDUPE_TOLERANCE_PX. Of two duplicates the one with the strictly
     *    higher conf is kept, in the position of the first one seen.
     *
     * Boxes are compared directly rather than by quantised bucket key: with
     * buckets, two boxes one pixel apart (e.g. left=11 and left=12) could land
     * in different buckets and both survive.
     *
     * Cost is linear in the number of rows plus, per word, the number of kept
     * rows sharing the same text.
     *
     * @param \boru\ocr\Tesseract\Tsv\TsvRow[] $rows
     * @return \boru\ocr\Tesseract\Tsv\TsvRow[]
     */
    protected function dedupeRows(array $rows)
    {
        $kept = array();        // result, in first-seen order
        $seenNonWord = array(); // spl_object_hash => true
        $slotsByText = array(); // trimmed text => list of indexes into $kept

        foreach ($rows as $row) {
            // focus on word rows; keep others as-is
            if ((int)$row->level !== self::WORD_LEVEL) {
                $id = spl_object_hash($row);
                if (!isset($seenNonWord[$id])) {
                    $seenNonWord[$id] = true;
                    $kept[] = $row;
                }
                continue;
            }

            $text = trim((string)$row->text);
            if ($text === '') {
                continue;
            }

            // Look for an already kept row with the same text and a near-identical box.
            $slot = null;
            if (isset($slotsByText[$text])) {
                foreach ($slotsByText[$text] as $candidate) {
                    if ($this->boxesMatch($row, $kept[$candidate])) {
                        $slot = $candidate;
                        break;
                    }
                }
            }

            if ($slot === null) {
                $slotsByText[$text][] = count($kept);
                $kept[] = $row;
                continue;
            }

            // keep the higher-confidence version (ties keep the earlier row)
            if ((float)$row->conf > (float)$kept[$slot]->conf) {
                $kept[$slot] = $row;
            }
        }

        return $kept;
    }

    /**
     * Tell whether two rows have the same bounding box within DEDUPE_TOLERANCE_PX.
     *
     * @param object $a Row exposing left/top/width/height.
     * @param object $b Row exposing left/top/width/height.
     * @return bool
     */
    private function boxesMatch($a, $b)
    {
        foreach (array('left', 'top', 'width', 'height') as $field) {
            if (abs((int)$a->$field - (int)$b->$field) > self::DEDUPE_TOLERANCE_PX) {
                return false;
            }
        }

        return true;
    }

}
