<?php

namespace boru\ocr\Evidence;

use boru\ocr\Tesseract\Tsv\TsvPage;
use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * Deterministic TSV evidence querying over a set of pages.
 *
 * This is designed to back "tool/callback" style access from an AI agent:
 * - find anchor text occurrences
 * - fetch tokens within a bounding box
 * - fetch text near an anchor match
 * - fetch a specific line group
 * - basic confidence stats
 *
 * PHP 5.6 compatible (no scalar types / return types).
 */
class TsvEvidenceIndex
{
    /** @var TsvPage[] keyed by 1-based page number */
    protected $pagesByNumber = array();

    /** @var bool */
    protected $caseInsensitive = true;

    /** @var int horizontal gap (same units as row left/right) above which two spaces separate words */
    protected $wideGap = 12;

    /**
     * @param TsvPage[] $pages entries that are not TsvPage instances are ignored;
     *                         a later page with the same page number replaces an earlier one
     * @param array $options
     *   - caseInsensitive (bool) default true
     *   - wideGap (int) default 12; gap between adjacent words above which
     *     rendered line text uses a double space instead of a single one
     */
    public function __construct(array $pages, array $options = array())
    {
        $this->caseInsensitive = $this->boolOption($options, 'caseInsensitive', true);
        $this->wideGap = $this->intOption($options, 'wideGap', 12);

        foreach ($pages as $p) {
            if (!$p instanceof TsvPage) continue;
            $this->pagesByNumber[(int)$p->pageNumber] = $p;
        }
    }

    /**
     * @param int $pageNumber 1-based
     * @return TsvPage|null null when no page with that number was indexed
     */
    public function getPage($pageNumber)
    {
        $pageNumber = (int)$pageNumber;
        return isset($this->pagesByNumber[$pageNumber]) ? $this->pagesByNumber[$pageNumber] : null;
    }

    /**
     * Find occurrences of text on a page (or all pages).
     *
     * This is intentionally "good enough" to drive evidence retrieval:
     * - It searches line text (constructed from word tokens)
     * - Returns matches with line key + bounding box of the WHOLE line
     *   (not just the matched substring)
     *
     * Matching is whitespace-insensitive: runs of whitespace in both the needle
     * and the line text are collapsed to one space before comparing, so a needle
     * like "Invoice Number" still matches a line rendered as "Invoice  Number"
     * (rendered lines use a double space across wide gaps). The 'text' of the
     * returned match keeps the original rendered spacing.
     *
     * @param string $needle
     * @param array $options
     *   - page (int|null) search only this page (1-based)
     *   - minConf (int) default 0
     *   - maxMatches (int) default 20; values below 1 yield no matches
     *   - useWordsOnly (bool) default true (level=5)
     * @return TsvTextMatch[] at most one match per line, in page/line iteration order
     */
    public function findText($needle, array $options = array())
    {
        $needle = (string)$needle;
        if ($needle === '') return array();

        $minConf = $this->intOption($options, 'minConf', 0);
        $maxMatches = $this->intOption($options, 'maxMatches', 20);
        $useWordsOnly = $this->boolOption($options, 'useWordsOnly', true);

        // A non-positive limit means "nothing wanted"; bail out before doing any work.
        if ($maxMatches < 1) return array();

        // With a page filter, look the page up directly instead of scanning every page.
        if (isset($options['page'])) {
            $pageNum = (int)$options['page'];
            $page = $this->getPage($pageNum);
            $pages = $page ? array($pageNum => $page) : array();
        } else {
            $pages = $this->pagesByNumber;
        }

        $needleCmp = $this->normalizeForSearch($needle);
        $matches = array();

        foreach ($pages as $pageNum => $page) {
            $lines = $this->getLines($page, $minConf, $useWordsOnly);

            foreach ($lines as $lineKey => $lineRows) {
                $lineText = $this->renderLineText($lineRows);
                if ($lineText === '') continue;

                // Byte-wise strpos is safe here: both sides are normalized the same way.
                if (strpos($this->normalizeForSearch($lineText), $needleCmp) === false) continue;

                $bounds = $this->boundsForRows($lineRows);

                $matches[] = new TsvTextMatch(array(
                    'page' => $pageNum,
                    'lineKey' => $lineKey,
                    'text' => $lineText,
                    'left' => $bounds['left'],
                    'top' => $bounds['top'],
                    'width' => $bounds['width'],
                    'height' => $bounds['height'],
                ));

                if (count($matches) >= $maxMatches) {
                    return $matches;
                }
            }
        }

        return $matches;
    }

    /**
     * Get word tokens whose boxes intersect a bounding box (edges inclusive).
     *
     * The corners may be given in any order; they are normalized so that
     * (x1,y1) is top-left and (x2,y2) is bottom-right.
     *
     * @param int $pageNumber 1-based
     * @param int $x1 left
     * @param int $y1 top
     * @param int $x2 right
     * @param int $y2 bottom
     * @param array $options
     *   - minConf (int) default 0
     *   - useWordsOnly (bool) default true
     *   - sort (string) 'reading'|'top_left' default 'reading'
     *     (any value other than 'top_left' is treated as 'reading')
     * @return TsvRow[] empty when the page is unknown
     */
    public function getBoxWords($pageNumber, $x1, $y1, $x2, $y2, array $options = array())
    {
        $page = $this->getPage($pageNumber);
        if (!$page) return array();

        $minConf = $this->intOption($options, 'minConf', 0);
        $useWordsOnly = $this->boolOption($options, 'useWordsOnly', true);
        $sort = isset($options['sort']) ? (string)$options['sort'] : 'reading';

        $x1 = (int)$x1; $y1 = (int)$y1; $x2 = (int)$x2; $y2 = (int)$y2;
        if ($x2 < $x1) { $t = $x1; $x1 = $x2; $x2 = $t; }
        if ($y2 < $y1) { $t = $y1; $y1 = $y2; $y2 = $t; }

        $out = array();
        foreach ($this->selectRows($page, $useWordsOnly, $minConf) as $r) {
            // Intersection test: reject rows entirely left/right or above/below the box.
            if ((int)$r->right() < $x1 || (int)$r->left > $x2) continue;
            if ((int)$r->bottom() < $y1 || (int)$r->top > $y2) continue;

            $out[] = $r;
        }

        // "reading": sort by (block,par,line,word), falling back to top/left on ties.
        usort($out, array($this, $sort === 'top_left' ? 'cmpTopThenLeft' : 'cmpReading'));

        return $out;
    }

    /**
     * Get text within a bounding box, line-joined with "\n".
     *
     * @param int $pageNumber 1-based
     * @param int $x1
     * @param int $y1
     * @param int $x2
     * @param int $y2
     * @param array $options (same as getBoxWords)
     * @return string empty string when nothing falls inside the box
     */
    public function getBoxText($pageNumber, $x1, $y1, $x2, $y2, array $options = array())
    {
        $words = $this->getBoxWords($pageNumber, $x1, $y1, $x2, $y2, $options);
        return $this->renderTextByLineGroups($words);
    }

    /**
     * Get text near an anchor match: the match box grown by padX/padY on every side.
     *
     * @param TsvTextMatch $match
     * @param array $options
     *   - padX (int) default 80
     *   - padY (int) default 60
     *   - minConf (int) default 0
     * @return string
     */
    public function getNearMatchText(TsvTextMatch $match, array $options = array())
    {
        $padX = $this->intOption($options, 'padX', 80);
        $padY = $this->intOption($options, 'padY', 60);
        $minConf = $this->intOption($options, 'minConf', 0);

        $x1 = $match->left - $padX;
        $y1 = $match->top - $padY;
        $x2 = $match->left + $match->width + $padX;
        $y2 = $match->top + $match->height + $padY;

        return $this->getBoxText($match->page, $x1, $y1, $x2, $y2, array(
            'minConf' => $minConf,
            'useWordsOnly' => true,
            'sort' => 'reading',
        ));
    }

    /**
     * Fetch a specific TSV line group by its key "block:par:line".
     *
     * @param int $pageNumber 1-based
     * @param string $lineKey "b:p:l"
     * @param array $options
     *   - minConf (int) default 0
     *   - useWordsOnly (bool) default true
     * @return TsvRow[] rows of the line ordered left-to-right; empty when the
     *                  page or line key is unknown
     */
    public function getLineRows($pageNumber, $lineKey, array $options = array())
    {
        $page = $this->getPage($pageNumber);
        if (!$page) return array();

        $minConf = $this->intOption($options, 'minConf', 0);
        $useWordsOnly = $this->boolOption($options, 'useWordsOnly', true);

        $lines = $this->getLines($page, $minConf, $useWordsOnly);
        $lineKey = (string)$lineKey;

        // getLines() already returns each group sorted left-to-right.
        return isset($lines[$lineKey]) ? $lines[$lineKey] : array();
    }

    /**
     * Basic per-page stats (token count, conf distribution).
     *
     * Only level-5 rows with a non-negative conf contribute to the confidence
     * figures; every non-empty selected row contributes to tokenCount.
     *
     * @param int $pageNumber 1-based
     * @param array $options
     *   - useWordsOnly (bool) default true
     * @return array empty when the page is unknown, otherwise keys
     *               page, tokenCount, confCount, confAvg, confMin, confMax
     *               (confAvg/confMin/confMax are null when confCount is 0)
     */
    public function stats($pageNumber, array $options = array())
    {
        $page = $this->getPage($pageNumber);
        if (!$page) return array();

        $useWordsOnly = $this->boolOption($options, 'useWordsOnly', true);

        $count = 0;
        $confCount = 0;
        $confSum = 0.0;
        $confMin = null;
        $confMax = null;

        foreach ($this->selectRows($page, $useWordsOnly) as $r) {
            $count++;

            if ((int)$r->level === 5 && $r->conf >= 0) {
                $confCount++;
                $confSum += (float)$r->conf;
                if ($confMin === null || $r->conf < $confMin) $confMin = $r->conf;
                if ($confMax === null || $r->conf > $confMax) $confMax = $r->conf;
            }
        }

        return array(
            'page' => (int)$pageNumber,
            'tokenCount' => $count,
            'confCount' => $confCount,
            'confAvg' => $confCount > 0 ? ($confSum / $confCount) : null,
            'confMin' => $confMin,
            'confMax' => $confMax,
        );
    }

    // -----------------------
    // Internal helpers
    // -----------------------

    /**
     * Read an integer option; a missing or null entry yields the default.
     *
     * @param array $options
     * @param string $key
     * @param int $default
     * @return int
     */
    private function intOption(array $options, $key, $default)
    {
        return isset($options[$key]) ? (int)$options[$key] : $default;
    }

    /**
     * Read a boolean option; only a missing key yields the default
     * (an explicit null is cast to false).
     *
     * @param array $options
     * @param string $key
     * @param bool $default
     * @return bool
     */
    private function boolOption(array $options, $key, $default)
    {
        return array_key_exists($key, $options) ? (bool)$options[$key] : $default;
    }

    /**
     * Prepare text for substring comparison: collapse whitespace runs to a
     * single space and, when the index is case-insensitive, lower-case it.
     *
     * @param string $text
     * @return string
     */
    private function normalizeForSearch($text)
    {
        // ASCII-only character class: these bytes never occur inside a UTF-8
        // multi-byte sequence, so no /u modifier (and no failure on bad UTF-8).
        $collapsed = preg_replace('/[ \t\r\n]+/', ' ', $text);
        if ($collapsed === null) $collapsed = $text;

        return $this->caseInsensitive ? mb_strtolower($collapsed, 'UTF-8') : $collapsed;
    }

    /**
     * Select the rows of a page that carry text, applying the shared filters.
     *
     * @param TsvPage $page
     * @param bool $useWordsOnly when true only level-5 rows are kept
     * @param int|null $minConf when not null, level-5 rows with conf below it are dropped
     * @return TsvRow[]
     */
    private function selectRows(TsvPage $page, $useWordsOnly, $minConf = null)
    {
        $rows = $useWordsOnly ? $page->words() : $page->rows;

        $out = array();
        foreach ($rows as $r) {
            $isWord = ((int)$r->level === 5);
            if ($useWordsOnly && !$isWord) continue;
            if ($minConf !== null && $isWord && $r->conf < $minConf) continue;
            if ($r->text === '') continue;

            $out[] = $r;
        }

        return $out;
    }

    /**
     * Build the "block:par:line" key identifying the line a row belongs to.
     *
     * @param TsvRow $r
     * @return string
     */
    private function lineKeyFor($r)
    {
        return $r->block_num . ':' . $r->par_num . ':' . $r->line_num;
    }

    /**
     * Three-way comparison of two values after casting both to int.
     *
     * @param mixed $x
     * @param mixed $y
     * @return int -1, 0 or 1
     */
    private function cmpInt($x, $y)
    {
        $x = (int)$x;
        $y = (int)$y;
        if ($x === $y) return 0;
        return ($x < $y) ? -1 : 1;
    }

    /**
     * Build line groups keyed by "block:par:line".
     *
     * Rows inside each group are sorted left-to-right (ties by top).
     *
     * @param TsvPage $page
     * @param int $minConf
     * @param bool $useWordsOnly
     * @return array<string, TsvRow[]>
     */
    protected function getLines(TsvPage $page, $minConf, $useWordsOnly)
    {
        $lines = array();
        foreach ($this->selectRows($page, $useWordsOnly, $minConf) as $r) {
            $k = $this->lineKeyFor($r);
            if (!isset($lines[$k])) $lines[$k] = array();
            $lines[$k][] = $r;
        }

        // Sort tokens inside each line
        foreach ($lines as $k => $lineRows) {
            usort($lineRows, array($this, 'cmpLeft'));
            $lines[$k] = $lineRows;
        }

        return $lines;
    }

    /**
     * Render a line (word rows) into text.
     *
     * Rows are ordered left-to-right; adjacent words are separated by one
     * space, or by two spaces when the horizontal gap between them exceeds
     * the configured wideGap. Tabs and line breaks inside a token become spaces.
     *
     * @param TsvRow[] $rows
     * @return string
     */
    protected function renderLineText(array $rows)
    {
        if (!$rows) return '';

        usort($rows, array($this, 'cmpLeft'));

        $out = '';
        $prevRight = null;

        foreach ($rows as $r) {
            $t = $r->text;
            if ($t === '') continue;

            $t = str_replace(array("\t", "\r", "\n"), ' ', $t);

            // A separator is only needed once a previous token has been emitted.
            if ($prevRight !== null) {
                $gap = (int)$r->left - $prevRight;
                $out .= ($gap > $this->wideGap) ? '  ' : ' ';
            }
            $out .= $t;

            $prevRight = (int)$r->right();
        }

        return trim($out);
    }

    /**
     * Render arbitrary word rows into multi-line text by grouping them by line key.
     *
     * Lines are ordered by the top of their bounding box, then by its left edge.
     *
     * @param TsvRow[] $words
     * @return string lines joined with "\n"
     */
    protected function renderTextByLineGroups(array $words)
    {
        if (!$words) return '';

        // Group by line key
        $groups = array();
        foreach ($words as $r) {
            $k = $this->lineKeyFor($r);
            if (!isset($groups[$k])) $groups[$k] = array();
            $groups[$k][] = $r;
        }

        // Build line records with bounds for sorting. Bounds do not depend on
        // row order and renderLineText() sorts the rows itself.
        $lines = array();
        foreach ($groups as $k => $rows) {
            $b = $this->boundsForRows($rows);
            $lines[] = array('key' => $k, 'rows' => $rows, 'top' => $b['top'], 'left' => $b['left']);
        }

        usort($lines, array($this, 'cmpLineTopThenLeft'));

        $outLines = array();
        foreach ($lines as $line) {
            $t = $this->renderLineText($line['rows']);
            if ($t !== '') $outLines[] = $t;
        }

        return implode("\n", $outLines);
    }

    /**
     * Compute the union bounding box of rows.
     *
     * @param TsvRow[] $rows
     * @return array{left:int,top:int,width:int,height:int} all zeros for an empty list
     */
    protected function boundsForRows(array $rows)
    {
        $left = null; $top = null; $right = null; $bottom = null;

        foreach ($rows as $r) {
            $l = (int)$r->left;
            $t = (int)$r->top;
            $rr = (int)$r->right();
            $bb = (int)$r->bottom();

            if ($left === null || $l < $left) $left = $l;
            if ($top === null || $t < $top) $top = $t;
            if ($right === null || $rr > $right) $right = $rr;
            if ($bottom === null || $bb > $bottom) $bottom = $bb;
        }

        if ($left === null) $left = 0;
        if ($top === null) $top = 0;
        if ($right === null) $right = 0;
        if ($bottom === null) $bottom = 0;

        return array(
            'left' => $left,
            'top' => $top,
            'width' => $right - $left,
            'height' => $bottom - $top,
        );
    }

    // -----------------------
    // Comparators
    // -----------------------
    // Coordinates and indices are cast to int before comparing, like everywhere
    // else in this class, so mixed int / numeric-string fields order consistently.

    /**
     * usort comparator: by left, then by top.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int
     */
    public function cmpLeft(TsvRow $a, TsvRow $b)
    {
        $byLeft = $this->cmpInt($a->left, $b->left);
        return ($byLeft !== 0) ? $byLeft : $this->cmpInt($a->top, $b->top);
    }

    /**
     * usort comparator: by top, then by left.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int
     */
    public function cmpTopThenLeft(TsvRow $a, TsvRow $b)
    {
        $byTop = $this->cmpInt($a->top, $b->top);
        return ($byTop !== 0) ? $byTop : $this->cmpInt($a->left, $b->left);
    }

    /**
     * usort comparator: by (block, par, line, word), then top/left as tie-breaker.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int
     */
    public function cmpReading(TsvRow $a, TsvRow $b)
    {
        // Prefer structured ordering when present
        $c = $this->cmpInt($a->block_num, $b->block_num);
        if ($c !== 0) return $c;
        $c = $this->cmpInt($a->par_num, $b->par_num);
        if ($c !== 0) return $c;
        $c = $this->cmpInt($a->line_num, $b->line_num);
        if ($c !== 0) return $c;
        $c = $this->cmpInt($a->word_num, $b->word_num);
        if ($c !== 0) return $c;

        // fallback
        return $this->cmpTopThenLeft($a, $b);
    }

    /**
     * usort comparator for line records (arrays with 'top' and 'left'): by top, then left.
     *
     * @param array $a
     * @param array $b
     * @return int
     */
    public function cmpLineTopThenLeft(array $a, array $b)
    {
        $byTop = $this->cmpInt($a['top'], $b['top']);
        return ($byTop !== 0) ? $byTop : $this->cmpInt($a['left'], $b['left']);
    }
}
