<?php

namespace boru\ocr\Layout\Support;

use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * Detect "table-like" contiguous regions inside an ordered line list.
 *
 * This is heuristic and intentionally lightweight:
 * - bucket token x positions into columns
 * - compute a per-line "column signature"
 * - look for repeating signatures / stable column counts across a sliding window
 *
 * Output regions are ranges of line indices with a score.
 *
 * PHP 5.6+
 */
class TableRegionDetector
{
    /**
     * Supplies per-line bounding boxes through lineBounds().
     * This class reads the keys left/top/right/bottom/width/height from the returned box.
     *
     * @var BoundsCalculator
     */
    protected $bounds;

    /** @var int Width, in pixels, of the horizontal buckets that token x positions are grouped into. */
    protected $bucketPx = 18;

    /** @var int Number of lines in the sliding window used to score each line. */
    protected $window = 7;

    /** @var int Minimum number of lines in the input, and in any reported region. */
    protected $minLines = 5;

    /** @var float Per-line score (0..1) at or above which a line counts as part of a region. */
    protected $scoreThreshold = 0.55;

    /** @var int Minimum total token count of a reported region; also normalizes the token-density term. */
    protected $minTokens = 60;

    /** @var int Minimum bucket ("column") count; lines below it are penalized, regions below it are dropped. */
    protected $minCols = 3;

    /**
     * @param BoundsCalculator $bounds  bounding-box helper used for every non-empty line
     * @param array            $options optional overrides, read from the keys tableRegionBucketPx,
     *                                  tableRegionWindow, tableRegionMinLines, tableRegionScoreThreshold,
     *                                  tableRegionMinTokens and tableRegionMinCols
     */
    public function __construct(BoundsCalculator $bounds, array $options = array())
    {
        $this->bounds = $bounds;

        if (isset($options['tableRegionBucketPx'])) $this->bucketPx = (int)$options['tableRegionBucketPx'];
        if (isset($options['tableRegionWindow'])) $this->window = (int)$options['tableRegionWindow'];
        if (isset($options['tableRegionMinLines'])) $this->minLines = (int)$options['tableRegionMinLines'];
        if (isset($options['tableRegionScoreThreshold'])) $this->scoreThreshold = (float)$options['tableRegionScoreThreshold'];
        if (isset($options['tableRegionMinTokens'])) $this->minTokens = (int)$options['tableRegionMinTokens'];
        if (isset($options['tableRegionMinCols'])) $this->minCols = (int)$options['tableRegionMinCols'];

        // Clamp to sane ranges; bucketPx in particular is used as a divisor.
        if ($this->bucketPx < 1) $this->bucketPx = 1;
        if ($this->window < 3) $this->window = 3;
        if ($this->minLines < 2) $this->minLines = 2;
        if ($this->scoreThreshold < 0.0) $this->scoreThreshold = 0.0;
        if ($this->scoreThreshold > 1.0) $this->scoreThreshold = 1.0;
        if ($this->minCols < 2) $this->minCols = 2;
    }

    /**
     * Find contiguous runs of lines that look like table rows.
     *
     * Line indices are positions 0..count-1; a missing or non-array entry at a
     * position is treated as an empty line.
     *
     * @param array $orderedLines array<int, array<int,TsvRow>>
     * @return array<int,array> regions: {start,end,score,cols,tokens,bbox:{x,y,w,h}}
     */
    public function detect(array $orderedLines)
    {
        $n = count($orderedLines);
        if ($n < $this->minLines) return array();

        $sigs = array();
        $colCounts = array();
        $tokenCounts = array();
        $bboxes = array();

        for ($i = 0; $i < $n; $i++) {
            $profile = $this->profileLine(isset($orderedLines[$i]) ? $orderedLines[$i] : null);
            $sigs[$i] = $profile['sig'];
            $colCounts[$i] = $profile['cols'];
            $tokenCounts[$i] = $profile['tokens'];
            $bboxes[$i] = $profile['bbox'];
        }

        // sliding window score per line (centered-ish)
        $scores = array();
        for ($i = 0; $i < $n; $i++) {
            $scores[$i] = $this->scoreLine($i, $n, $sigs, $colCounts, $tokenCounts);
        }

        // merge contiguous lines above threshold into regions
        $regions = array();
        $in = false;
        $rs = 0;
        for ($i = 0; $i < $n; $i++) {
            $ok = ($scores[$i] >= $this->scoreThreshold);
            if ($ok && !$in) {
                $in = true;
                $rs = $i;
            } elseif (!$ok && $in) {
                $re = $i - 1;
                $this->maybeAddRegion($regions, $rs, $re, $scores, $orderedLines, $bboxes, $colCounts, $tokenCounts);
                $in = false;
            }
        }
        if ($in) {
            // the last region runs to the end of the input
            $this->maybeAddRegion($regions, $rs, $n - 1, $scores, $orderedLines, $bboxes, $colCounts, $tokenCounts);
        }

        return $regions;
    }

    /**
     * Summarize one line: column signature, column count, token count and bounding box.
     *
     * @param mixed $line expected to be an array of TsvRow; anything else counts as an empty line
     * @return array {sig:string, cols:int, tokens:int, bbox:array|null}
     */
    private function profileLine($line)
    {
        $empty = array('sig' => '', 'cols' => 0, 'tokens' => 0, 'bbox' => null);
        if (!is_array($line) || !$line) return $empty;

        // Drop foreign entries BEFORE sorting: cmpLeft() is type-hinted to TsvRow,
        // so sorting a mixed list would raise a type error instead of skipping them.
        $rows = array();
        foreach ($line as $r) {
            if ($r instanceof TsvRow) $rows[] = $r;
        }
        if (!$rows) return $empty;

        // left-sort tokens
        usort($rows, array($this, 'cmpLeft'));

        $cols = array();
        $tok = 0;
        foreach ($rows as $r) {
            $t = (string)$r->text;
            if ($t === '') continue; // rows without text do not count as tokens
            $tok++;
            $bucket = (int)floor((int)$r->left / $this->bucketPx);
            $cols[$bucket] = 1;
        }

        $keys = array_keys($cols);
        sort($keys);

        return array(
            // compress buckets into signature like "3|7|11"; '' when no row carried text
            'sig' => implode('|', $keys),
            'cols' => count($keys),
            'tokens' => $tok,
            'bbox' => $this->bounds->lineBounds($rows),
        );
    }

    /**
     * Score line $i (0..1) from the lines inside its sliding window.
     *
     * Blend: 0.45 share of the most frequent signature, 0.30 column-count
     * stability, 0.15 column count of line $i itself, 0.10 token density.
     *
     * @param int   $i           index of the line being scored
     * @param int   $n           total number of lines
     * @param array $sigs        per-line column signatures ('' marks a line to ignore)
     * @param array $colCounts   per-line bucket counts
     * @param array $tokenCounts per-line token counts
     * @return float
     */
    private function scoreLine($i, $n, array $sigs, array $colCounts, array $tokenCounts)
    {
        // Center the window on $i, then shift it back inside [0, n-1] near the edges.
        $w0 = max(0, $i - (int)floor($this->window / 2));
        $w1 = min($n - 1, $w0 + $this->window - 1);
        $w0 = max(0, $w1 - $this->window + 1);

        $hist = array();
        $cc = array();
        $tokSum = 0;
        for ($j = $w0; $j <= $w1; $j++) {
            if ($sigs[$j] === '') continue;
            $tokSum += (int)$tokenCounts[$j];
            $hist[$sigs[$j]] = isset($hist[$sigs[$j]]) ? $hist[$sigs[$j]] + 1 : 1;
            $cc[] = (int)$colCounts[$j];
        }

        // too few usable lines in the window to judge repetition
        $lineCount = count($cc);
        if ($lineCount < 3) return 0.0;

        $topRepeat = (float)max($hist) / (float)$lineCount;

        // column stability: 1 - (population stddev / mean), floored at 0
        $mean = (float)array_sum($cc) / (float)$lineCount;
        $var = 0.0;
        foreach ($cc as $v) {
            $d = $v - $mean;
            $var += $d * $d;
        }
        $sd = sqrt($var / (float)$lineCount);
        $stability = ($mean > 0.0) ? max(0.0, 1.0 - ($sd / $mean)) : 0.0;

        // alignment proxy: prefer more columns (saturates at 7 columns)
        $colsHere = (int)$colCounts[$i];
        $colScore = min(1.0, max(0.0, ($colsHere - 1) / 6.0));

        // token density proxy: window token total relative to minTokens
        $tokScore = min(1.0, (float)$tokSum / (float)max(1, $this->minTokens));

        // weighted blend
        $score = (0.45 * $topRepeat) + (0.30 * $stability) + (0.15 * $colScore) + (0.10 * $tokScore);

        // enforce minimum columns gate
        if ($colsHere < $this->minCols) $score *= 0.3;

        return $score;
    }

    /**
     * Append the region [$start, $end] to $regions when it passes the size gates.
     *
     * The region is dropped when it has fewer than minLines lines, fewer than
     * minTokens tokens in total, or no line with at least minCols columns.
     *
     * @param array $regions      list of regions, appended to in place
     * @param int   $start        first line index (inclusive)
     * @param int   $end          last line index (inclusive)
     * @param array $scores       per-line scores; the region score is their mean
     * @param array $orderedLines original lines (not read by this implementation)
     * @param array $bboxes       per-line bounding boxes, null for empty lines
     * @param array $colCounts    per-line bucket counts; the region reports the maximum
     * @param array $tokenCounts  per-line token counts; the region reports the sum
     * @return void
     */
    protected function maybeAddRegion(&$regions, $start, $end, array $scores, array $orderedLines, array $bboxes, array $colCounts, array $tokenCounts)
    {
        $len = (int)$end - (int)$start + 1;
        if ($len < $this->minLines) return;

        $scoreSum = 0.0;
        $scoreN = 0;
        $tokSum = 0;
        $colMax = 0;

        $bb = null;
        for ($i = $start; $i <= $end; $i++) {
            $scoreSum += (float)$scores[$i];
            $scoreN++;
            $tokSum += (int)$tokenCounts[$i];
            $colMax = max($colMax, (int)$colCounts[$i]);

            // grow the region box over every line that has one
            if ($bboxes[$i]) {
                if (!$bb) {
                    $bb = $bboxes[$i];
                } else {
                    $bb = $this->mergeBounds($bb, $bboxes[$i]);
                }
            }
        }

        if ($tokSum < $this->minTokens) return;
        if ($colMax < $this->minCols) return;

        $regions[] = array(
            'start' => (int)$start,
            'end' => (int)$end,
            'score' => ($scoreN > 0) ? ($scoreSum / (float)$scoreN) : 0.0,
            'cols' => (int)$colMax,
            'tokens' => (int)$tokSum,
            'bbox' => $bb ? array('x' => (int)$bb['left'], 'y' => (int)$bb['top'], 'w' => (int)$bb['width'], 'h' => (int)$bb['height']) : null,
        );
    }

    /**
     * Smallest box enclosing both inputs.
     *
     * @param array $a box with left/top/right/bottom keys
     * @param array $b box with left/top/right/bottom keys
     * @return array box with left/top/right/bottom plus derived width/height
     */
    protected function mergeBounds(array $a, array $b)
    {
        $l = min((int)$a['left'], (int)$b['left']);
        $t = min((int)$a['top'], (int)$b['top']);
        $r = max((int)$a['right'], (int)$b['right']);
        $btm = max((int)$a['bottom'], (int)$b['bottom']);
        return array(
            'left' => $l,
            'top' => $t,
            'right' => $r,
            'bottom' => $btm,
            'width' => $r - $l,
            'height' => $btm - $t,
        );
    }

    /**
     * usort() comparator ordering rows by ascending left coordinate.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int -1, 0 or 1
     */
    public function cmpLeft(TsvRow $a, TsvRow $b)
    {
        if ($a->left === $b->left) return 0;
        return ($a->left < $b->left) ? -1 : 1;
    }
}
