<?php
namespace boru\ocr\Layout\Detector;

use boru\ocr\Layout\LayoutOptions;
use boru\ocr\Layout\Support\BoundsCalculator;
use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * KeyValueTableDetector
 *
 * Detects 2-column "key/value" tables common in drawings (title blocks, weights, specs),
 * menus (item/price), and legal docs (label/amount).
 *
 * Input: ordered lines, where each line is an array of TsvRow tokens.
 *
 * Two scorers are provided:
 *  - scoreLines(): scores a whole set of lines; value tokens are recognised with the
 *    configurable value regex.
 *  - scoreBlock(): scores the sub-range [start, end] of a line list; value tokens are
 *    recognised with the built-in numeric matcher (isNumericLike) and only level-5
 *    TSV rows are considered.
 *
 * PHP 5.6+
 */
class KeyValueTableDetector
{
    /** @var BoundsCalculator Stored by the constructor; not read by the scoring methods of this class. */
    protected $bounds = null;

    /** @var int Minimum number of lines for isCandidate(); clamped to >= 2 by the constructor. */
    protected $minLines = 4;

    /** @var int Minimum raw token count for isCandidate(); clamped to >= 0 by the constructor. */
    protected $minTokens = 24;

    /** @var float Score at or above which isCandidate() returns true. */
    protected $scoreThreshold = 0.58;

    /** @var bool When true, input without any value token always scores 0. */
    protected $requireNumericValue = true;

    /** @var int Pixel tolerance for right-edge alignment; clamped to >= 1 by the constructor. */
    protected $valueAlignTolerancePx = 14;

    /** @var int Lines with more value tokens than this count as "multi-value"; clamped to >= 1. */
    protected $maxValuesPerLine = 1;

    /** @var string PCRE pattern used by isValueToken(). */
    protected $valueRegex = '/^[\$\(\)]?\s*[-+]?\d[\d,]*(\.\d+)?\s*$/';

    /** @var LayoutOptions */
    protected $layoutOptions = null;

    /**
     * Copies the keyValue* settings from the layout options and clamps them to safe minimums.
     *
     * The implicit-nullable `$options = null` signature is kept as-is because the file
     * targets PHP 5.6, where `?LayoutOptions` is not available.
     *
     * @param BoundsCalculator   $bounds
     * @param LayoutOptions|null $options Passed through LayoutOptions::create().
     */
    public function __construct(BoundsCalculator $bounds, LayoutOptions $options = null)
    {
        $this->bounds = $bounds;
        $this->layoutOptions = LayoutOptions::create($options);

        $opts = $this->layoutOptions;
        $this->minLines = (int)$opts->keyValueMinLines;
        $this->minTokens = (int)$opts->keyValueMinTokens;
        $this->scoreThreshold = (float)$opts->keyValueScoreThreshold;
        $this->requireNumericValue = (bool)$opts->keyValueRequireNumericValue;
        $this->valueAlignTolerancePx = (int)$opts->keyValueValueAlignTolerancePx;
        $this->maxValuesPerLine = (int)$opts->keyValueMaxValuesPerLine;

        // An empty or non-string pattern would make every preg_match() call emit a warning
        // and match nothing; keep the built-in default in that case.
        $regex = $opts->keyValueValueRegex;
        if (is_string($regex) && $regex !== '') {
            $this->valueRegex = $regex;
        }

        if ($this->minLines < 2) $this->minLines = 2;
        if ($this->minTokens < 0) $this->minTokens = 0;
        // Must stay >= 1: scoreLines() divides by this value.
        if ($this->valueAlignTolerancePx < 1) $this->valueAlignTolerancePx = 1;
        if ($this->maxValuesPerLine < 1) $this->maxValuesPerLine = 1;
    }

    /** @return int Effective (clamped) minimum line count. */
    public function minLines() { return $this->minLines; }

    /** @return int Effective (clamped) minimum token count. */
    public function minTokens() { return $this->minTokens; }

    /** @return float Score threshold used by isCandidate(). */
    public function threshold() { return $this->scoreThreshold; }

    /**
     * Scores a set of lines as key/value table-ish.
     *
     * Per line, tokens whose trimmed text matches the value regex are "values" and all other
     * non-blank TsvRow tokens are "labels". The value token with the greatest left coordinate
     * represents the line's value column.
     *
     * @param array $lines array<array<TsvRow>>; entries that are not non-empty arrays are skipped
     *                     (they still count towards lineCount).
     * @return array { score:float, features:array, tokenSum:int, valueRightEdges:array<int>, valueCount:int, lineCount:int }
     */
    public function scoreLines(array $lines)
    {
        $lineCount = count($lines);
        $tokenSum = 0;

        $valueRightEdges = array();
        $valueLeftEdges = array();
        $labelRightEdges = array(); // one entry per processed line; 0 when the line has no label
        $linesWithValue = 0;
        $linesWithMultiValue = 0;
        $linesWithLabel = 0;

        foreach ($lines as $ln) {
            // Guard against non-array entries: count()/iteration on them would fail on PHP 8.
            if (!is_array($ln) || count($ln) === 0) continue;

            $tokenSum += count($ln);

            $valueToken = null;   // value token with the greatest left coordinate
            $valueTokenCount = 0;
            $labelCount = 0;
            $labelRight = 0;      // right-most extent of any label token on this line

            foreach ($ln as $r) {
                if (!($r instanceof TsvRow)) continue;
                $t = trim((string)$r->text);
                if ($t === '') continue;

                if ($this->isValueToken($t)) {
                    $valueTokenCount++;
                    // ">=" keeps the later token on equal left coordinates, matching what a
                    // stable left-sort followed by "take the last element" would select.
                    if ($valueToken === null || $r->left >= $valueToken->left) {
                        $valueToken = $r;
                    }
                } else {
                    $labelCount++;
                    $rgt = (int)($r->left + $r->width);
                    if ($rgt > $labelRight) $labelRight = $rgt;
                }
            }

            if ($labelCount > 0) $linesWithLabel++;

            if ($valueToken !== null) {
                $linesWithValue++;
                $valueRightEdges[] = (int)($valueToken->left + $valueToken->width);
                $valueLeftEdges[] = (int)$valueToken->left;

                if ($valueTokenCount > $this->maxValuesPerLine) $linesWithMultiValue++;
            }

            // Lines without a value (e.g. a header row) still contribute to separation stats.
            $labelRightEdges[] = $labelRight;
        }

        $effectiveLines = ($lineCount < 1) ? 1 : $lineCount;

        $valueRatio = $linesWithValue / (float)$effectiveLines;
        $labelRatio = $linesWithLabel / (float)$effectiveLines;

        // Alignment: right edges should cluster tightly (spread measured against the tolerance).
        $alignScore = 0.0;
        $edgeCount = count($valueRightEdges);
        if ($edgeCount >= 2) {
            $range = (float)(max($valueRightEdges) - min($valueRightEdges));
            $alignScore = 1.0 - min(1.0, $range / (float)$this->valueAlignTolerancePx);
        } elseif ($edgeCount === 1) {
            $alignScore = 0.25; // weak signal but not zero
        }

        // Separation: labels should generally end before the (mean) value column start.
        $sepScore = 0.0;
        if (count($valueLeftEdges) > 0 && count($labelRightEdges) > 0) {
            $vl = (int)round(array_sum($valueLeftEdges) / count($valueLeftEdges));
            $gaps = array();
            foreach ($labelRightEdges as $edge) {
                $gaps[] = $vl - (int)$edge;
            }
            // median-ish gap (upper median for an even count)
            sort($gaps);
            $mid = $gaps[(int)floor(count($gaps) / 2)];
            if ($mid > 0) {
                // scale: 0..1 with 30px being "good enough"
                $sepScore = min(1.0, $mid / 30.0);
            }
        }

        // Penalize too many multi-value lines (capped at 0.4).
        $multiPenalty = 0.0;
        if ($linesWithMultiValue > 0) {
            $multiPenalty = min(0.4, ($linesWithMultiValue / (float)$effectiveLines) * 0.6);
        }

        // Numeric requirement gating
        if ($this->requireNumericValue && $linesWithValue === 0) {
            $score = 0.0;
        } else {
            // Weights sum to 1.10, so the result is clamped to [0, 1] afterwards.
            $score = 0.45*$alignScore + 0.30*$valueRatio + 0.20*$sepScore + 0.15*$labelRatio;
            $score -= $multiPenalty;
            if ($score < 0) $score = 0.0;
            if ($score > 1) $score = 1.0;
        }

        return array(
            'score' => $score,
            'tokenSum' => (int)$tokenSum,
            'lineCount' => (int)$lineCount,
            'valueCount' => (int)$linesWithValue,
            'valueRightEdges' => $valueRightEdges,
            'features' => array(
                'valueRatio' => $valueRatio,
                'labelRatio' => $labelRatio,
                'alignScore' => $alignScore,
                'sepScore' => $sepScore,
                'multiValueLines' => (int)$linesWithMultiValue,
                'requireNumericValue' => (bool)$this->requireNumericValue,
            ),
        );
    }

    /**
     * Score a contiguous block of lines as a 2-column key/value table.
     *
     * The range is clamped to the list bounds. Settings are read from the layout options at
     * call time and clamped to the same minimums the constructor enforces. With the
     * 'diagram' profile the line/token minimums are relaxed slightly.
     *
     * @param TsvRow[][] $lines
     * @param int $start   First line index (inclusive).
     * @param int $end     Last line index (inclusive).
     * @param string $profile 'diagram' relaxes the minimums and may add a score bias.
     * @return array { score:float, cols:int, tokens:int, features:array }
     */
    public function scoreBlock(array $lines, $start, $end, $profile = 'document')
    {
        $start = (int)$start;
        $end   = (int)$end;

        $n = count($lines);
        if ($n === 0) return $this->emptyScore('empty');

        if ($start < 0) $start = 0;
        if ($end >= $n) $end = $n - 1;
        if ($end < $start) return $this->emptyScore('empty');

        $block = array();
        $tokenCount = 0;

        for ($i = $start; $i <= $end; $i++) {
            // Missing indexes (sparse input) and non-array entries are treated as empty lines.
            $ln = (isset($lines[$i]) && is_array($lines[$i])) ? $lines[$i] : array();
            $block[] = $ln;
            $tokenCount += count($ln);
        }

        $lineCount = count($block);

        // Apply min gating (with slight relax in diagrams)
        $minLines  = max(2, (int)$this->layoutOptions->keyValueMinLines);
        $minTokens = max(0, (int)$this->layoutOptions->keyValueMinTokens);

        if ($profile === 'diagram') {
            if ($minLines > 3) $minLines = $minLines - 1;
            if ($minTokens > 10) $minTokens = (int)max(10, $minTokens - 8);
        }

        if ($lineCount < $minLines || $tokenCount < $minTokens) {
            return array(
                'score' => 0.0,
                'cols' => 2,
                'tokens' => $tokenCount,
                'features' => array(
                    'reason' => 'below_min',
                    'lineCount' => $lineCount,
                    'tokenCount' => $tokenCount
                )
            );
        }

        $requireValue = (bool)$this->layoutOptions->keyValueRequireNumericValue;
        $tolPx = max(1, (int)$this->layoutOptions->keyValueValueAlignTolerancePx);
        $maxValues = max(1, (int)$this->layoutOptions->keyValueMaxValuesPerLine);

        $valueRightEdges = array();
        $valueCount = 0;
        $multiValueLines = 0;
        $noValueLines = 0;

        $sepGood = 0;
        $sepTotal = 0;

        foreach ($block as $ln) {
            $vals = $this->extractNumericTokens($ln);
            $valN = count($vals);

            if ($valN === 0) {
                $noValueLines++;
                continue;
            }

            if ($valN > $maxValues) $multiValueLines++;

            // choose right-most numeric token as value (first one wins on equal right edges)
            $best = $vals[0];
            for ($k = 1; $k < $valN; $k++) {
                if ($vals[$k]['right'] > $best['right']) $best = $vals[$k];
            }

            $valueRightEdges[] = $best['right'];
            $valueCount++;

            // separation: most non-numeric tokens should be left of valueLeft - margin
            $sepTotal++;
            if ($this->labelValueSeparationScore($ln, $best['left']) >= 0.65) {
                $sepGood++;
            }
        }

        if ($requireValue && $valueCount === 0) {
            return array(
                'score' => 0.0,
                'cols' => 2,
                'tokens' => $tokenCount,
                'features' => array('reason' => 'no_values')
            );
        }

        $presenceRatio = ($lineCount > 0) ? ($valueCount / (float)$lineCount) : 0.0;
        $alignScore = $this->rightEdgeAlignmentScore($valueRightEdges, $tolPx);
        $sepScore = ($sepTotal > 0) ? ($sepGood / (float)$sepTotal) : 0.0;
        $multiPenalty = ($lineCount > 0) ? min(1.0, $multiValueLines / (float)$lineCount) : 0.0;

        // Weighted score; allows one header row without value.
        // Positive weights sum to 1.05, so the result is clamped to [0, 1] below.
        $score =
            (0.45 * $alignScore) +
            (0.35 * $sepScore) +
            (0.25 * $presenceRatio) -
            (0.25 * $multiPenalty);

        // profile bias (optional; only applied when the options object declares the field)
        if ($profile === 'diagram' && property_exists($this->layoutOptions, 'diagramKeyValueScoreBias')) {
            $score += (float)$this->layoutOptions->diagramKeyValueScoreBias;
        }

        if ($score < 0) $score = 0.0;
        if ($score > 1) $score = 1.0;

        return array(
            'score' => $score,
            'cols' => 2,
            'tokens' => $tokenCount,
            'features' => array(
                'alignScore' => $alignScore,
                'separationScore' => $sepScore,
                'valuePresenceRatio' => $presenceRatio,
                'multiValuePenalty' => $multiPenalty,
                'lineCount' => $lineCount,
                'tokenCount' => $tokenCount,
                'noValueLines' => $noValueLines,
                'valueLines' => $valueCount,
            )
        );
    }

    /**
     * Builds the zero-score result returned by scoreBlock() for unusable input.
     *
     * @param string $reason Stored under features.reason.
     * @return array
     */
    protected function emptyScore($reason)
    {
        return array('score' => 0.0, 'cols' => 2, 'tokens' => 0, 'features' => array('reason' => $reason));
    }

    /**
     * Tells whether a token looks like an unsigned number, optionally prefixed with '$'.
     *
     * Accepts e.g.: 511,190  511190  5,111.90  $1,234
     * Spaces and tabs inside the token are ignored.
     *
     * @param string $s
     * @return bool
     */
    protected function isNumericLike($s)
    {
        // Cast first: passing null to trim() is deprecated on PHP 8.1+.
        $s = trim((string)$s);
        $s = str_replace(array(' ', "\t"), '', $s);
        $s = ltrim($s, '$');
        return (bool)preg_match('/^\d[\d,]*(\.\d+)?$/', $s);
    }

    /**
     * Alias of rightEdgeAlignmentScore(), kept for callers/subclasses using the older name.
     *
     * @param int[] $rightEdges
     * @param int $tolerancePx
     * @return float
     */
    protected function alignmentScore(array $rightEdges, $tolerancePx)
    {
        return $this->rightEdgeAlignmentScore($rightEdges, $tolerancePx);
    }

    /**
     * Quick gate: enough lines, enough tokens, and a scoreLines() score at or above the threshold.
     *
     * @param array $lines array<array<TsvRow>>
     * @return bool
     */
    public function isCandidate(array $lines)
    {
        if (count($lines) < $this->minLines) return false;
        $s = $this->scoreLines($lines);
        if ($s['tokenSum'] < $this->minTokens) return false;
        return ((float)$s['score'] >= (float)$this->scoreThreshold);
    }

    /**
     * Tells whether a token matches the configured value regex.
     *
     * @param string $t Token text.
     * @return bool
     */
    protected function isValueToken($t)
    {
        // Strip common trailing punctuation
        $tt = rtrim(trim((string)$t), ':;');
        return (bool)preg_match($this->valueRegex, $tt);
    }

    /**
     * usort() comparator ordering TsvRow tokens by their left coordinate.
     * Returns 0 when either argument is not a TsvRow.
     *
     * No longer called inside this class; kept because it is part of the public interface.
     *
     * @param mixed $a
     * @param mixed $b
     * @return int -1, 0 or 1
     */
    public function cmpLeft($a, $b)
    {
        if (!($a instanceof TsvRow) || !($b instanceof TsvRow)) return 0;
        if ($a->left == $b->left) return 0;
        return ($a->left < $b->left) ? -1 : 1;
    }

    /**
     * Collects the numeric-looking level-5 tokens of a line.
     *
     * @param TsvRow[] $line
     * @return array[] each: ['left'=>int,'right'=>int,'text'=>string]
     */
    protected function extractNumericTokens(array $line)
    {
        $out = array();
        foreach ($line as $r) {
            if (!($r instanceof TsvRow)) continue;
            if ((int)$r->level !== 5) continue;

            // Cast before trimming: a null text would trigger a deprecation on PHP 8.1+.
            $txt = trim((string)$r->text);
            if ($txt === '') continue;
            if (!$this->isNumericLike($txt)) continue;

            $out[] = array(
                'left' => (int)$r->left,
                'right' => (int)$r->right(),
                'text' => $txt,
            );
        }
        return $out;
    }

    /**
     * Returns score 0..1 for how well labels are left of value column.
     *
     * The score is the fraction of non-numeric level-5 tokens whose right edge ends at least
     * 4px before $valueLeftX. It is 0.0 when the line has no such tokens or when
     * $valueLeftX is not positive.
     *
     * @param TsvRow[] $line
     * @param int $valueLeftX
     * @return float
     */
    protected function labelValueSeparationScore(array $line, $valueLeftX)
    {
        if ($valueLeftX <= 0) return 0.0;

        $tot = 0;
        $good = 0;

        foreach ($line as $r) {
            if (!($r instanceof TsvRow)) continue;
            if ((int)$r->level !== 5) continue;

            $txt = trim((string)$r->text);
            if ($txt === '') continue;
            if ($this->isNumericLike($txt)) continue;

            $tot++;
            if ((int)$r->right() <= ($valueLeftX - 4)) $good++;
        }

        return ($tot > 0) ? ($good / (float)$tot) : 0.0;
    }

    /**
     * Alignment score: fraction of right-edges within tolerance of median.
     *
     * Returns 0.0 for no edges and 1.0 for a single edge. For an even number of edges the
     * upper of the two middle values is used as the median.
     *
     * @param int[] $rightEdges
     * @param int $tolerancePx
     * @return float
     */
    protected function rightEdgeAlignmentScore(array $rightEdges, $tolerancePx)
    {
        $n = count($rightEdges);
        if ($n <= 0) return 0.0;
        if ($n === 1) return 1.0;

        sort($rightEdges);
        $median = $rightEdges[(int)floor($n / 2)];

        $inTol = 0;
        foreach ($rightEdges as $x) {
            if (abs($x - $median) <= $tolerancePx) $inTol++;
        }

        return $inTol / (float)$n;
    }
}
