<?php

namespace boru\ocr\Layout\Detector;

use boru\ocr\Tesseract\Tsv\TsvPage;
use boru\ocr\Layout\Support\BoundsCalculator;

/**
 * Heuristic table/ledger detector.
 *
 * Detects pages that are "grid/table dominated" (bank statements, ledgers, invoices).
 * This is distinct from multi-column narrative and from engineering diagrams.
 *
 * Inputs: TSV words + grouped lines (array of TsvRow[]).
 * Output: TableLayoutDecision with score and feature breakdown.
 */
class TableLayoutDetector
{
    /**
     * Minimum blended score (0..1) required before a page can be flagged as a table.
     *
     * @var float
     */
    protected $minScoreToTrigger = 0.62;

    /**
     * Pages with fewer words than this are rejected before any scoring.
     *
     * @var int
     */
    protected $minWordCount = 60;

    /**
     * Pages with fewer grouped lines than this are rejected before any scoring.
     *
     * @var int
     */
    protected $minLineCount = 12;

    /**
     * Numeric-token ratio that satisfies the secondary gate in detect().
     *
     * @var float
     */
    protected $minNumericRatio = 0.22;

    /**
     * Column alignment score that satisfies the secondary gate in detect().
     *
     * @var float
     */
    protected $minColumnAlignmentScore = 0.45;

    /**
     * Injected (or default-constructed) bounds helper. None of the methods in
     * this class read it; it is stored for subclasses and API compatibility.
     *
     * @var BoundsCalculator
     */
    protected $bounds;

    /**
     * Recognised option keys (each overrides the same-named property):
     * minScoreToTrigger, minWordCount, minLineCount, minNumericRatio,
     * minColumnAlignmentScore. Unknown keys are ignored.
     *
     * The implicit-nullable first parameter is kept as-is so the file stays
     * loadable on the older PHP versions its syntax targets.
     *
     * @param BoundsCalculator|null $bounds  Falls back to a new BoundsCalculator when omitted.
     * @param array                 $options Threshold overrides, see above.
     */
    public function __construct(BoundsCalculator $bounds = null, array $options = array())
    {
        if (isset($options['minScoreToTrigger'])) $this->minScoreToTrigger = (float)$options['minScoreToTrigger'];
        if (isset($options['minWordCount'])) $this->minWordCount = (int)$options['minWordCount'];
        if (isset($options['minLineCount'])) $this->minLineCount = (int)$options['minLineCount'];
        if (isset($options['minNumericRatio'])) $this->minNumericRatio = (float)$options['minNumericRatio'];
        if (isset($options['minColumnAlignmentScore'])) $this->minColumnAlignmentScore = (float)$options['minColumnAlignmentScore'];

        $this->bounds = $bounds ?: new BoundsCalculator();
    }

    /**
     * Score a page for "table-likeness" and decide whether it is table dominated.
     *
     * The score is a weighted blend of four features:
     *   - numeric token ratio      (weight 0.35)
     *   - column alignment         (weight 0.30)
     *   - column repetition        (weight 0.25)
     *   - tokens-per-line density  (weight 0.10)
     *
     * A page is flagged only when the score reaches minScoreToTrigger AND at
     * least one of numericRatio / columnAlignmentScore reaches its own minimum.
     * Every intermediate value is recorded in the decision's feature array.
     *
     * @param TsvPage $page
     * @param array<int,array> $lines array of lines, each is array of TsvRow
     * @return TableLayoutDecision
     */
    public function detect(TsvPage $page, array $lines)
    {
        $features = array();

        // Normalise a null/false/empty result so the loop below is always safe,
        // even when minWordCount has been configured down to 0.
        $words = $page->words();
        if (!$words) $words = array();

        $wordCount = count($words);
        $lineCount = count($lines);

        $features['wordCount'] = $wordCount;
        $features['lineCount'] = $lineCount;

        // Too little content to judge: bail out with a zero score.
        if ($wordCount < $this->minWordCount || $lineCount < $this->minLineCount) {
            return new TableLayoutDecision(false, 0.0, $features);
        }

        // Numeric-ish tokens ratio (over non-empty tokens only)
        $numeric = 0;
        $nonempty = 0;
        foreach ($words as $w) {
            // Cast guards against null text (trim(null) is deprecated on PHP 8.1+).
            $t = trim((string)$w->text);
            if ($t === '') continue;
            $nonempty++;

            if ($this->isNumericToken($t)) $numeric++;
        }
        // Explicit float cast: int/int division yields an int when it divides evenly.
        $numericRatio = $nonempty > 0 ? (float)($numeric / $nonempty) : 0.0;
        $features['numericRatio'] = $numericRatio;

        // Average non-empty tokens per line
        $tokensPerLine = array();
        foreach ($lines as $ln) {
            $c = 0;
            foreach ($ln as $r) {
                if (trim((string)$r->text) !== '') $c++;
            }
            $tokensPerLine[] = $c;
        }
        $avgTokensPerLine = $this->avg($tokensPerLine);
        $features['avgTokensPerLine'] = $avgTokensPerLine;

        // Column alignment score: concentration of tokens in a few left-edge buckets
        $alignmentScore = $this->columnAlignmentScore($lines);
        $features['columnAlignmentScore'] = $alignmentScore;

        // Repetition / "ruler" score: how many left-edge buckets recur across lines
        $repetitionScore = $this->columnRepetitionScore($lines);
        $features['columnRepetitionScore'] = $repetitionScore;

        // Table score: weighted blend. Each feature is rescaled into 0..1 first.
        $score = 0.0;

        // numeric ratio contributes strongly (0.10 -> 0, 0.50 -> 1)
        $score += $this->clamp(($numericRatio - 0.10) / 0.40, 0.0, 1.0) * 0.35;

        // column alignment (0.20 -> 0, 0.80 -> 1)
        $score += $this->clamp(($alignmentScore - 0.20) / 0.60, 0.0, 1.0) * 0.30;

        // repetition (columns repeating across lines; 0.20 -> 0, 0.80 -> 1)
        $score += $this->clamp(($repetitionScore - 0.20) / 0.60, 0.0, 1.0) * 0.25;

        // token density: peaks at 7 tokens/line and falls off linearly to 0 at a distance of 10
        $score += $this->clamp(1.0 - (abs($avgTokensPerLine - 7.0) / 10.0), 0.0, 1.0) * 0.10;

        $features['score'] = $score;

        $isTable = ($score >= $this->minScoreToTrigger)
            && ($numericRatio >= $this->minNumericRatio || $alignmentScore >= $this->minColumnAlignmentScore);

        return new TableLayoutDecision($isTable, $score, $features);
    }

    /**
     * Tell whether a trimmed token looks like a number.
     *
     * Accepted shapes: optional currency symbol ($, € or £), optional minus,
     * digits with optional thousands commas, optional decimal part, optional
     * trailing percent sign; or a simple fraction.
     * Examples: 1,234.56  -12.00  $55.00  €55.00  10%  1/2
     *
     * @param string $t Already-trimmed token text.
     * @return bool
     */
    protected function isNumericToken($t)
    {
        // The currency symbols are written as an alternation rather than a
        // character class: without the "u" modifier PCRE works on bytes, so a
        // class holding the multi-byte UTF-8 symbols would consume only one
        // byte of "€"/"£" and the token would never match.
        if (preg_match('/^(?:\$|€|£)?\-?[0-9][0-9,]*([.][0-9]+)?%?$/', $t)) return true;
        if (preg_match('/^\-?[0-9]+\/[0-9]+$/', $t)) return true;
        return false;
    }

    /**
     * Fraction of non-empty tokens whose left edge falls into the 8 most
     * populated 12px-wide buckets. A high value means tokens start at a small
     * number of x positions, i.e. columns exist.
     *
     * @param array<int,array> $lines array of lines, each an array of rows exposing ->text and ->left
     * @return float 0.0 when fewer than 50 non-empty tokens were seen, otherwise a value in (0, 1].
     */
    protected function columnAlignmentScore(array $lines)
    {
        // Bucket left positions at 12px granularity and see how concentrated they are
        $bucket = array();
        $total = 0;

        foreach ($lines as $ln) {
            foreach ($ln as $r) {
                $t = trim((string)$r->text);
                if ($t === '') continue;
                $b = (int)floor(((int)$r->left) / 12);
                if (!isset($bucket[$b])) $bucket[$b] = 0;
                $bucket[$b]++;
                $total++;
            }
        }
        // Too few tokens for the concentration measure to mean anything.
        if ($total < 50) return 0.0;

        // Largest buckets first, then keep the top 8.
        arsort($bucket);
        $top = array_slice(array_values($bucket), 0, 8);
        $sumTop = array_sum($top);

        // If a small set of x-buckets account for lots of tokens, columns exist.
        // Cast keeps the return type a float even when the division is exact.
        return (float)($sumTop / $total);
    }

    /**
     * Count the 16px-wide left-edge buckets that occur in at least
     * floor(35%) of the non-empty lines, normalised so that 12 such
     * buckets (or more) yield 1.0.
     *
     * @param array<int,array> $lines array of lines, each an array of rows exposing ->text and ->left
     * @return float 0.0 when fewer than 10 lines contain a non-empty token, otherwise a value in [0, 1].
     */
    protected function columnRepetitionScore(array $lines)
    {
        // For each line, compute which buckets appear, then count how often those buckets repeat across lines.
        $lineBuckets = array();

        foreach ($lines as $ln) {
            $set = array();
            foreach ($ln as $r) {
                $t = trim((string)$r->text);
                if ($t === '') continue;
                $b = (int)floor(((int)$r->left) / 16);
                // Keyed by bucket so each bucket counts once per line.
                $set[$b] = true;
            }
            if (!empty($set)) $lineBuckets[] = array_keys($set);
        }

        $lineCount = count($lineBuckets);
        if ($lineCount < 10) return 0.0;

        // Number of lines in which each bucket occurs.
        $freq = array();
        foreach ($lineBuckets as $bs) {
            foreach ($bs as $b) {
                if (!isset($freq[$b])) $freq[$b] = 0;
                $freq[$b]++;
            }
        }

        // Consider buckets that appear in many lines
        $threshold = (int)floor($lineCount * 0.35);
        $common = 0;
        foreach ($freq as $c) {
            if ($c >= $threshold) $common++;
        }

        // Normalize by expected columns (clamp 0..1 with 12 columns as "full")
        return $this->clamp($common / 12.0, 0.0, 1.0);
    }

    /**
     * Arithmetic mean of a list of numbers.
     *
     * @param array $ints
     * @return float 0.0 for an empty list.
     */
    protected function avg(array $ints)
    {
        $n = count($ints);
        if ($n === 0) return 0.0;
        $sum = 0.0;
        foreach ($ints as $v) $sum += (float)$v;
        return $sum / $n;
    }

    /**
     * Restrict a value to the inclusive range [$min, $max].
     *
     * @param float|int $v
     * @param float|int $min
     * @param float|int $max
     * @return float|int $min or $max when $v is out of range, otherwise $v unchanged.
     */
    protected function clamp($v, $min, $max)
    {
        if ($v < $min) return $min;
        if ($v > $max) return $max;
        return $v;
    }
}
