<?php

namespace boru\ocr\Layout\Detector;

use boru\ocr\Layout\LayoutOptions;
use boru\ocr\Tesseract\Tsv\TsvPage;
use boru\ocr\Tesseract\Tsv\TsvRow;
use boru\ocr\Layout\Support\BoundsCalculator;

/**
 * Deterministic page-type detector: classifies a page as "document" or
 * "diagram" (mechanical drawing / schematic / scattered labels).
 *
 * This does NOT reorder lines; it only decides which strategy should be used.
 *
 * The decision is a weighted mix of four features, each clamped to [0, 1]:
 *  - scatter      (0.40) percentile spread of word positions,
 *  - clusterScore (0.25) number of coarse grid cells that contain a word,
 *  - shortRatio   (0.20) share of words that are short alphanumeric tokens,
 *  - flowScore    (0.15) how few words there are per line.
 */
class LayoutProfileDetector
{
    /** @var float Pages scoring at or above this value are classified as diagrams. */
    protected $diagramScoreThreshold = 0.62;

    /** @var int Maximum alphanumeric length for a token to count as "short". */
    protected $shortTokenLen = 4;

    /** @var int Edge length (in TSV coordinate units) of the square buckets used for cluster estimation. */
    protected $clusterBucketSize = 140;

    /** @var BoundsCalculator */
    protected $bounds;

    /** @var LayoutOptions */
    protected $layoutOptions = null;

    /**
     * @param BoundsCalculator   $bounds  Stored on the instance; not used by the detection code in this class.
     * @param LayoutOptions|null $options Passed through LayoutOptions::create(); its diagramScoreThreshold
     *                                    and shortTokenLen override the built-in defaults when numeric.
     */
    public function __construct(BoundsCalculator $bounds, LayoutOptions $options = null)
    {
        $this->bounds = $bounds;
        $this->layoutOptions = LayoutOptions::create($options);

        // Only override the defaults with usable values. A null threshold would make
        // "$score >= null" true for every page and classify everything as a diagram.
        $threshold = $this->layoutOptions->diagramScoreThreshold;
        if (is_numeric($threshold)) {
            $this->diagramScoreThreshold = (float)$threshold;
        }

        $shortLen = $this->layoutOptions->shortTokenLen;
        if (is_numeric($shortLen)) {
            $this->shortTokenLen = (int)$shortLen;
        }
    }

    /**
     * Decide layout profile.
     *
     * Only rows accepted by isCountableWord() contribute to the word-based features.
     *
     * @param TsvPage $page
     * @param TsvRow[] $rows Word rows (prefer level=5) after filtering.
     * @param array<int, TsvRow[]> $lines
     * @return LayoutProfileDecision Carries profile ('diagram'|'document'), diagramScore,
     *                               threshold and the raw feature values.
     */
    public function detect(TsvPage $page, array $rows, array $lines)
    {
        $d = new LayoutProfileDecision();

        $wordCount = 0;
        $shortCount = 0;

        $xs = array();
        $ys = array();

        foreach ($rows as $r) {
            if (!$this->isCountableWord($r)) continue;

            $wordCount++;

            // Length is measured on ASCII letters/digits only, so punctuation
            // around a label does not stop it from counting as short.
            $alphaNum = preg_replace('/[^A-Za-z0-9]/', '', trim((string)$r->text));
            if ($alphaNum !== '' && strlen($alphaNum) <= $this->shortTokenLen) {
                $shortCount++;
            }

            $xs[] = (int)$r->left;
            $ys[] = (int)$r->top;
        }

        $lineCount = count($lines);
        $avgTokensPerLine = $this->avgTokensPerLine($lines);

        $clusterCount = $this->estimateClusterCount($rows);
        $scatter = $this->scatterScore($xs, $ys, $page);

        $shortRatio = $wordCount > 0 ? ($shortCount / $wordCount) : 0.0;

        // Diagram score: weighted mix
        $clusterScore = $this->clamp(($clusterCount - 3) / 12.0);       // 0 at 3 clusters, 1 at 15+
        $flowScore    = $this->clamp((6.0 - $avgTokensPerLine) / 6.0);  // 1 when avgTokens<=0, 0 when >=6

        $score =
            (0.40 * $scatter) +
            (0.25 * $clusterScore) +
            (0.20 * $this->clamp($shortRatio / 0.65)) +
            (0.15 * $flowScore);

        $isDiagram = ($score >= $this->diagramScoreThreshold);

        // Populate decision in a "stable" shape (even if LayoutProfileDecision is permissive).
        $d->profile = $isDiagram ? 'diagram' : 'document';
        $d->diagramScore = $score;
        $d->threshold = $this->diagramScoreThreshold;

        $d->features = array(
            'wordCount' => $wordCount,
            'lineCount' => $lineCount,
            'avgTokensPerLine' => $avgTokensPerLine,
            'shortRatio' => $shortRatio,
            'clusterCount' => $clusterCount,
            'scatter' => $scatter,
        );

        return $d;
    }

    /**
     * Whether a row should be counted as a word by the detector.
     *
     * A row counts when it is at level 5, has a non-negative confidence and
     * its text is non-empty after trimming. Whitespace-only (or null) text is
     * rejected so it cannot inflate word counts, clusters or scatter.
     *
     * @param object $row Row exposing level, conf and text properties.
     * @return bool
     */
    protected function isCountableWord($row)
    {
        if ((int)$row->level !== 5) return false;
        if ($row->conf < 0) return false;

        return trim((string)$row->text) !== '';
    }

    /**
     * Average number of countable words per line, ignoring lines that
     * contain no countable word.
     *
     * @param array<int, TsvRow[]> $lines
     * @return float 0.0 when there are no lines or no line has a countable word.
     */
    protected function avgTokensPerLine(array $lines)
    {
        if (!$lines) return 0.0;

        $total = 0;
        $count = 0;

        foreach ($lines as $ln) {
            $c = 0;
            foreach ($ln as $r) {
                if ($this->isCountableWord($r)) {
                    $c++;
                }
            }
            if ($c > 0) {
                $total += $c;
                $count++;
            }
        }

        if ($count === 0) return 0.0;
        return (float)$total / (float)$count;
    }

    /**
     * Estimate number of spatial clusters of words via coarse bucketing.
     * Cheap + stable for php5.6.
     *
     * Each countable word is assigned to a square grid cell by its left/top
     * coordinates; the result is the number of distinct occupied cells.
     *
     * @param TsvRow[] $rows
     * @return int
     */
    protected function estimateClusterCount(array $rows)
    {
        // Bucket size is intentionally coarse: we care about "scattered labels" vs "paragraph block".
        // Guard against a non-positive override so the division below is always defined.
        $bucket = max(1, (int)$this->clusterBucketSize);

        $seen = array();

        foreach ($rows as $r) {
            if (!$this->isCountableWord($r)) continue;

            $bx = (int)floor(((int)$r->left) / $bucket);
            $by = (int)floor(((int)$r->top) / $bucket);

            $seen[$bx . ':' . $by] = true;
        }

        return count($seen);
    }

    /**
     * Scatter score based on normalized spread of word positions.
     *
     * For each axis, the distance between the 20th and 80th percentile
     * coordinate is divided by the observed min-max span; the two ratios are
     * combined as 0.55 * x + 0.45 * y and clamped to [0, 1].
     *
     * @param int[] $xs
     * @param int[] $ys
     * @param TsvPage $page
     * @return float 0.0 when fewer than 8 coordinates are supplied on either axis.
     */
    protected function scatterScore(array $xs, array $ys, TsvPage $page)
    {
        if (count($xs) < 8 || count($ys) < 8) return 0.0;

        sort($xs);
        sort($ys);

        $minX = $xs[0];
        $maxX = $xs[count($xs) - 1];
        $minY = $ys[0];
        $maxY = $ys[count($ys) - 1];

        $spanX = max(1, $maxX - $minX);
        $spanY = max(1, $maxY - $minY);

        // When the page reports dimensions, cap the observed span at the page size.
        // NOTE(review): this caps the span; it does not normalise to the page size.
        // Kept as-is so scores stay comparable with the configured threshold - confirm intent.
        $pageW = isset($page->width) ? (int)$page->width : 0;
        $pageH = isset($page->height) ? (int)$page->height : 0;

        if ($pageW > 0) $spanX = max(1, min($spanX, $pageW));
        if ($pageH > 0) $spanY = max(1, min($spanY, $pageH));

        // Use IQR-ish dispersion: distance between 20th and 80th percentile
        $x20 = $xs[(int)floor(count($xs) * 0.2)];
        $x80 = $xs[(int)floor(count($xs) * 0.8)];
        $y20 = $ys[(int)floor(count($ys) * 0.2)];
        $y80 = $ys[(int)floor(count($ys) * 0.8)];

        $dx = (int)$x80 - (int)$x20;
        $dy = (int)$y80 - (int)$y20;

        $nx = $dx / (float)$spanX;
        $ny = $dy / (float)$spanY;

        // "Scattered labels" tends to have higher dispersion in both axes.
        return $this->clamp((0.55 * $nx) + (0.45 * $ny));
    }

    /**
     * Clamp a value to the closed interval [0.0, 1.0].
     *
     * @param float $v
     * @return float
     */
    protected function clamp($v)
    {
        $v = (float)$v;
        if ($v < 0.0) return 0.0;
        if ($v > 1.0) return 1.0;
        return $v;
    }
}
