<?php

namespace boru\ocr\Layout;

use boru\ocr\Tesseract\Tsv\TsvPage;
use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * Builds baseline readable text for a page from TSV.
 *
 * This is NOT the final output. It's a deterministic baseline that can:
 *  - replace "AI first-pass OCR" as layout/reading-order input
 *  - be fed to a final agent with TSV evidence callbacks
 *
 * Heuristics used:
 *  - Prefer word-level rows (level=5) if available, else use all rows.
 *  - Group by (block, par, line) when possible, else cluster rows by top coordinate.
 *  - Within a line, sort by left coordinate.
 *  - Across lines, sort by top coordinate, then left.
 *  - Insert spaces based on gaps between words.
 *  - Optionally filter low-confidence tokens and blank (null / whitespace-only) tokens.
 *  - Optionally reorder two-column sections so each column is read top to bottom.
 */
class TsvLayoutBuilder
{
    /** @var int Minimum confidence to include a word (tesseract conf is usually -1..100) */
    protected $minConf;

    /** @var bool Whether to drop empty tokens */
    protected $dropEmpty;

    /** @var bool Whether to attempt light multi-column ordering */
    protected $multiColumn;

    /** @var int Gap threshold (pixels) to insert an extra space */
    protected $gapExtraSpacePx;

    /** @var int Max distance (pixels) between a row's top and the first row of a line in fallback clustering */
    protected $lineTopTolerancePx;

    /**
     * @param array $options
     *   - minConf (int) default 0
     *   - dropEmpty (bool) default true
     *   - multiColumn (bool) default true
     *   - gapExtraSpacePx (int) default 12
     *   - lineTopTolerancePx (int) default 8; only used when rows carry no block/par/line numbers
     */
    public function __construct(array $options = array())
    {
        $this->minConf = isset($options['minConf']) ? (int)$options['minConf'] : 0;
        $this->dropEmpty = array_key_exists('dropEmpty', $options) ? (bool)$options['dropEmpty'] : true;
        $this->multiColumn = array_key_exists('multiColumn', $options) ? (bool)$options['multiColumn'] : true;
        $this->gapExtraSpacePx = isset($options['gapExtraSpacePx']) ? (int)$options['gapExtraSpacePx'] : 12;
        $this->lineTopTolerancePx = isset($options['lineTopTolerancePx'])
            ? max(0, (int)$options['lineTopTolerancePx'])
            : 8;
    }

    /**
     * Build baseline text for a single page.
     *
     * Pipeline: select rows -> group into lines -> sort -> optional column
     * reorder -> render. Lines that render to nothing but whitespace are omitted.
     *
     * @param TsvPage $page
     * @return string Rendered lines joined with "\n"; '' when nothing usable remains.
     */
    public function buildPageText(TsvPage $page)
    {
        $rows = $this->selectRows($page);

        if (!$rows) {
            return '';
        }

        // Group rows into "lines"
        $lines = $this->groupIntoLines($rows);

        if (!$lines) {
            return '';
        }

        // Sort lines in reading order
        $this->sortLines($lines);

        // Optionally apply multi-column grouping/reordering
        if ($this->multiColumn) {
            $lines = $this->reorderForMultiColumn($lines);
        }

        // Render lines to text
        $outLines = array();
        foreach ($lines as $line) {
            $text = trim($this->renderLine($line));
            if ($text === '') {
                continue;
            }
            $outLines[] = $text;
        }

        return implode("\n", $outLines);
    }

    /**
     * Select rows to use. Prefer word-level (level=5) rows when available.
     *
     * Blank tokens are removed when dropEmpty is on, and level-5 rows whose
     * confidence is below minConf are removed.
     *
     * @param TsvPage $page
     * @return TsvRow[]
     */
    protected function selectRows(TsvPage $page)
    {
        $words = $page->words();
        $rows = ($words && count($words) > 0) ? $words : $page->rows;

        // Nothing iterable to work with (e.g. rows never populated).
        if (!is_array($rows) && !($rows instanceof \Traversable)) {
            return array();
        }

        $out = array();
        foreach ($rows as $r) {
            if ($this->dropEmpty && $this->isBlank($r->text)) {
                continue;
            }

            // conf can be -1 for non-word levels; allow those through if we didn't get words.
            // Cast so a numeric-string or empty conf compares the same on every PHP version.
            if ((int)$r->level === 5 && (float)$r->conf < $this->minConf) {
                continue;
            }

            $out[] = $r;
        }
        return $out;
    }

    /**
     * Group into lines using block/par/line where possible.
     * If block/par/line numbers are all absent or zero, fall back to clustering by top.
     *
     * In the fallback, a row joins the current line while its top is within
     * lineTopTolerancePx of the top of the FIRST row of that line.
     *
     * @param TsvRow[] $rows
     * @return array<int, TsvRow[]>
     */
    protected function groupIntoLines(array $rows)
    {
        // Prefer structured grouping if it looks meaningful
        $hasStructured = false;
        foreach ($rows as $r) {
            if ($r->block_num > 0 || $r->par_num > 0 || $r->line_num > 0) {
                $hasStructured = true;
                break;
            }
        }

        if ($hasStructured) {
            $groups = array();
            foreach ($rows as $r) {
                $k = $r->block_num . ':' . $r->par_num . ':' . $r->line_num;
                if (!isset($groups[$k])) {
                    $groups[$k] = array();
                }
                $groups[$k][] = $r;
            }
            return array_values($groups);
        }

        // Fallback: cluster by 'top' proximity (simple). Sort by top first.
        usort($rows, array($this, 'cmpTopThenLeft'));

        $lines = array();
        $current = array();
        $currentTop = null;

        foreach ($rows as $r) {
            $top = (int)$r->top;

            if ($currentTop === null || abs($top - $currentTop) <= $this->lineTopTolerancePx) {
                if ($currentTop === null) {
                    $currentTop = $top; // anchor the line on its first row
                }
                $current[] = $r;
                continue;
            }

            $lines[] = $current;
            $current = array($r);
            $currentTop = $top;
        }

        if ($current) {
            $lines[] = $current;
        }

        return $lines;
    }

    /**
     * Sort words inside each line by left coordinate, then order the lines by
     * their top coordinate (ties broken by left).
     *
     * @param array<int, TsvRow[]> &$lines
     * @return void
     */
    protected function sortLines(array &$lines)
    {
        // sort words within each line
        foreach ($lines as &$line) {
            usort($line, array($this, 'cmpLeft'));
        }
        unset($line); // drop the dangling reference before re-using $lines

        // sort lines by top (then left)
        usort($lines, array($this, 'cmpLineTopThenLeft'));
    }

    /**
     * Render a line into text, inserting spaces based on bbox gaps.
     *
     * Tokens are separated by one space, or two when the horizontal gap to the
     * previous token exceeds gapExtraSpacePx.
     *
     * @param TsvRow[] $line
     * @return string
     */
    protected function renderLine(array $line)
    {
        // Ensure line is left-sorted
        usort($line, array($this, 'cmpLeft'));

        $out = '';
        $prevRight = null; // right edge of the previously emitted token; null until one is emitted

        foreach ($line as $r) {
            $t = $r->text;

            // Skip empties (including whitespace-only tokens)
            if ($this->dropEmpty && $this->isBlank($t)) {
                continue;
            }

            // Normalize TSV escaped stuff lightly (keep conservative)
            $t = str_replace(array("\t", "\r", "\n"), ' ', (string)$t);

            if ($prevRight === null) {
                // First emitted token: no separator.
                $out = $t;
            } else {
                // Always at least one space between tokens (baseline);
                // add extra spacing if the gap is meaningful.
                $gap = (int)$r->left - $prevRight;
                $out .= ($gap > $this->gapExtraSpacePx) ? '  ' : ' ';
                $out .= $t;
            }

            $prevRight = (int)$r->right();
        }

        return $out;
    }

    /**
     * Light multi-column reorder.
     *
     * Detection: lines are classified by their horizontal midpoint relative to
     * the median midpoint of all lines, with a margin of max(40, 12% of the
     * occupied width). Reordering only happens when there are at least 8 lines,
     * the occupied width is at least 200, and at least 3 lines fall clearly on
     * each side. Otherwise the input is returned unchanged.
     *
     * NOTE(review): because the pivot is the median midpoint, a page whose lines
     * all sit in two columns (nothing near the center) puts the median inside
     * one column, so one side count is 0 and the page is returned unchanged.
     *
     * Ordering: lines are walked top to bottom. Left- and right-column lines are
     * buffered; a line that is neither (e.g. a title spanning both columns) acts
     * as a section break: the buffered left lines, then the buffered right
     * lines, are emitted before it. This keeps spanning lines in their vertical
     * position instead of pushing them after the whole left column.
     *
     * @param array<int, TsvRow[]> $lines
     * @return array<int, TsvRow[]>
     */
    protected function reorderForMultiColumn(array $lines)
    {
        if (count($lines) < 8) {
            return $lines; // too few lines; don't risk it
        }

        // compute page-wide min/max x and collect each line's midpoint
        $minX = null;
        $maxX = null;
        $lineMids = array();

        foreach ($lines as $line) {
            $bounds = $this->lineBounds($line);
            if ($bounds === null) {
                continue;
            }

            if ($minX === null || $bounds['min'] < $minX) {
                $minX = $bounds['min'];
            }
            if ($maxX === null || $bounds['max'] > $maxX) {
                $maxX = $bounds['max'];
            }

            $lineMids[] = $bounds['mid'];
        }

        if ($minX === null || $maxX === null) {
            return $lines;
        }

        $width = $maxX - $minX;
        if ($width < 200) {
            return $lines; // narrow; unlikely multi-column
        }

        // median line midpoint
        sort($lineMids);
        $median = $lineMids[(int)floor(count($lineMids) / 2)];

        $margin = (int)max(40, $width * 0.12);
        $leftLimit = $median - $margin;
        $rightLimit = $median + $margin;

        // Decide whether there are two clusters (very rough):
        // count how many mids are far left vs far right of median
        $leftCount = 0;
        $rightCount = 0;
        foreach ($lineMids as $m) {
            if ($m < $leftLimit) {
                $leftCount++;
            }
            if ($m > $rightLimit) {
                $rightCount++;
            }
        }

        // Only treat as multi-column if both sides are meaningfully represented
        if ($leftCount < 3 || $rightCount < 3) {
            return $lines;
        }

        // Callers normally pass top-sorted lines, but re-sort to be safe since
        // the section walk below depends on vertical order.
        usort($lines, array($this, 'cmpLineTopThenLeft'));

        $ordered = array();
        $left = array();
        $right = array();

        foreach ($lines as $line) {
            $bounds = $this->lineBounds($line);

            if ($bounds !== null && $bounds['mid'] < $leftLimit) {
                $left[] = $line;
                continue;
            }
            if ($bounds !== null && $bounds['mid'] > $rightLimit) {
                $right[] = $line;
                continue;
            }

            // Spanning/ambiguous line: close the current two-column section first.
            $ordered = array_merge($ordered, $left, $right);
            $ordered[] = $line;
            $left = array();
            $right = array();
        }

        // Flush whatever follows the last spanning line.
        return array_merge($ordered, $left, $right);
    }

    /**
     * Compute the horizontal extent and top of a line.
     *
     * @param TsvRow[] $line
     * @return array|null ['min'=>int,'max'=>int,'mid'=>int,'top'=>int,'left'=>int], or null for an empty line.
     *                    'min'/'left' is the smallest left, 'max' the largest right(),
     *                    'mid' their floored average, 'top' the smallest top.
     */
    protected function lineBounds(array $line)
    {
        if (!$line) {
            return null;
        }

        $min = null;
        $max = null;
        $top = null;

        foreach ($line as $r) {
            $l = (int)$r->left;
            $rr = (int)$r->right();
            $t = (int)$r->top;

            if ($min === null || $l < $min) {
                $min = $l;
            }
            if ($max === null || $rr > $max) {
                $max = $rr;
            }
            if ($top === null || $t < $top) {
                $top = $t;
            }
        }

        if ($min === null || $max === null) {
            return null;
        }

        return array(
            'min' => $min,
            'max' => $max,
            'mid' => (int)floor(($min + $max) / 2),
            'top' => $top,
            'left' => $min,
        );
    }

    /**
     * Whether a token's text counts as empty: null, '' or whitespace only.
     *
     * @param mixed $text
     * @return bool
     */
    protected function isBlank($text)
    {
        return $text === null || trim((string)$text) === '';
    }

    // ----- Comparators -----
    // Coordinates are cast to int before comparing (as lineBounds() does) so
    // numeric strings and ints order consistently.

    /**
     * Order rows by left, then top.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int -1, 0 or 1
     */
    public function cmpLeft(TsvRow $a, TsvRow $b)
    {
        $aLeft = (int)$a->left;
        $bLeft = (int)$b->left;
        if ($aLeft !== $bLeft) {
            return ($aLeft < $bLeft) ? -1 : 1;
        }

        $aTop = (int)$a->top;
        $bTop = (int)$b->top;
        if ($aTop === $bTop) {
            return 0;
        }
        return ($aTop < $bTop) ? -1 : 1;
    }

    /**
     * Order rows by top, then left.
     *
     * @param TsvRow $a
     * @param TsvRow $b
     * @return int -1, 0 or 1
     */
    public function cmpTopThenLeft(TsvRow $a, TsvRow $b)
    {
        $aTop = (int)$a->top;
        $bTop = (int)$b->top;
        if ($aTop !== $bTop) {
            return ($aTop < $bTop) ? -1 : 1;
        }

        $aLeft = (int)$a->left;
        $bLeft = (int)$b->left;
        if ($aLeft === $bLeft) {
            return 0;
        }
        return ($aLeft < $bLeft) ? -1 : 1;
    }

    /**
     * Order lines by their top, then by their leftmost edge. Empty lines sort last.
     *
     * @param TsvRow[] $lineA
     * @param TsvRow[] $lineB
     * @return int -1, 0 or 1
     */
    public function cmpLineTopThenLeft(array $lineA, array $lineB)
    {
        $a = $this->lineBounds($lineA);
        $b = $this->lineBounds($lineB);

        if ($a === null && $b === null) {
            return 0;
        }
        if ($a === null) {
            return 1;
        }
        if ($b === null) {
            return -1;
        }

        if ($a['top'] !== $b['top']) {
            return ($a['top'] < $b['top']) ? -1 : 1;
        }
        if ($a['left'] === $b['left']) {
            return 0;
        }
        return ($a['left'] < $b['left']) ? -1 : 1;
    }
}
