<?php

namespace boru\ocr\Layout\Diagram;

use boru\ocr\Layout\LayoutOptions;
use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * Groups OCR word rows into rectangular regions by spatial proximity.
 *
 * Two boxes belong to the same region when the gap between them is at most
 * $clusterPadPx pixels on both the horizontal and the vertical axis.
 */
class DiagramRegionClusterer
{
    /**
     * Maximum gap (in pixels, per axis) at which two boxes are still treated
     * as neighbours. The constructor always overwrites this default.
     *
     * @var int
     */
    protected $clusterPadPx = 70;

    /**
     * Minimum number of words a region must contain to be reported.
     * The constructor always overwrites this default.
     *
     * @var int
     */
    protected $minRegionWords = 8;

    /**
     * @param LayoutOptions $options Supplies clusterPadPx and minRegionWords;
     *                               both values are cast to int.
     */
    public function __construct(LayoutOptions $options)
    {
        $this->clusterPadPx = (int)$options->clusterPadPx;
        $this->minRegionWords = (int)$options->minRegionWords;
    }

    /**
     * Cluster word rows into diagram regions.
     *
     * Only rows with level 5 (the word level in Tesseract TSV output), a
     * non-blank text and a non-negative confidence take part.
     *
     * Region format:
     *  minLeft, maxRight, minTop, maxBottom, words (TsvRow[])
     *
     * Regions holding fewer than $minRegionWords words are dropped, unless that
     * would drop every region, in which case all regions are returned.
     *
     * @param TsvRow[] $rows
     * @return array<int, array> 0-indexed list of regions
     */
    public function clusterWords(array $rows)
    {
        $words = array();
        foreach ($rows as $r) {
            if ((int)$r->level !== 5) continue;
            // Blank means empty, null or whitespace-only: none of these is a word.
            if (trim((string)$r->text) === '') continue;
            if ($r->conf < 0) continue;
            $words[] = $r;
        }
        if (count($words) === 0) return array();

        usort($words, array(__CLASS__, 'cmpWordTopLeft'));

        $regions = array();
        // Keys of regions whose word list was built by combining regions and
        // is therefore no longer in top/left order.
        $needsSort = array();

        foreach ($words as $w) {
            $wBox = $this->boxForWord($w);

            // Key of the region that received this word, once one is found.
            $target = null;

            // A word can bridge several regions, and the grown region can in
            // turn reach others, so keep absorbing until nothing changes.
            do {
                $changed = false;

                foreach (array_keys($regions) as $i) {
                    if ($i === $target) continue;

                    if ($target === null) {
                        if ($this->boxesNear($regions[$i], $wBox, $this->clusterPadPx)) {
                            $regions[$i] = $this->mergeBoxes($regions[$i], $wBox);
                            $target = $i;
                            $changed = true;
                        }
                        continue;
                    }

                    if ($this->boxesNear($regions[$target], $regions[$i], $this->clusterPadPx)) {
                        $regions[$target] = $this->mergeBoxes($regions[$target], $regions[$i]);
                        unset($regions[$i], $needsSort[$i]);
                        $needsSort[$target] = true;
                        $changed = true;
                    }
                }
            } while ($changed);

            if ($target === null) {
                $regions[] = $wBox;
            }
        }

        // Restore top/left word order inside regions assembled from several regions.
        foreach (array_keys($needsSort) as $i) {
            usort($regions[$i]['words'], array(__CLASS__, 'cmpWordTopLeft'));
        }

        // Absorbed regions leave gaps in the keys; callers get a plain list.
        $regions = array_values($regions);

        $out = array();
        foreach ($regions as $region) {
            if (count($region['words']) >= $this->minRegionWords) {
                $out[] = $region;
            }
        }

        // Nothing is large enough: fall back to every region rather than none.
        if (count($out) === 0) $out = $regions;

        return $out;
    }

    /**
     * Build a single-word region from a word row.
     *
     * @param TsvRow $w
     * @return array Region holding only $w
     */
    protected function boxForWord($w)
    {
        return array(
            'minLeft' => (int)$w->left,
            'maxRight' => (int)$w->right(),
            'minTop' => (int)$w->top,
            'maxBottom' => (int)$w->top + (int)$w->height,
            'words' => array($w),
        );
    }

    /**
     * Tell whether box $b touches box $a after $a is grown by $pad on every side.
     *
     * Edges that exactly meet count as touching.
     *
     * @param array $a   Box with minLeft, maxRight, minTop, maxBottom
     * @param array $b   Box with minLeft, maxRight, minTop, maxBottom
     * @param int   $pad Padding in pixels applied to $a
     * @return bool
     */
    protected function boxesNear(array $a, array $b, $pad)
    {
        $pad = (int)$pad;

        $aL = (int)$a['minLeft'] - $pad;
        $aR = (int)$a['maxRight'] + $pad;
        $aT = (int)$a['minTop'] - $pad;
        $aB = (int)$a['maxBottom'] + $pad;

        $bL = (int)$b['minLeft'];
        $bR = (int)$b['maxRight'];
        $bT = (int)$b['minTop'];
        $bB = (int)$b['maxBottom'];

        $xOverlap = ($aL <= $bR) && ($aR >= $bL);
        $yOverlap = ($aT <= $bB) && ($aB >= $bT);

        return ($xOverlap && $yOverlap);
    }

    /**
     * Return $a enlarged to also cover $b, with $b's words appended to $a's.
     *
     * @param array $a
     * @param array $b
     * @return array The combined region
     */
    protected function mergeBoxes(array $a, array $b)
    {
        $a['minLeft'] = min((int)$a['minLeft'], (int)$b['minLeft']);
        $a['maxRight'] = max((int)$a['maxRight'], (int)$b['maxRight']);
        $a['minTop'] = min((int)$a['minTop'], (int)$b['minTop']);
        $a['maxBottom'] = max((int)$a['maxBottom'], (int)$b['maxBottom']);

        if (!isset($a['words'])) $a['words'] = array();
        if (isset($b['words'])) {
            foreach ($b['words'] as $w) $a['words'][] = $w;
        }
        return $a;
    }

    /**
     * usort() comparator: order by top ascending, then by left ascending.
     *
     * @param object $a Row exposing top and left
     * @param object $b Row exposing top and left
     * @return int -1, 0 or 1
     */
    public static function cmpWordTopLeft($a, $b)
    {
        $at = (int)$a->top;
        $bt = (int)$b->top;
        if ($at !== $bt) {
            return ($at < $bt) ? -1 : 1;
        }

        $al = (int)$a->left;
        $bl = (int)$b->left;
        if ($al === $bl) return 0;
        return ($al < $bl) ? -1 : 1;
    }
}
