<?php

namespace boru\ocr\Confidence;

use boru\ocr\Tesseract\Tsv\TsvPage;
use boru\ocr\Tesseract\Tsv\TsvRow;

/**
 * Builds a ConfidenceReport from parsed Tesseract TSV pages.
 *
 * For every page it computes word-level confidence statistics (count, min,
 * max, average, percentiles, low / very-low ratios), pools the same numbers
 * over all pages, and flags "hotspots": TSV line groups whose words are
 * mostly poorly recognised.
 *
 * Recognised constructor options (all optional):
 *  - lowConfThreshold     (int)   confidence below this is "low"
 *  - veryLowConfThreshold (int)   confidence below this is "very low"
 *  - hotspotAvgThreshold  (float) line average at or below this is a hotspot
 *  - maxHotspots          (int)   maximum number of hotspots kept (>= 0)
 */
class ConfidenceCalculator
{
    /**
     * Word confidence strictly below this value counts as "low".
     * Defaults tuned for OCR reliability.
     * @var int
     */
    protected $lowConfThreshold = 60;

    /** @var int Word confidence strictly below this value counts as "very low". */
    protected $veryLowConfThreshold = 35;

    /** @var float Lines whose average confidence is at or below this value are hotspots. */
    protected $hotspotAvgThreshold = 55.0;

    /** @var int Maximum number of hotspots kept in the report. */
    protected $maxHotspots = 30;

    /**
     * @param array $options Overrides for the defaults above, keyed by property name.
     */
    public function __construct(array $options = array())
    {
        if (isset($options['lowConfThreshold'])) {
            $this->lowConfThreshold = (int)$options['lowConfThreshold'];
        }
        if (isset($options['veryLowConfThreshold'])) {
            $this->veryLowConfThreshold = (int)$options['veryLowConfThreshold'];
        }
        if (isset($options['hotspotAvgThreshold'])) {
            $this->hotspotAvgThreshold = (float)$options['hotspotAvgThreshold'];
        }
        if (isset($options['maxHotspots'])) {
            // A negative cap would make array_slice() drop entries from the
            // end of the list instead of limiting it, so clamp to zero.
            $this->maxHotspots = max(0, (int)$options['maxHotspots']);
        }
    }

    /**
     * Compute per-page stats, pooled overall stats and hotspots.
     *
     * @param TsvPage[] $tsvPages
     * @return ConfidenceReport
     */
    public function fromTsvPages(array $tsvPages)
    {
        $report = new ConfidenceReport();
        $report->mode = 'tsv';
        $report->thresholds = array(
            'lowConfThreshold' => $this->lowConfThreshold,
            'veryLowConfThreshold' => $this->veryLowConfThreshold,
            'hotspotAvgThreshold' => $this->hotspotAvgThreshold,
        );

        $overallConfs = array();
        $overallTokenCount = 0;
        $overallConfCount = 0;

        foreach ($tsvPages as $page) {
            $pc = $this->pageStats($page);

            $overallTokenCount += $pc->tokenCount;
            $overallConfCount += $pc->confCount;

            // pageStats() stashes the raw confidences under '_confs'; pool
            // them and drop the stash right away so it never reaches callers.
            if (isset($pc->percentiles['_confs'])) {
                if (is_array($pc->percentiles['_confs'])) {
                    foreach ($pc->percentiles['_confs'] as $c) {
                        $overallConfs[] = $c;
                    }
                }
                unset($pc->percentiles['_confs']);
            }

            $report->pages[] = $pc;

            foreach ($this->hotspotsForPage($page) as $h) {
                $report->hotspots[] = $h;
            }
        }

        $overall = array(
            'tokenCount' => $overallTokenCount,
            'confCount' => $overallConfCount,
            'confAvg' => null,
            'confMin' => null,
            'confMax' => null,
            'percentiles' => array(),
            'lowConfRatio' => 0.0,
            'veryLowConfRatio' => 0.0,
        );

        $pooled = count($overallConfs);
        if ($pooled > 0) {
            sort($overallConfs);

            // Tally directly from the pooled values instead of rebuilding the
            // counts from per-page ratios, which depended on float rounding.
            $low = 0;
            $veryLow = 0;
            foreach ($overallConfs as $c) {
                if ($c < $this->lowConfThreshold) $low++;
                if ($c < $this->veryLowConfThreshold) $veryLow++;
            }

            $overall['confMin'] = $overallConfs[0];
            $overall['confMax'] = $overallConfs[$pooled - 1];
            $overall['confAvg'] = $this->avg($overallConfs);
            $overall['percentiles'] = $this->percentiles($overallConfs);
            $overall['lowConfRatio'] = $low / $pooled;
            $overall['veryLowConfRatio'] = $veryLow / $pooled;
        } else {
            $report->notes[] = "No confidence-bearing TSV word tokens found.";
        }

        $report->overall = $overall;

        // Worst lines first: average confidence ascending, then low ratio descending.
        usort($report->hotspots, array($this, 'cmpHotspot'));

        // Re-clamp here too: a subclass may have set the property directly.
        $limit = max(0, (int)$this->maxHotspots);
        if (count($report->hotspots) > $limit) {
            $report->hotspots = array_slice($report->hotspots, 0, $limit);
        }

        return $report;
    }

    /**
     * Confidence statistics for the scored words of a single page.
     *
     * When the page has at least one scored word, the sorted raw confidences
     * are additionally stored under percentiles['_confs'] so that
     * fromTsvPages() can pool them; fromTsvPages() removes that key again.
     *
     * @param TsvPage $page
     * @return PageConfidence
     */
    protected function pageStats(TsvPage $page)
    {
        $pc = new PageConfidence();
        $pc->page = (int)$page->pageNumber;

        $confs = array();
        $low = 0;
        $veryLow = 0;

        foreach ($page->words() as $r) {
            if (!$this->isScoredWord($r)) continue;

            $c = (int)$r->conf;
            $confs[] = $c;

            if ($c < $this->lowConfThreshold) $low++;
            if ($c < $this->veryLowConfThreshold) $veryLow++;
        }

        $n = count($confs);
        // Every counted token carries a confidence, so both counts are equal.
        $pc->tokenCount = $n;
        $pc->confCount = $n;

        if ($n > 0) {
            sort($confs);
            $pc->confMin = $confs[0];
            $pc->confMax = $confs[$n - 1];
            $pc->confAvg = $this->avg($confs);

            $pc->percentiles = $this->percentiles($confs);
            // stash raw confs temporarily for overall accumulation
            $pc->percentiles['_confs'] = $confs;

            $pc->lowConfRatio = $low / $n;
            $pc->veryLowConfRatio = $veryLow / $n;
        }

        return $pc;
    }

    /**
     * Hotspots are computed per TSV line-group (block:par:line).
     *
     * A line becomes a hotspot when its average word confidence is at or
     * below hotspotAvgThreshold, or when at least 45% of its words are below
     * lowConfThreshold.
     *
     * @param TsvPage $page
     * @return Hotspot[]
     */
    protected function hotspotsForPage(TsvPage $page)
    {
        $out = array();

        foreach ($page->byLine() as $lineKey => $rows) {
            $words = array();
            $confs = array();
            $low = 0;

            foreach ($rows as $r) {
                if (!$this->isScoredWord($r)) continue;

                $c = (int)$r->conf;
                $words[] = $r;
                $confs[] = $c;
                if ($c < $this->lowConfThreshold) $low++;
            }

            $n = count($confs);
            if ($n === 0) continue;

            $avg = $this->avg($confs);
            $lowRatio = $low / $n;

            // Not a hotspot: the average is fine AND few words are low.
            if ($avg > $this->hotspotAvgThreshold && $lowRatio < 0.45) {
                continue;
            }

            $bounds = $this->boundsForRows($words);

            $h = new Hotspot();
            $h->page = (int)$page->pageNumber;
            $h->lineKey = (string)$lineKey;
            $h->left = $bounds['left'];
            $h->top = $bounds['top'];
            $h->width = $bounds['width'];
            $h->height = $bounds['height'];
            $h->confAvg = $avg;
            $h->tokenCount = $n;
            $h->lowConfRatio = $lowRatio;
            $h->sampleText = $this->sampleText($words, 160);

            $h->tags = array('low_conf');
            if ($avg < $this->veryLowConfThreshold) $h->tags[] = 'very_low_conf';
            if ($lowRatio >= 0.6) $h->tags[] = 'high_low_ratio';
            // Very small boxes (width < 120 or height < 25, in TSV coordinate units).
            if ($h->width < 120 || $h->height < 25) $h->tags[] = 'tiny_region_suspected';

            $out[] = $h;
        }

        return $out;
    }

    /**
     * Whether a TSV row takes part in the confidence statistics: it must be
     * a level-5 row with non-empty text and a non-negative confidence.
     *
     * @param object $row Row exposing level, text and conf.
     * @return bool
     */
    protected function isScoredWord($row)
    {
        if ((int)$row->level !== 5) return false;
        if ($row->text === '') return false;
        return !($row->conf < 0);
    }

    /**
     * Bounding box enclosing all given rows.
     *
     * @param array $rows Rows exposing left, top, width and height.
     * @return array Keys 'left', 'top', 'width', 'height'; all zero for no rows.
     */
    protected function boundsForRows(array $rows)
    {
        $minL = null; $minT = null; $maxR = null; $maxB = null;

        foreach ($rows as $r) {
            $l = (int)$r->left;
            $t = (int)$r->top;
            $rgt = $l + (int)$r->width;
            $bot = $t + (int)$r->height;

            if ($minL === null || $l < $minL) $minL = $l;
            if ($minT === null || $t < $minT) $minT = $t;
            if ($maxR === null || $rgt > $maxR) $maxR = $rgt;
            if ($maxB === null || $bot > $maxB) $maxB = $bot;
        }

        if ($minL === null) $minL = 0;
        if ($minT === null) $minT = 0;
        if ($maxR === null) $maxR = $minL;
        if ($maxB === null) $maxB = $minT;

        return array(
            'left' => $minL,
            'top' => $minT,
            'width' => max(0, $maxR - $minL),
            'height' => max(0, $maxB - $minT),
        );
    }

    /**
     * Join the rows' text left-to-right and shorten it to at most $maxChars.
     *
     * Length is measured in characters when the mbstring extension is loaded
     * (text is assumed to be UTF-8 — confirm against the TSV parser), so a
     * multibyte character is never cut in half; otherwise it falls back to
     * bytes. A shortened sample ends in '...' unless $maxChars is below 3,
     * where there is no room for the ellipsis.
     *
     * @param array $rows Rows exposing left and text.
     * @param int $maxChars Maximum length of the returned sample.
     * @return string
     */
    protected function sampleText(array $rows, $maxChars)
    {
        // order by left to get a decent in-line sample
        usort($rows, array($this, 'cmpLeft'));

        $parts = array();
        foreach ($rows as $r) {
            if ($r->text !== '') $parts[] = $r->text;
        }
        $s = trim(implode(' ', $parts));

        $maxChars = (int)$maxChars;
        if ($maxChars <= 0) return '';

        $useMb = function_exists('mb_strlen') && function_exists('mb_substr');
        $length = $useMb ? mb_strlen($s, 'UTF-8') : strlen($s);
        if ($length <= $maxChars) return $s;

        if ($maxChars < 3) {
            $keep = $maxChars;
            $suffix = '';
        } else {
            $keep = $maxChars - 3;
            $suffix = '...';
        }

        $head = $useMb ? mb_substr($s, 0, $keep, 'UTF-8') : substr($s, 0, $keep);

        return $head . $suffix;
    }

    /**
     * p10 / p25 / p50 / p75 / p90 of an ascending-sorted list.
     *
     * @param array $sortedInts Must already be sorted ascending.
     * @return array
     */
    protected function percentiles(array $sortedInts)
    {
        return array(
            'p10' => $this->pct($sortedInts, 0.10),
            'p25' => $this->pct($sortedInts, 0.25),
            'p50' => $this->pct($sortedInts, 0.50),
            'p75' => $this->pct($sortedInts, 0.75),
            'p90' => $this->pct($sortedInts, 0.90),
        );
    }

    /**
     * Nearest-rank percentile of an ascending-sorted list.
     *
     * @param array $sortedInts Must already be sorted ascending.
     * @param float $p Fraction, e.g. 0.5 for the median.
     * @return int|null Null for an empty list.
     */
    protected function pct(array $sortedInts, $p)
    {
        $n = count($sortedInts);
        if ($n === 0) return null;
        if ($n === 1) return $sortedInts[0];

        // nearest-rank with 1-based rank, clamped to [1, n]
        $rank = (int)ceil($p * $n);
        if ($rank < 1) $rank = 1;
        if ($rank > $n) $rank = $n;

        return $sortedInts[$rank - 1];
    }

    /**
     * Arithmetic mean.
     *
     * @param array $ints
     * @return float|null Null for an empty list.
     */
    protected function avg(array $ints)
    {
        $n = count($ints);
        if ($n === 0) return null;

        $sum = 0.0;
        foreach ($ints as $v) $sum += (float)$v;

        return $sum / $n;
    }

    /**
     * usort() comparator: ascending by integer left coordinate.
     *
     * @param object $a
     * @param object $b
     * @return int
     */
    public function cmpLeft($a, $b)
    {
        $al = (int)$a->left; $bl = (int)$b->left;
        if ($al === $bl) return 0;
        return $al < $bl ? -1 : 1;
    }

    /**
     * usort() comparator: lowest average confidence first (a null average
     * sorts last), ties broken by higher low-confidence ratio first.
     *
     * @param Hotspot $a
     * @param Hotspot $b
     * @return int
     */
    public function cmpHotspot(Hotspot $a, Hotspot $b)
    {
        $ac = $a->confAvg === null ? 9999 : $a->confAvg;
        $bc = $b->confAvg === null ? 9999 : $b->confAvg;

        if ($ac < $bc) return -1;
        if ($ac > $bc) return 1;

        // tie-break: more low-conf ratio first
        if ($a->lowConfRatio > $b->lowConfRatio) return -1;
        if ($a->lowConfRatio < $b->lowConfRatio) return 1;

        return 0;
    }
}
