<?php

namespace boru\ocr\Layout\Support;

/**
 * Detect and suppress repeating header/footer lines across pages.
 *
 * Designed to run at the multi-page layer (after you have per-page baseline lines).
 *
 * Typical usage:
 *   $filter = new RepeatSuppressionFilter();
 *   $filter->analyze($pagesLines); // array of pages => array of string lines
 *   $filteredLines = $filter->filter($pagesLines);
 *
 * Notes:
 * - Operates on line text only (no bounding boxes needed).
 * - Uses top/bottom windows (first N lines / last N lines).
 * - Marks lines that appear on >= minPageFraction pages (and >= minCount pages).
 */
class RepeatSuppressionFilter
{
    /** @var int Number of leading lines of each page inspected as the header window. */
    protected $topWindow = 12;

    /** @var int Number of trailing lines of each page inspected as the footer window. */
    protected $bottomWindow = 12;

    /** @var float Fraction of pages (clamped to 0.1..1.0) a line must appear on to count as repeating. */
    protected $minPageFraction = 0.60;

    /** @var int Absolute minimum number of pages (>= 1) a line must appear on. */
    protected $minCount = 3;

    /** @var bool When true, runs of digits are replaced by '#' before lines are compared. */
    protected $digitsWildcard = true;

    /** @var array normalizedLine => number of pages whose windows contain it */
    protected $counts = array();

    /** @var array normalizedLine => true, for lines that reached the repeat threshold */
    protected $repeatSet = array();

    /** @var int Number of pages passed to the last analyze() call */
    protected $pageCount = 0;

    /**
     * Recognised option keys: topWindow, bottomWindow, minPageFraction,
     * minCount, digitsWildcard. Out-of-range values are clamped rather
     * than rejected.
     *
     * @param array $options
     */
    public function __construct(array $options = array())
    {
        if (isset($options['topWindow'])) $this->topWindow = (int)$options['topWindow'];
        if (isset($options['bottomWindow'])) $this->bottomWindow = (int)$options['bottomWindow'];
        if (isset($options['minPageFraction'])) $this->minPageFraction = (float)$options['minPageFraction'];
        if (isset($options['minCount'])) $this->minCount = (int)$options['minCount'];
        if (array_key_exists('digitsWildcard', $options)) $this->digitsWildcard = (bool)$options['digitsWildcard'];

        if ($this->topWindow < 0) $this->topWindow = 0;
        if ($this->bottomWindow < 0) $this->bottomWindow = 0;
        if ($this->minCount < 1) $this->minCount = 1;
        if ($this->minPageFraction < 0.1) $this->minPageFraction = 0.1;
        if ($this->minPageFraction > 1.0) $this->minPageFraction = 1.0;
    }

    /**
     * Analyze pages to find repeated header/footer lines.
     *
     * Resets any previous analysis. Each normalized line is counted at most
     * once per page, and only lines inside the top/bottom windows are
     * considered. With zero or one page nothing is marked as repeating.
     * Pages that are not arrays (or are empty) contribute no lines but still
     * count towards the total page count.
     *
     * @param array<int,array<int,string>> $pagesLines
     * @return void
     */
    public function analyze(array $pagesLines)
    {
        $this->counts = array();
        $this->repeatSet = array();
        $this->pageCount = count($pagesLines);

        if ($this->pageCount <= 1) return;

        foreach ($pagesLines as $lines) {
            if (!is_array($lines) || empty($lines)) continue;

            // Normalized lines already counted for this page
            $seen = array();

            foreach ($this->windowLines($lines) as $line) {
                $n = $this->normalizeLine($line);
                if ($n === '') continue;

                // Count at most once per page per normalized line
                if (isset($seen[$n])) continue;
                $seen[$n] = true;

                if (!isset($this->counts[$n])) $this->counts[$n] = 0;
                $this->counts[$n]++;
            }
        }

        $threshold = $this->repeatThreshold();

        foreach ($this->counts as $n => $c) {
            if ($c >= $threshold) {
                $this->repeatSet[$n] = true;
            }
        }
    }

    /**
     * Filter all pages.
     *
     * Page keys are preserved. Entries that are not arrays are passed
     * through unchanged, mirroring how analyze() skips them, instead of
     * triggering a type error in filterPage().
     *
     * @param array<int,array<int,string>> $pagesLines
     * @return array<int,array<int,string>>
     */
    public function filter(array $pagesLines)
    {
        $out = array();
        foreach ($pagesLines as $pageIdx => $lines) {
            $out[$pageIdx] = is_array($lines) ? $this->filterPage($lines) : $lines;
        }
        return $out;
    }

    /**
     * Filter one page.
     *
     * Lines inside the top/bottom windows whose normalized form is in the
     * repeat set are dropped; lines outside the windows are always kept.
     * When no repeats are known (or the page is empty) the input is returned
     * untouched. Otherwise the result is a re-indexed list, and runs of more
     * than two blank lines are collapsed to two.
     *
     * @param array<int,string> $lines
     * @return array<int,string>
     */
    public function filterPage(array $lines)
    {
        if (empty($this->repeatSet) || empty($lines)) return $lines;

        // Work positionally: analyze() slices by position, so a page with
        // sparse or string keys must be addressed the same way here.
        $lines = array_values($lines);

        $n = count($lines);
        $bottomStart = $n - $this->bottomWindow;
        $keep = array();

        foreach ($lines as $i => $line) {
            $inTop = ($i < $this->topWindow);
            $inBottom = ($this->bottomWindow > 0 && $i >= $bottomStart);

            if ($inTop || $inBottom) {
                $norm = $this->normalizeLine($line);
                if ($norm !== '' && isset($this->repeatSet[$norm])) {
                    // suppress
                    continue;
                }
            }

            $keep[] = $line;
        }

        // Collapse excessive blank lines caused by removals
        return $this->collapseBlankLines($keep);
    }

    /**
     * @return array<string,int> normalized line => number of pages it was seen on
     */
    public function getCounts()
    {
        return $this->counts;
    }

    /**
     * @return array<string,bool> normalized line => true for every repeating line
     */
    public function getRepeatSet()
    {
        return $this->repeatSet;
    }

    /**
     * Collect the lines of one page that fall inside the top and bottom windows.
     *
     * On short pages the two windows overlap, so a line may be returned twice;
     * callers are expected to de-duplicate.
     *
     * @param array $lines
     * @return array
     */
    protected function windowLines(array $lines)
    {
        $lines = array_values($lines);

        $top = array_slice($lines, 0, $this->topWindow);
        $bottom = $this->bottomWindow > 0
            ? array_slice($lines, max(0, count($lines) - $this->bottomWindow))
            : array();

        return array_merge($top, $bottom);
    }

    /**
     * Minimum number of pages a line must appear on to be treated as repeating.
     *
     * This is the larger of minCount and ceil(pageCount * minPageFraction).
     * A tiny epsilon is subtracted before rounding up because a binary
     * floating-point product can land a hair above the exact integer, which
     * would otherwise push the threshold one page too high.
     *
     * @return int
     */
    protected function repeatThreshold()
    {
        $byFraction = (int)ceil($this->pageCount * $this->minPageFraction - 1e-9);

        return max($this->minCount, $byFraction);
    }

    /**
     * Reduce a line to the canonical form used for repeat comparison.
     *
     * Trims, collapses whitespace runs to a single space, upper-cases,
     * optionally replaces each run of digits with '#', and removes the space
     * around colons. Returns '' for blank input.
     *
     * @param mixed $line
     * @return string
     */
    protected function normalizeLine($line)
    {
        $line = trim((string)$line);
        if ($line === '') return '';

        // collapse whitespace
        $line = preg_replace('/\s+/', ' ', $line);

        // uppercase for stability
        $line = strtoupper($line);

        // wildcard digits to avoid page numbers / dates breaking repeats
        if ($this->digitsWildcard) {
            // replace runs of digits with #
            $line = preg_replace('/[0-9]+/', '#', $line);
        }

        // normalize common punctuation spacing
        $line = str_replace(array(' :', ': '), ':', $line);

        return trim($line);
    }

    /**
     * Limit every run of consecutive blank lines to at most two.
     *
     * A line is blank when it is empty after trim(). Non-blank lines are
     * passed through untouched.
     *
     * @param array $lines
     * @return array
     */
    protected function collapseBlankLines(array $lines)
    {
        $out = array();
        $blankRun = 0;

        foreach ($lines as $ln) {
            if (trim($ln) === '') {
                $blankRun++;
                // third and later blanks in a run are dropped
                if ($blankRun > 2) continue;
                $out[] = $ln;
            } else {
                $blankRun = 0;
                $out[] = $ln;
            }
        }

        return $out;
    }
}
