<?php

namespace boru\ocr\Format;

class OutputNormalizer
{
    /**
     * Tidy OCR text line by line without destroying its layout.
     *
     * The input is split on CRLF, CR or LF and the result is always joined
     * back with "\n". Trailing whitespace is removed from every line
     * regardless of the options.
     *
     * Recognised option keys (each is cast to bool, each defaults to true):
     *  - collapseSpaces:     squeeze every run of two or more space
     *                        characters into one space (tabs are untouched).
     *  - preserveIndent:     set the leading whitespace of a line aside so the
     *                        space collapsing does not touch it. When false,
     *                        leading whitespace is NOT stripped; it is simply
     *                        treated like the rest of the line.
     *  - collapseBlankLines: shrink any run of three or more consecutive
     *                        newlines to two, i.e. leave at most one empty
     *                        line in a row.
     *
     * @param string $text
     * @param array $options
     * @return string
     */
    public static function normalize($text, array $options = array())
    {
        // Start from the defaults and overlay whichever keys the caller supplied.
        $settings = array(
            'collapseSpaces' => true,
            'preserveIndent' => true,
            'collapseBlankLines' => true,
        );
        foreach ($settings as $name => $default) {
            if (array_key_exists($name, $options)) {
                $settings[$name] = (bool)$options[$name];
            }
        }

        $cleaned = array();

        foreach (preg_split("/\r\n|\r|\n/", (string)$text) as $row) {
            // Peel the indentation off first so it survives the collapsing step.
            $indent = '';
            if ($settings['preserveIndent'] && preg_match('/^(\s+)/', $row, $found)) {
                $indent = $found[1];
                $row = substr($row, strlen($indent));
            }

            // Only literal spaces are squeezed; other whitespace is left alone.
            $body = $settings['collapseSpaces']
                ? preg_replace('/[ ]{2,}/', ' ', $row)
                : $row;

            // Re-attach the indent, then drop trailing whitespace. A line that
            // held nothing but whitespace ends up empty here.
            $cleaned[] = rtrim($indent . $body);
        }

        $result = implode("\n", $cleaned);

        if (!$settings['collapseBlankLines']) {
            return $result;
        }

        // Three or more newlines in a row means two or more empty lines;
        // keep a single empty line so paragraph breaks remain visible.
        return preg_replace("/\n{3,}/", "\n\n", $result);
    }
}
