<?php

namespace boru\ocr\Source\Spreadsheet;

/**
 * Exposes the text of a spreadsheet file as a list of logical "pages".
 *
 * - CSV files produce exactly one page: the trimmed file contents.
 * - XLS / XLSX files are handed to the converter, and the CSV text it returns
 *   is split into one page per "### Sheet: <name>" header line.
 */
class SpreadsheetPageTextProvider
{
    /** Prefix of the line that starts a new sheet in the converter's CSV output. */
    const SHEET_HEADER_PREFIX = '### Sheet: ';

    /** @var string */
    protected $filePath;

    /** @var ExcelCsvConverter */
    protected $converter;

    /**
     * @param string                 $filePath  Path to the spreadsheet file; must exist.
     * @param ExcelCsvConverter|null $converter Converter used for XLS/XLSX files;
     *                                          a default instance is created when omitted.
     *
     * @throws \Exception When $filePath does not exist.
     */
    public function __construct($filePath, ExcelCsvConverter $converter = null)
    {
        if (!file_exists($filePath)) {
            throw new \Exception("Spreadsheet file not found: " . $filePath);
        }

        $this->filePath = $filePath;
        $this->converter = $converter ? $converter : new ExcelCsvConverter();
    }

    /**
     * Convert spreadsheet into logical pages (one per sheet).
     *
     * The format is chosen from the (case-insensitive) file extension.
     *
     * @return array<int,string> Zero-based list of page texts.
     *
     * @throws \Exception When the extension is not csv/xls/xlsx, or a CSV file
     *                    cannot be read.
     */
    public function getTextPages()
    {
        $ext = strtolower(pathinfo($this->filePath, PATHINFO_EXTENSION));

        // CSV = single page
        if ($ext === 'csv') {
            $contents = file_get_contents($this->filePath);

            // file_get_contents() signals failure with false; without this check
            // trim(false) would quietly turn a read error into an empty page.
            if ($contents === false) {
                throw new \Exception("Unable to read spreadsheet file: " . $this->filePath);
            }

            return array(
                0 => trim($contents)
            );
        }

        // XLS / XLSX = multi-sheet
        if (!in_array($ext, array('xls', 'xlsx'), true)) {
            throw new \Exception("Unsupported spreadsheet format: " . $ext);
        }

        $csvString = $this->converter->convertSpreadsheetToCsvString($this->filePath);

        return $this->splitCsvIntoSheetPages($csvString);
    }

    /**
     * Split CSV string into per-sheet pages.
     *
     * Sheet headers look like:
     * ### Sheet: SheetName
     *
     * Each page starts with a "[Sheet: SheetName]" marker line followed by the
     * sheet's lines, joined with "\n" and trimmed. Lines that appear before the
     * first header are kept at the top of the first sheet's page. When the input
     * contains no header at all, its trimmed text is returned as a single page
     * (or no pages if it is empty) instead of being discarded.
     *
     * @param string $csv
     * @return array<int,string>
     */
    protected function splitCsvIntoSheetPages($csv)
    {
        $csv = (string) $csv;

        $lines = preg_split('/\r\n|\r|\n/', $csv);
        if ($lines === false) {
            // Regex engine failure: fall back to treating the input as one line.
            $lines = array($csv);
        }

        $prefixLength = strlen(self::SHEET_HEADER_PREFIX);
        $pages = array();
        $buffer = array();
        $seenHeader = false;

        foreach ($lines as $line) {
            if (strncmp($line, self::SHEET_HEADER_PREFIX, $prefixLength) === 0) {
                // A new sheet begins: flush the previous sheet, if there was one.
                if ($seenHeader) {
                    $pages[] = trim(implode("\n", $buffer));
                    $buffer = array();
                }

                $seenHeader = true;
                $buffer[] = '[Sheet: ' . trim(substr($line, $prefixLength)) . ']';
                continue;
            }

            $buffer[] = $line;
        }

        // Final sheet, or header-less content that would otherwise be lost.
        $remaining = trim(implode("\n", $buffer));
        if ($seenHeader || $remaining !== '') {
            $pages[] = $remaining;
        }

        return $pages;
    }
}
