<?php

namespace boru\ocr\Source\Word;

/**
 * Extracts plain text from a Word document (.doc / .docx) by shelling out to
 * an external command-line converter.
 *
 * The external tools are not bundled: `docx2txt` is used for .docx files and
 * `antiword` for .doc files. Both must be available on the PATH of the PHP
 * process.
 */
class WordPageTextProvider
{
    /** @var string Path to the Word document to extract text from. */
    protected $sourceFile;

    /**
     * @param string $sourceFile Path to the .doc/.docx file (cast to string).
     */
    public function __construct($sourceFile)
    {
        $this->sourceFile = (string)$sourceFile;
    }

    /**
     * Return per-page text strings for the Word document.
     *
     * The current implementation does not detect page boundaries: the whole
     * document is returned as a single "page" (a one-element array).
     *
     * @return array<int,string>
     * @throws \Exception If the source is not an existing regular file, the
     *                    extension is neither "doc" nor "docx", or the
     *                    external extractor cannot be started / exits with a
     *                    non-zero status.
     */
    public function getTextPages()
    {
        $file = $this->sourceFile;

        // is_file() rather than file_exists(): a directory must not be handed
        // to the external converter.
        if (!is_file($file)) {
            throw new \Exception('WordPageTextProvider: source file not found: ' . $file);
        }

        // Extension match is case-insensitive ("DOCX" is treated as "docx").
        $ext = strtolower(pathinfo($file, PATHINFO_EXTENSION));

        if ($ext === 'docx') {
            // The trailing "-" asks docx2txt to write the text to stdout.
            $text = $this->runExtractor(
                'docx2txt ' . escapeshellarg($file) . ' -',
                'WordPageTextProvider: failed to extract DOCX text (is docx2txt installed?)'
            );
        } elseif ($ext === 'doc') {
            $text = $this->runExtractor(
                'antiword ' . escapeshellarg($file),
                'WordPageTextProvider: failed to extract DOC text (is antiword installed?)'
            );
        } else {
            throw new \Exception('WordPageTextProvider: unsupported extension: ' . $ext);
        }

        // Simple approach: treat whole document as a single page
        return array($text);
    }

    /**
     * Run an extractor command and return everything it wrote to stdout.
     *
     * Success is decided by the exit status instead of by "did it print
     * anything": an empty document legitimately produces empty output, while
     * a missing or failing tool is reported by a non-zero exit status.
     *
     * @param string $cmd            Fully escaped shell command line.
     * @param string $failureMessage Exception message used on any failure.
     * @return string Captured stdout (may be an empty string).
     * @throws \Exception If the process cannot be started, its output cannot
     *                    be read, or it exits with a non-zero status.
     */
    protected function runExtractor($cmd, $failureMessage)
    {
        // Only stdout is piped; stdin and stderr stay inherited from the PHP
        // process, so there is no second pipe that could fill up and block.
        $descriptors = array(
            1 => array('pipe', 'w'),
        );

        $pipes = array();
        $process = proc_open($cmd, $descriptors, $pipes);
        if (!is_resource($process)) {
            throw new \Exception($failureMessage);
        }

        $output = stream_get_contents($pipes[1]);
        // The pipe must be closed before proc_close() to avoid a deadlock.
        fclose($pipes[1]);

        // NOTE(review): proc_close() is documented to return -1 on error; that
        // is treated as a failure here as well.
        $exitCode = proc_close($process);

        if ($output === false || $exitCode !== 0) {
            throw new \Exception($failureMessage);
        }

        return $output;
    }
}
