<?php

namespace boru\ocr\Tesseract;

use boru\ocr\OCRLogger;
use boru\ocr\Traits\OcrLogTrait;

/**
 * Thin wrapper around the tesseract command-line binary.
 *
 * Builds a shell command from an image path and a TesseractOptions object,
 * executes it through shell_exec() and returns whatever tesseract wrote to
 * stdout.
 */
class TesseractCliRunner
{
    // logDebug() is not defined in this class; it is expected to be provided by OcrLogTrait.
    use OcrLogTrait;

    /** @var string Path or command name of the tesseract executable */
    protected $binary;

    /** @var OCRLogger|null Logger adopted from the options passed to run(), if any */
    protected $logger = null;

    /**
     * @param string $binary Path or command name (e.g. 'tesseract')
     */
    public function __construct($binary = 'tesseract')
    {
        $this->binary = $binary;
    }

    /**
     * Runs tesseract and returns raw output (stdout).
     *
     * An empty string is returned when the command produced no output or
     * could not be executed; the two cases are not distinguished here.
     *
     * @param string $imagePath Path to an existing image file
     * @param TesseractOptions $opts
     * @param string $format 'txt' or 'tsv' (case-insensitive)
     * @return string
     * @throws \Exception When $imagePath is not an existing regular file or $format is invalid
     */
    public function run($imagePath, TesseractOptions $opts, $format)
    {
        // is_file() rather than file_exists(): a directory cannot be OCR'ed.
        if (!is_string($imagePath) || $imagePath === '' || !is_file($imagePath)) {
            // Casting an array/object to string would itself fail or emit a notice,
            // so report the type instead for non-scalar input.
            $shown = is_scalar($imagePath) ? (string)$imagePath : gettype($imagePath);
            throw new \Exception("TesseractCliRunner: imagePath not found: " . $shown);
        }

        // Adopt the logger carried by the options (instanceof is already false for null).
        if ($opts->logger instanceof OCRLogger) {
            $this->logger = $opts->logger;
        }

        $format = strtolower((string)$format);
        if ($format !== 'txt' && $format !== 'tsv') {
            throw new \Exception("TesseractCliRunner: invalid format: " . (string)$format);
        }

        // IMPORTANT: output base "stdout" means: output to stdout (not file).
        // Many tesseract builds accept outputbase "stdout".
        $cmdParts = array(
            escapeshellcmd($this->binary),
            escapeshellarg($imagePath),
            'stdout',
        );

        foreach ($this->buildArgs($opts) as $arg) {
            $cmdParts[] = $arg;
        }

        // Plain text is tesseract's default output; only tsv needs the extra config name.
        if ($format === 'tsv') {
            $cmdParts[] = 'tsv';
        }

        $cmd = implode(' ', $cmdParts);

        // Suppress stderr in quiet mode. The null device differs per platform:
        // "/dev/null" does not exist for cmd.exe, where the redirect would fail.
        if ($opts->quiet) {
            $isWindows = strtoupper(substr(PHP_OS, 0, 3)) === 'WIN';
            $cmd .= $isWindows ? ' 2>NUL' : ' 2>/dev/null';
        }

        $this->logDebug("TesseractCliRunner: running command: " . $cmd);
        $out = shell_exec($cmd);

        if (!is_string($out) || $out === '') {
            // Still return empty string (some images truly return nothing), but caller can decide.
            return '';
        }

        return $out;
    }

    /**
     * Translates the options object into shell-safe command-line tokens.
     *
     * Every token that originates from caller-supplied data is passed through
     * escapeshellarg() (or reduced to an integer), so the returned tokens can
     * be joined with spaces and handed to the shell as-is.
     *
     * @param TesseractOptions $opts
     * @return array Array of CLI args (already escaped)
     */
    protected function buildArgs(TesseractOptions $opts)
    {
        $args = array();

        if ($opts->lang !== null && $opts->lang !== '') {
            $args[] = '-l';
            $args[] = escapeshellarg($opts->lang);
        }

        if ($opts->psm !== null) {
            $args[] = '--psm';
            $args[] = (string)intval($opts->psm);
        }

        if ($opts->oem !== null) {
            $args[] = '--oem';
            $args[] = (string)intval($opts->oem);
        }

        // Guard against a null/non-iterable config instead of letting foreach warn.
        $config = $opts->config;
        if (!is_array($config) && !($config instanceof \Traversable)) {
            return $args;
        }

        // config can be:
        //  - ['preserve_interword_spaces' => '1']  (emitted as: -c 'key=value')
        //  - ['-c', 'foo=bar']                     (raw style, one CLI token per element)
        foreach ($config as $k => $v) {
            if (is_int($k)) {
                // Raw tokens are escaped individually so a value containing spaces or
                // shell metacharacters stays a single, inert argument.
                $token = (string)$v;
                if ($token !== '') {
                    // An empty token used to contribute nothing but a stray space;
                    // skip it rather than pass an empty '' argument to tesseract.
                    $args[] = escapeshellarg($token);
                }
                continue;
            }

            // Booleans would otherwise stringify to '1' / '' (empty); write them as 1 / 0.
            if (is_bool($v)) {
                $v = $v ? '1' : '0';
            }

            $args[] = '-c';
            $args[] = escapeshellarg($k . '=' . $v);
        }

        return $args;
    }
}
