<?php

namespace boru\ocr\Tesseract\Tsv;

/**
 * Converts TsvPage objects to and from TSV text and JSON.
 *
 * All methods are static; the class holds no state.
 */
class TsvSerializer
{
    /**
     * Canonical TSV header columns (tesseract default), in output order.
     *
     * @return string[]
     */
    public static function header()
    {
        return array(
            'level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num',
            'left', 'top', 'width', 'height', 'conf', 'text',
        );
    }

    /**
     * Serialize a page to a TSV string (including the header line).
     *
     * Every line, including the last one, is terminated with "\n".
     *
     * @param TsvPage $page
     * @param array $options
     *   - includeNonWords (bool) default true; when false only rows whose
     *     level is 5 are written.
     * @return string
     */
    public static function toTsvString(TsvPage $page, array $options = array())
    {
        $includeNonWords = array_key_exists('includeNonWords', $options)
            ? (bool)$options['includeNonWords']
            : true;

        $columns = self::header();
        $lines = array(implode("\t", $columns));

        foreach ($page->rows as $r) {
            if (!$includeNonWords && (int)$r->level !== 5) {
                continue;
            }

            // Build the cells from header() so the header line and the data
            // lines can never disagree about column order.
            $cells = array();
            foreach ($columns as $column) {
                $cells[] = ($column === 'text')
                    ? self::escapeTsvText($r->text)
                    : (string)$r->{$column};
            }

            $lines[] = implode("\t", $cells);
        }

        return implode("\n", $lines) . "\n";
    }

    /**
     * Parse TSV into a TsvPage using the existing TsvParser.
     *
     * @param string $tsv
     * @param int $pageNumber The logical page number (1-based) for the container
     * @return TsvPage
     */
    public static function fromTsvString($tsv, $pageNumber)
    {
        $parser = new TsvParser();

        return new TsvPage((int)$pageNumber, $parser->parseRows($tsv));
    }

    /**
     * JSON serialize pages for fixtures.
     *
     * Entries that are not TsvPage instances are skipped.
     *
     * @param TsvPage[] $pages
     * @return string
     * @throws \RuntimeException When the data cannot be encoded (for example
     *   when a text value is not valid UTF-8).
     */
    public static function toJson(array $pages)
    {
        $arr = array();
        foreach ($pages as $p) {
            if ($p instanceof TsvPage) {
                $arr[] = self::pageToArray($p);
            }
        }

        $json = json_encode($arr);
        if ($json === false) {
            // json_encode() signals failure by returning false; surface it
            // instead of handing a non-string back to the caller.
            throw new \RuntimeException(
                'Unable to encode TSV pages as JSON: ' . json_last_error_msg()
            );
        }

        return $json;
    }

    /**
     * JSON deserialize pages for fixtures.
     *
     * Returns an empty array when the input is empty, is not valid JSON, or
     * does not decode to an array. Top-level entries that are not arrays are
     * skipped, mirroring how toJson() skips entries that are not pages.
     *
     * @param string $json
     * @return TsvPage[]
     */
    public static function fromJson($json)
    {
        $json = (string)$json;
        if ($json === '') {
            return array();
        }

        $data = json_decode($json, true);
        if (!is_array($data)) {
            return array();
        }

        $pages = array();
        foreach ($data as $p) {
            if (!is_array($p)) {
                // A scalar or null cannot describe a page.
                continue;
            }
            $pages[] = self::pageFromArray($p);
        }

        return $pages;
    }

    /**
     * Convert a page into a plain array with the keys 'pageNumber' and 'rows'.
     *
     * Each row becomes an associative array keyed by the TSV column names.
     *
     * @param TsvPage $page
     * @return array
     */
    public static function pageToArray(TsvPage $page)
    {
        $rows = array();
        foreach ($page->rows as $r) {
            $rows[] = array(
                'level' => $r->level,
                'page_num' => $r->page_num,
                'block_num' => $r->block_num,
                'par_num' => $r->par_num,
                'line_num' => $r->line_num,
                'word_num' => $r->word_num,
                'left' => $r->left,
                'top' => $r->top,
                'width' => $r->width,
                'height' => $r->height,
                'conf' => $r->conf,
                'text' => $r->text,
            );
        }

        return array(
            'pageNumber' => $page->pageNumber,
            'rows' => $rows,
        );
    }

    /**
     * Build a page from an array shaped like the output of pageToArray().
     *
     * A missing 'pageNumber' becomes 0, and a missing or non-array 'rows'
     * becomes an empty row list. Each row entry is handed unchanged to the
     * TsvRow constructor.
     *
     * @param array $arr
     * @return TsvPage
     */
    public static function pageFromArray($arr)
    {
        $pageNumber = isset($arr['pageNumber']) ? (int)$arr['pageNumber'] : 0;
        $rowsArr = isset($arr['rows']) && is_array($arr['rows']) ? $arr['rows'] : array();

        $rows = array();
        foreach ($rowsArr as $r) {
            $rows[] = new TsvRow($r);
        }

        return new TsvPage($pageNumber, $rows);
    }

    /**
     * Make a text value safe to store in a single TSV field.
     *
     * Each tab, carriage return and line feed is replaced by one space.
     *
     * @param mixed $text Cast to string before escaping.
     * @return string
     */
    protected static function escapeTsvText($text)
    {
        return str_replace(array("\t", "\r", "\n"), ' ', (string)$text);
    }
}
