<?php

namespace boru\ocr\Tesseract\Tsv;

/**
 * Parses tab-separated (TSV) text, such as the TSV output of Tesseract,
 * into TsvRow objects keyed by the column names found in the header row.
 */
class TsvParser
{
    /**
     * Parse raw TSV into rows.
     *
     * The first line is treated as the header row holding the column names;
     * every following non-empty line becomes one TsvRow. Columns missing from
     * a short data line are filled with an empty string.
     *
     * Fields are split on literal tab characters only. CSV-style quote
     * handling is deliberately NOT applied: Tesseract's TSV is plain
     * tab-delimited, and recognised text may legitimately contain double
     * quotes that must be preserved verbatim.
     *
     * @param string $tsv Raw TSV text (any of \r\n, \n or \r line endings).
     * @return TsvRow[] Parsed rows in input order; empty array when the input
     *                  is empty or has no line after the header.
     */
    public function parseRows($tsv)
    {
        $tsv = (string)$tsv;
        if ($tsv === '') {
            return array();
        }

        // preg_split() returns false on a PCRE failure; treat that like "no data".
        $lines = preg_split("/\\r\\n|\\n|\\r/", $tsv);
        if (!$lines || count($lines) < 2) {
            return array();
        }

        // explode() instead of str_getcsv(): the latter would interpret '"' as
        // a CSV enclosure and strip or merge quote characters in the OCR text.
        $map = $this->buildHeaderMap(explode("\t", $lines[0]));

        $rows = array();
        $count = count($lines);
        for ($i = 1; $i < $count; $i++) {
            $line = $lines[$i];
            if ($line === '') {
                continue;
            }

            $cols = explode("\t", $line);
            // A line without any tab separator cannot be a data row; skip it.
            if (count($cols) < 2) {
                continue;
            }

            $assoc = array();
            foreach ($map as $name => $idx) {
                // Pad columns that are missing from a short line with ''.
                $assoc[$name] = isset($cols[$idx]) ? $cols[$idx] : '';
            }

            $rows[] = new TsvRow($assoc);
        }

        return $rows;
    }

    /**
     * Build a lookup from column name to its zero-based position in the header.
     *
     * Names are trimmed; blank names are skipped. If a name occurs more than
     * once, the last occurrence wins.
     *
     * @param array $header Header cells in column order.
     * @return array<string,int> Column name => column index.
     */
    protected function buildHeaderMap(array $header)
    {
        $map = array();
        $count = count($header);
        for ($i = 0; $i < $count; $i++) {
            $name = trim((string)$header[$i]);
            if ($name === '') {
                continue;
            }
            $map[$name] = $i;
        }
        return $map;
    }
}
