<?php

namespace boru\ocr\Agent;

use boru\ocr\Agent\OCRAgent;
use boru\ocr\OcrEngineResult;

/**
 * TableInterpreterAgent:
 * - consumes TSV baseline + table candidate hints (pages/regions)
 * - uses TSV evidence tools sparingly to interpret and structure tables
 * - outputs JSON only (tables + confidence)
 *
 * PHP 5.6+
 */
class TableInterpreterAgent
{
    /**
     * Underlying agent that receives the instructions, tools and messages.
     *
     * @var OCRAgent
     */
    protected $agent;

    /**
     * Optional logging callback; invoked with a single message argument.
     *
     * @var callable|null
     */
    protected $logger = null;

    /**
     * System prompt handed to the underlying agent in the constructor.
     *
     * @var string
     */
    public static $instructions =
"You are the TABLE INTERPRETATION agent.\n\nGoal:\n- Identify and extract table-like structures (including parts lists, BOMs, schedules, menus, legal payment tables).\n- Use TABLE_CANDIDATES to focus. These are hints of where tables likely exist (page + optional bbox + kind).\n- Use TSV tools to read table cells precisely when needed.\n- Output STRICT JSON ONLY. No prose.\n\nTABLE_CANDIDATES format (important):\nEach candidate may include:\n- sourceKind: \"region\" | \"page\"\n- kind: \"grid\" | \"key_value\"    (table type hint)\n- bbox: {x,y,w,h} (optional; preferred)\n- region: label string (optional; e.g. \"TITLE BLOCK\")\n\nWhat to output (JSON schema):\n{\n  \"tables\": [\n    {\n      \"page\": 1,\n      \"source\": {\n        \"sourceKind\": \"region\"|\"page\",\n        \"candidateKind\": \"grid\"|\"key_value\"|null,\n        \"bbox\": {\"x\":0,\"y\":0,\"w\":0,\"h\":0} | null,\n        \"region\": \"TITLE BLOCK\"|null\n      },\n      \"columns\": [\"...\"],\n      \"rows\": [ { \"cells\": [\"...\",\"...\"] } ],\n      \"confidence\": 0.0,\n      \"notes\": [\"...\"]\n    }\n  ],\n  \"notes\": [\"...\"]\n}\n\nRules:\n- Only emit a table if you are confident it's truly a table (confidence >= 0.55).\n- Prefer extracting a smaller, correct table over a huge, noisy one.\n- If no tables are found, output {\"tables\":[],\"notes\":[...]}.\n- If candidateKind is \"key_value\":\n  - output columns MUST be [\"key\",\"value\"]\n  - each row MUST have exactly 2 cells: [key,value]\n  - key is the left label; value is the right value (often numeric, sometimes text)\n- If candidateKind is \"grid\":\n  - infer a header row if present; otherwise output generic column names like [\"col1\",\"col2\",...]\n\nTool use:\n- Use tsv_get_box_text when a candidate bbox is present (tight box).\n- Use tsv_find_text + tsv_get_near_match_text for anchors like 'Qty', 'Amount', 'Total', 'Part', 'Date', 'Description', 'Price'.\n- Do not hallucinate rows/columns; if TSV evidence is unclear, 
skip or output a smaller subset.\n\nNow produce output JSON.\n";

    /**
     * Build the wrapped OCRAgent and configure it with the table instructions.
     *
     * @param string            $pdfPath      Path handed to OCRAgent; its basename is
     *                                        also used when generating a reference.
     * @param array             $tools        Tools registered on the agent when non-empty.
     * @param AgentOptions|null $agentOptions Options for OCRAgent; a fresh AgentOptions
     *                                        is created when none is supplied.
     * @param callable|null     $logger       Logging callback; when omitted, a callable
     *                                        `logger` set on $agentOptions is used instead.
     */
    public function __construct($pdfPath, array $tools = array(), $agentOptions = null, $logger = null)
    {
        if(!$agentOptions) {
            $agentOptions = new AgentOptions();
        }
        // Fall back to the options' logger only when it is actually callable.
        if(!$logger && isset($agentOptions->logger) && is_callable($agentOptions->logger)) {
            $logger = $agentOptions->logger;
        }
        // No reference configured: generate one from the file name, a timestamp and uniqid().
        if(!$agentOptions->reference()) {
            $agentOptions->reference = "TableInterpreterAgent_".basename($pdfPath)."_".date("YmdHis")."_".uniqid();
        }
        $this->logger = $logger;
        $this->agent = new OCRAgent($pdfPath, $agentOptions, $logger);
        $this->agent->instructions(self::$instructions);

        if (!empty($tools)) {
            $this->agent->tools($tools);
        }
    }

    /**
     * Send the TSV baseline, the table candidates and (optionally) the planner
     * plan to the agent, then run it.
     *
     * @param OcrEngineResult $bundle   Supplies baselineFramed() and tableCandidatesFramed().
     * @param string|null     $planJson Planner output; skipped when null or blank after trim.
     * @return string JSON string
     */
    public function run(OcrEngineResult $bundle, $planJson = null)
    {
        $this->log("TableInterpreterAgent::run");
        $msg = array();

        $msg[] = "TSV_BASELINE:\n" . $bundle->baselineFramed();
        $msg[] = "TABLE_CANDIDATES:\n" . $bundle->tableCandidatesFramed();

        if ($planJson !== null && trim((string)$planJson) !== '') {
            $msg[] = "PLANNER_PLAN_JSON:\n" . (string)$planJson;
        }

        $this->agent->message($msg);

        return $this->agent->run();
    }

    /**
     * Forward a message to the configured logger, if there is a usable one.
     *
     * The constructor stores whatever $logger it was given without validating
     * it, so the callable check happens here: a non-callable value would make
     * call_user_func() emit a warning (PHP 5/7) or throw a TypeError (PHP 8).
     *
     * @param string $msg
     * @return void
     */
    protected function log($msg)
    {
        if (is_callable($this->logger)) {
            call_user_func($this->logger, $msg);
        }
    }
}
