<?php

namespace boru\ocr\Agent;

use boru\ocr\Agent\OCRAgent;
use boru\ocr\OcrEngineResult;

/**
 * Final-stage OCR agent.
 *
 * Wraps an OCRAgent that is configured with the "final output" prompt
 * (see self::$instructions) and, optionally, a set of tools. run() builds
 * the list of message sections from an OcrEngineResult and hands it to the
 * wrapped agent.
 */
class FinalAgent
{
    /** @var OCRAgent Wrapped agent that receives the instructions, tools and messages. */
    protected $agent;

    /** @var callable|null Optional logger callback; invoked with a single message argument. */
    protected $logger = null;

    /**
     * Prompt passed to the wrapped agent via instructions() in the constructor.
     *
     * NOTE(review): run() may also send CONFIDENCE_HOTSPOTS and
     * TABLE_INTERPRETATION_JSON sections, which this prompt does not describe;
     * consider documenting them here so the model knows how to use them.
     *
     * @var string
     */
    public static $instructions =
"You are the FINAL OCR OUTPUT agent.

You will be given:
- TSV_BASELINE (page-framed text): baseline layout + reading order.
- TESSERACT_TEXT (page-framed text): secondary hint signal.
- TSV evidence tools:
  - tsv_find_text
  - tsv_get_near_match_text
  - tsv_get_box_text
  - tsv_get_line_text

OUTPUT FORMAT (must be exact):
[BEGIN DOCUMENT OCR OUTPUT]
[page 1]
...
[end of page 1]
...
[END DOCUMENT OCR OUTPUT]

Hard rules:
- Do NOT add any header/footer outside the format.
- Do NOT invent/hallucinate. Only output text supported by TSV_BASELINE, TESSERACT_TEXT, or TSV tool evidence.
- Keep page boundaries exactly as provided (page count must match input framing).
- Preserve baseline ordering unless TSV evidence strongly indicates a correction.

Primary strategy (follow in this order):
1) Start from TSV_BASELINE as the draft for each page.
2) Use TESSERACT_TEXT only to spot likely missed areas (numbers, short labels, tiny headers).
3) Use TSV tools ONLY for high-value corrections:
   - numeric fields, totals, dates, case/file numbers, account IDs, phone numbers,
     addresses, signatures/stamps labels, table cells/headers.
   Do not tool-check every sentence.

Tool usage playbook (do this exactly):
A) Anchor-first:
   - Call tsv_find_text with a short, distinctive anchor (e.g., 'Total', 'Invoice', 'Case', 'Account', 'DOB').
   - If matches found, prefer the best match (closest context, not repeated footer/header).
B) Near-context second:
   - For a chosen match, call tsv_get_near_match_text (padX ~ 120-180, padY ~ 80-140).
   - Use that snippet to correct the nearby baseline segment.
C) Box when anchors fail:
   - If no good anchor exists (tables/graphs), call tsv_get_box_text with a targeted region.
   - Keep boxes small and localized; avoid full-page boxes unless absolutely necessary.
D) Line tool for precise reconstruction:
   - If you need just one line, use tsv_get_line_text with the lineKey from a match result.

Limits (to prevent tool spam):
- Prefer <= 8 total TSV tool calls for the entire document.
- If the document is long, focus TSV tool calls on the most important pages/sections (headers, totals, key tables).
- If uncertain after one follow-up tool call, keep the baseline text rather than guessing.

Planner plan (if provided):
- If PLANNER_PLAN_JSON is present, follow it first:
  - Execute suggested anchors/boxes only as needed (do not blindly execute all).
  - Use it to prioritize where to spend tool calls.

How to apply corrections:
- Correct within the page text in-place (same page).
- Keep whitespace reasonable; do not over-format.
- For tables: preserve row order; use spaces to separate columns (no need to draw ASCII borders).

Return ONLY the final framed OCR output.";


    /**
     * Builds the wrapped OCRAgent and configures it with the final-output prompt.
     *
     * @param string $pdfPath Path of the PDF; its basename is used in the generated reference.
     * @param array $tools ToolDefinition[] Registered on the agent only when non-empty.
     * @param AgentOptions|null $agentOptions Options object; any falsy value (null, empty array)
     *                                        is replaced by a fresh AgentOptions.
     * @param callable|null $logger Optional logger callback. When omitted, a callable
     *                              $agentOptions->logger is used instead.
     *
     * @throws \InvalidArgumentException When $agentOptions is a non-empty value that is not an object.
     */
    public function __construct($pdfPath, array $tools = array(), $agentOptions = null, $logger = null)
    {
        if (!$agentOptions) {
            $agentOptions = new AgentOptions();
        }

        // The code below calls methods on $agentOptions, so a non-empty array (or any
        // other non-object) would otherwise die with an opaque "member function on array"
        // error. Fail early with a message that names the actual problem.
        if (!is_object($agentOptions)) {
            throw new \InvalidArgumentException(
                'FinalAgent expects $agentOptions to be an AgentOptions instance or null, got '
                . gettype($agentOptions)
            );
        }

        // Fall back to the logger carried by the options when none was passed explicitly.
        if (!$logger && isset($agentOptions->logger) && is_callable($agentOptions->logger)) {
            $logger = $agentOptions->logger;
        }

        // Give the run a unique reference when the caller did not supply one.
        if (!$agentOptions->reference()) {
            $agentOptions->reference = "FinalAgent_".basename($pdfPath)."_".date("YmdHis")."_".uniqid();
        }

        $this->logger = $logger;
        $this->agent = new OCRAgent($pdfPath, $agentOptions, $logger);
        $this->agent->instructions(self::$instructions);

        if (!empty($tools)) {
            $this->agent->tools($tools);
        }
    }

    /**
     * Assembles the message sections for the wrapped agent and runs it.
     *
     * Sections are appended in this order: page-count constraint (only when the
     * bundle reports at least one baseline page), CONFIDENCE_HOTSPOTS (only when
     * the bundle has a confidence object and the hotspots can be JSON-encoded),
     * TSV_BASELINE, TESSERACT_TEXT, PLANNER_PLAN_JSON (if non-blank),
     * TABLE_INTERPRETATION_JSON (if non-blank), and the closing instruction.
     *
     * @param OcrEngineResult $bundle
     * @param string|null $planJson Optional planner plan; ignored when null or blank.
     * @param string|null $tableJson Optional table interpretation; ignored when null or blank.
     * @return string final framed output (whatever the wrapped agent's run() returns)
     */
    public function run(OcrEngineResult $bundle, $planJson = null, $tableJson = null)
    {
        $this->log("FinalAgent::run");
        $msg = array();

        // Page count guardrail: fetch the pages once and reuse the result.
        $pages = $bundle->baselinePages();
        $pageCount = is_array($pages) ? count($pages) : 0;

        if ($pageCount > 0) {
            $msg[] =
                "IMPORTANT CONSTRAINT:\n" .
                "The document has EXACTLY {$pageCount} pages.\n" .
                "You MUST output exactly {$pageCount} pages.\n" .
                "Do not merge, split, omit, or add pages.";
        }

        $conf = $bundle->confidence();
        if ($conf) {
            $hotspots = array();
            // NOTE(review): 12 is presumably the maximum number of hotspots requested — confirm.
            foreach ($conf->hotspots(12) as $h) {
                $hotspots[] = $h->toArray();
            }

            // OCR-derived values can contain malformed UTF-8, which makes json_encode()
            // return false. Ask for partial output so the remaining hotspot data still
            // reaches the agent instead of a header followed by nothing.
            $flags = JSON_PRETTY_PRINT;
            if (defined('JSON_PARTIAL_OUTPUT_ON_ERROR')) {
                $flags |= JSON_PARTIAL_OUTPUT_ON_ERROR;
            }
            $hotspotJson = json_encode($hotspots, $flags);

            if (is_string($hotspotJson)) {
                $msg[] = "CONFIDENCE_HOTSPOTS (use these to focus TSV tool calls):\n"
                    . $hotspotJson;
            } else {
                // Drop the section rather than sending an empty, misleading header.
                $this->log(
                    "FinalAgent::run could not encode confidence hotspots (json error code "
                    . json_last_error() . "); section skipped"
                );
            }
        }

        $msg[] = "TSV_BASELINE:\n" . $bundle->baselineFramed();
        $msg[] = "TESSERACT_TEXT:\n" . $bundle->textFramed();

        if ($planJson !== null && trim((string)$planJson) !== '') {
            $msg[] = "PLANNER_PLAN_JSON:\n" . (string)$planJson;
        }

        if ($tableJson !== null && trim((string)$tableJson) !== '') {
            $msg[] = "TABLE_INTERPRETATION_JSON:\n" . (string)$tableJson;
        }

        $msg[] = "Produce the final framed OCR output now.";

        $this->agent->message($msg);
        return $this->agent->run();
    }

    /**
     * Forwards a message to the logger callback, if one is configured.
     *
     * @param mixed $msg Passed to the callback unchanged.
     * @return void
     */
    protected function log($msg)
    {
        if (is_callable($this->logger)) {
            call_user_func($this->logger, $msg);
        }
    }
}
