from __future__ import annotations
import os, json, time
from typing import Any
import yaml
import pandas as pd
from tqdm import tqdm

from schemas import ConversationReport, BatchSummary, EntitiesByCategory, EntityItem, Evidence, PatientPerspective, EntityPresenceSentiment

from pdf_text import load_folder, group_conversations, merge_conversation_text
from llm_ollama import ollama_generate
from dedupe import dedupe_strings

# System rules prepended (via {rules}) to every prompt. A single rule set
# covers both the open extraction task and the predefined-entity checklist,
# so one LLM call can handle both.
SYSTEM_RULES = """
You are a medical transcript analyzer extracting structured information from diabetes eye care interviews.

CRITICAL RULES:
1) Extract ONLY information explicitly stated in the transcript
2) Do NOT invent, infer, or assume anything
3) Include SHORT exact quotes (max 15 words) as evidence
4) Return ONLY valid JSON (no markdown, no preamble, no explanations)
5) If information is not present, use empty lists []
"""

# One comprehensive prompt extracting questions, entities, patient
# perspective, and the 24-row presence/sentiment checklist in a single pass
# (replaces two separate prompts, halving LLM calls per conversation).
# str.format placeholders: {rules}, {entity_list_json}, {transcript}.
# NOTE: every literal brace in the embedded JSON examples is doubled
# ({{ }}) so .format() leaves it intact — keep that when editing.
COMPREHENSIVE_PROMPT_TEMPLATE = """
{rules}

TASK: Extract ALL the following information from the transcript in ONE pass:

A) QUESTIONS: What questions did the interviewer ask? (templates, not exact wording)
B) ENTITIES: Extract entities in these 6 categories with evidence
C) PATIENT PERSPECTIVE: Patient's description of occurrence, severity, concerns, goals
D) PREDEFINED ENTITY CHECKLIST: Mark EVERY predefined entity as present/absent + sentiment

---
PREDEFINED ENTITIES (YOU MUST CHECK ALL 24):
{entity_list_json}
---

FEW-SHOT EXAMPLE:

Input Transcript:
"S1: Do you have blurred vision? S2: Yes, it's been getting worse. I'm really worried about going blind. 
S1: Have you had your eyes dilated? S2: Yes, today. S1: How's your A1C? S2: I'm keeping it around 7."

Expected Output:
{{
  "questions_asked": ["Do you have [symptom]?", "Have you had [procedure]?", "How is your [health measure]?"],
  
  "entities": [
    {{
      "category": "Symptoms",
      "items": [
        {{"name": "blurred vision", "evidence": [{{"quote": "blurred vision getting worse", "speaker": "S2"}}]}}
      ]
    }},
    {{"category": "Ophthalmic Findings", "items": []}},
    {{"category": "Diagnostic Tools", "items": [{{"name": "dilated eye exam", "evidence": [{{"quote": "had my eyes dilated today", "speaker": "S2"}}]}}]}},
    {{"category": "Systemic Risk Factors", "items": [{{"name": "HbA1c", "evidence": [{{"quote": "keeping it around 7", "speaker": "S2"}}]}}]}},
    {{"category": "Treatment Options", "items": []}},
    {{"category": "Demographics/History", "items": []}}
  ],
  
  "patient_perspective": {{
    "occurrence": [],
    "severity": ["getting worse"],
    "concerns": ["worried about going blind"],
    "goals": [],
    "occurrence_evidence": [],
    "severity_evidence": [{{"quote": "been getting worse", "speaker": "S2"}}],
    "concerns_evidence": [{{"quote": "really worried about going blind", "speaker": "S2"}}],
    "goals_evidence": []
  }},
  
  "entity_presence_sentiment": [
    {{"category": "Symptoms", "entity": "blurred vision", "present": true, "sentiment": "negative", "evidence": [{{"quote": "getting worse really worried", "speaker": "S2"}}]}},
    {{"category": "Symptoms", "entity": "fluctuating vision", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Symptoms", "entity": "floaters", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Symptoms", "entity": "poor night vision", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Symptoms", "entity": "faded colors", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "microaneurysms", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "cotton wool spots", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "retinal hemorrhages", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "macular edema", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "neovascularization", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Ophthalmic Findings", "entity": "scar tissue", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Diagnostic Tools", "entity": "dilated eye exam", "present": true, "sentiment": "neutral", "evidence": [{{"quote": "had my eyes dilated today", "speaker": "S2"}}]}},
    {{"category": "Diagnostic Tools", "entity": "OCT", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Diagnostic Tools", "entity": "fundus photography", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Diagnostic Tools", "entity": "fluorescein angiography", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Systemic Risk Factors", "entity": "HbA1c", "present": true, "sentiment": "positive", "evidence": [{{"quote": "keeping it around 7", "speaker": "S2"}}]}},
    {{"category": "Systemic Risk Factors", "entity": "blood pressure", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Systemic Risk Factors", "entity": "cholesterol", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Treatment Options", "entity": "intravitreal injection", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Treatment Options", "entity": "laser treatment", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Treatment Options", "entity": "vitrectomy", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Demographics/History", "entity": "duration of diabetes", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Demographics/History", "entity": "age over 65", "present": false, "sentiment": "unknown", "evidence": []}},
    {{"category": "Demographics/History", "entity": "smoking", "present": false, "sentiment": "unknown", "evidence": []}}
  ]
}}

NOTICE: The entity_presence_sentiment array has EXACTLY 24 rows - one for each predefined entity!

---
CRITICAL INSTRUCTIONS FOR ENTITY_PRESENCE_SENTIMENT:

1. You MUST return EXACTLY 24 rows in entity_presence_sentiment
2. One row for EACH predefined entity listed above
3. For EACH entity, determine:
   - present: true (if mentioned in transcript) OR false (if not mentioned)
   - sentiment: patient's attitude if present (positive/neutral/negative/mixed), otherwise "unknown"
   - evidence: quote if present (5-15 words), otherwise empty list []

4. DO NOT skip any entities. Even if not mentioned, include them with present=false

5. Evidence quotes must be:
   - 5-15 words long
   - Direct quotes from transcript
   - Grammatically complete
   - If quote not available, leave evidence as []

6. Sentiment guidelines:
   - positive: patient expresses satisfaction, good control, improvement
   - neutral: mentioned factually without emotion
   - negative: patient expresses worry, worsening, dissatisfaction
   - mixed: both positive and negative aspects
   - unknown: not mentioned (present=false) OR mentioned but no sentiment expressed

---
REQUIRED JSON SCHEMA:
{{
  "questions_asked": ["template 1", "template 2", ...],
  
  "entities": [
    {{"category": "Symptoms", "items": [...]}},
    {{"category": "Ophthalmic Findings", "items": [...]}},
    {{"category": "Diagnostic Tools", "items": [...]}},
    {{"category": "Systemic Risk Factors", "items": [...]}},
    {{"category": "Treatment Options", "items": [...]}},
    {{"category": "Demographics/History", "items": [...]}}
  ],
  
  "patient_perspective": {{
    "occurrence": ["when/how often/triggers"],
    "severity": ["how bad/impact"],
    "concerns": ["worries/fears"],
    "goals": ["what patient wants"],
    "occurrence_evidence": [{{"quote": "...", "speaker": "S2"}}],
    "severity_evidence": [{{"quote": "...", "speaker": "S2"}}],
    "concerns_evidence": [{{"quote": "...", "speaker": "S2"}}],
    "goals_evidence": [{{"quote": "...", "speaker": "S2"}}]
  }},
  
  "entity_presence_sentiment": [
    // ⚠️ MUST BE EXACTLY 24 ROWS - ONE FOR EACH PREDEFINED ENTITY
    {{"category": "Symptoms", "entity": "blurred vision", "present": true/false, "sentiment": "...", "evidence": [...]}},
    {{"category": "Symptoms", "entity": "fluctuating vision", "present": true/false, "sentiment": "...", "evidence": [...]}},
    // ... all 24 entities
  ]
}}

---
NOW ANALYZE THIS TRANSCRIPT:
<<<
{transcript}
>>>

REMEMBER:
- Return ONLY valid JSON (no markdown, no ```json, no extra text)
- entity_presence_sentiment MUST have EXACTLY 24 rows
- Check every predefined entity, even if not mentioned (mark as present=false)
- Evidence quotes must be 5-15 words or empty []
"""

def safe_json_load(s: str) -> Any:
    """Parse LLM output into JSON, tolerating markdown fences and preamble text.

    Args:
        s: Raw LLM response (may be None, empty, fenced in ```/```json,
           or surrounded by extra prose).

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: if the response is empty or no JSON object can be parsed.
    """
    s = (s or "").strip()

    if not s:
        raise ValueError("LLM returned EMPTY response")

    # Strip a markdown code fence by trimming only the fence lines.
    # (The previous version did a global replace of "```", which could
    # corrupt any payload that legitimately contained backticks.)
    if s.startswith("```"):
        first_nl = s.find("\n")
        s = s[first_nl + 1:] if first_nl != -1 else ""
        stripped = s.rstrip()
        if stripped.endswith("```"):
            s = stripped[:-3]
        s = s.strip()

    # Fast path: the whole response is valid JSON.
    try:
        return json.loads(s)
    except json.JSONDecodeError as e:
        # Fallback: extract the outermost {...} span (handles preamble/epilogue text).
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(s[start:end + 1])
            except json.JSONDecodeError:
                pass  # fall through to the diagnostic below

        # Still failing: surface a helpful diagnostic with the original cause chained.
        print(f"❌ JSON parsing failed: {e}")
        print(f"Response preview (first 500 chars): {s[:500]}")
        raise ValueError(f"Could not parse LLM response as JSON: {e}") from e

def normalize_entities(raw_entities) -> list[EntitiesByCategory]:
    """Coerce raw LLM entity output into one EntitiesByCategory per allowed category.

    Unknown categories and malformed (non-dict) entries are silently dropped;
    every allowed category is always present in the result, possibly empty.
    If the same category appears twice in the input, the last one wins.
    """
    allowed = (
        "Symptoms",
        "Ophthalmic Findings",
        "Diagnostic Tools",
        "Systemic Risk Factors",
        "Treatment Options",
        "Demographics/History",
    )

    collected = {name: [] for name in allowed}

    for block in (raw_entities if isinstance(raw_entities, list) else []):
        if not isinstance(block, dict):
            continue
        label = block.get("category")
        if label not in collected:
            continue

        parsed_items = []
        for entry in (block.get("items") or []):
            if not isinstance(entry, dict):
                continue
            evidence = [
                Evidence(quote=(ev.get("quote", "")).strip(), speaker=ev.get("speaker"))
                for ev in (entry.get("evidence") or [])
                if isinstance(ev, dict)
            ]
            parsed_items.append(EntityItem(name=(entry.get("name", "")).strip(), evidence=evidence))

        collected[label] = parsed_items

    return [EntitiesByCategory(category=name, items=collected[name]) for name in allowed]


def normalize_patient_perspective(raw_pp: dict) -> PatientPerspective:
    """Coerce the raw LLM patient-perspective dict into a PatientPerspective model.

    Missing keys and ``null`` values default to empty lists. Malformed
    evidence entries (non-dicts) are skipped, matching the guards in
    normalize_entities / normalize_presence_rows — previously a non-dict
    entry crashed on ``e.get``.
    """
    def ev_list(key: str) -> list[Evidence]:
        evs = []
        for e in (raw_pp.get(key, []) or []):
            if not isinstance(e, dict):
                continue  # skip malformed evidence entries instead of crashing
            # `or ""` also guards against an explicit "quote": null
            evs.append(Evidence(quote=(e.get("quote") or "").strip(), speaker=e.get("speaker")))
        return evs

    return PatientPerspective(
        occurrence=raw_pp.get("occurrence", []) or [],
        severity=raw_pp.get("severity", []) or [],
        concerns=raw_pp.get("concerns", []) or [],
        goals=raw_pp.get("goals", []) or [],
        occurrence_evidence=ev_list("occurrence_evidence"),
        severity_evidence=ev_list("severity_evidence"),
        concerns_evidence=ev_list("concerns_evidence"),
        goals_evidence=ev_list("goals_evidence"),
    )

def config_entity_list_json(cfg: dict) -> str:
    """Serialize the config's predefined-entities mapping as pretty-printed JSON.

    The result is injected into the prompt's {entity_list_json} placeholder.
    Returns "{}" when the key is absent or null.
    """
    predefined = cfg.get("predefined_entities") or {}
    return json.dumps(predefined, indent=2, ensure_ascii=False)

def normalize_presence_rows(rows) -> list[EntityPresenceSentiment]:
    """Coerce raw LLM presence/sentiment rows into EntityPresenceSentiment models.

    Tolerates malformed output: non-list input yields [], non-dict rows and
    evidence entries are skipped, and explicit ``null`` values (which
    json.loads yields as None) fall back to safe defaults — previously
    ``"entity": null`` crashed on ``None.strip()`` and a null sentiment
    leaked through as None.
    """
    if not isinstance(rows, list):
        return []
    out: list[EntityPresenceSentiment] = []
    for r in rows:
        if not isinstance(r, dict):
            continue
        evs = []
        for e in (r.get("evidence") or []):
            if isinstance(e, dict):
                # `or ""` guards against "quote": null as well as a missing key
                evs.append(Evidence(quote=(e.get("quote") or "").strip(), speaker=e.get("speaker")))
        out.append(EntityPresenceSentiment(
            category=r.get("category"),
            entity=(r.get("entity") or "").strip(),
            present=bool(r.get("present", False)),
            sentiment=r.get("sentiment") or "unknown",
            evidence=evs,
        ))
    return out

def estimate_token_count(text: str) -> int:
    """Cheap token estimate using the common ~4-characters-per-token heuristic."""
    char_count = len(text)
    return char_count // 4

def chunk_transcript_if_needed(text: str, max_tokens: int = 10000) -> list[str]:
    """Split an over-long transcript into line-aligned chunks to avoid LLM timeouts.

    A transcript within the budget is returned as a single-element list.
    Otherwise lines are greedily packed so each chunk stays at (roughly)
    max_tokens; a single line longer than the budget becomes its own chunk.
    Token counts use the ~4-chars-per-token heuristic (inlined from
    estimate_token_count).
    """
    if len(text) // 4 <= max_tokens:
        return [text]

    chunks: list[str] = []
    pending: list[str] = []
    budget_used = 0

    for row in text.split("\n"):
        cost = len(row) // 4
        if pending and budget_used + cost > max_tokens:
            # Current chunk is full — flush it and start a new one with this line.
            chunks.append("\n".join(pending))
            pending = [row]
            budget_used = cost
        else:
            pending.append(row)
            budget_used += cost

    if pending:
        chunks.append("\n".join(pending))

    return chunks

def merge_chunked_results(chunk_results: list[dict]) -> dict:
    """Merge per-chunk extraction dicts into a single result dict.

    Questions and patient-perspective lists are concatenated; entity items
    are combined per category; presence rows are combined per
    (category, entity) pair — evidence accumulates, `present` becomes True
    if any chunk saw the entity, and a known sentiment overrides "unknown".

    Fixes over the previous version:
    - input chunk dicts are no longer mutated (category/presence dicts were
      appended and extended by reference);
    - presence rows are keyed by (category, entity), not entity name alone;
    - a chunk with a missing sentiment key no longer overwrites a known
      sentiment with None (None != "unknown" was truthy before).
    """
    merged: dict = {
        "questions_asked": [],
        "entities": [],
        "patient_perspective": {
            "occurrence": [], "severity": [], "concerns": [], "goals": [],
            "occurrence_evidence": [], "severity_evidence": [],
            "concerns_evidence": [], "goals_evidence": []
        },
        "entity_presence_sentiment": []
    }

    # Index the output lists for O(1) lookup while preserving insertion order.
    entities_by_cat: dict = {}
    presence_by_key: dict = {}

    pp_keys = (
        "occurrence", "severity", "concerns", "goals",
        "occurrence_evidence", "severity_evidence",
        "concerns_evidence", "goals_evidence",
    )

    for result in chunk_results:
        merged["questions_asked"].extend(result.get("questions_asked") or [])

        # Entities: combine items from the same category into a fresh dict.
        for cat in result.get("entities") or []:
            category_name = cat.get("category")
            bucket = entities_by_cat.get(category_name)
            if bucket is None:
                bucket = {"category": category_name, "items": []}
                entities_by_cat[category_name] = bucket
                merged["entities"].append(bucket)
            bucket["items"].extend(cat.get("items") or [])

        # Patient perspective: straight concatenation of all eight lists.
        pp = result.get("patient_perspective") or {}
        for key in pp_keys:
            merged["patient_perspective"][key].extend(pp.get(key) or [])

        # Presence/sentiment: combine rows for the same (category, entity).
        for item in result.get("entity_presence_sentiment") or []:
            key = (item.get("category"), item.get("entity"))
            row = presence_by_key.get(key)
            if row is None:
                row = {
                    "category": item.get("category"),
                    "entity": item.get("entity"),
                    "present": bool(item.get("present", False)),
                    "sentiment": item.get("sentiment") or "unknown",
                    "evidence": list(item.get("evidence") or []),
                }
                presence_by_key[key] = row
                merged["entity_presence_sentiment"].append(row)
            else:
                row["evidence"].extend(item.get("evidence") or [])
                if item.get("present"):
                    row["present"] = True
                sentiment = item.get("sentiment")
                if sentiment and sentiment != "unknown":
                    row["sentiment"] = sentiment

    return merged

def _load_existing_reports(out_dir: str) -> list[ConversationReport]:
    """Load every per-conversation report JSON already present in *out_dir*."""
    reports: list[ConversationReport] = []
    for fn in os.listdir(out_dir):
        # batch_summary.json is the only non-report .json this pipeline writes
        if fn.endswith(".json") and fn not in ("batch_summary.json",):
            with open(os.path.join(out_dir, fn), "r", encoding="utf-8") as f:
                reports.append(ConversationReport.model_validate_json(f.read()))
    return reports


def _write_summary_and_csvs(
    conversation_reports: list[ConversationReport],
    out_dir: str,
    dedupe_threshold: int = 90,
) -> BatchSummary:
    """Write batch_summary.json plus the four CSV outputs from the given reports.

    Shared by rebuild mode and fresh runs so both emit identical files.
    (The previous code duplicated this logic in both paths and used
    inconsistent dedupe thresholds — 95 on rebuild vs 90 on a fresh run —
    so rebuilding could silently change global_questions.csv.)
    """
    all_questions: list[str] = []
    for r in conversation_reports:
        all_questions.extend(r.questions_asked)
    global_questions = dedupe_strings(all_questions, threshold=dedupe_threshold)

    summary = BatchSummary(
        total_unique_conversations=len(conversation_reports),
        conversation_ids=[r.conversation_id for r in conversation_reports],
        global_distinct_questions=global_questions,
    )
    with open(os.path.join(out_dir, "batch_summary.json"), "w", encoding="utf-8") as f:
        f.write(summary.model_dump_json(indent=2))

    # patient_perspective.csv — one row per conversation
    rows = []
    for r in conversation_reports:
        rows.append({
            "conversation_id": r.conversation_id,
            "source_files": "; ".join(r.source_files),
            "occurrence": " | ".join(r.patient_perspective.occurrence),
            "severity": " | ".join(r.patient_perspective.severity),
            "concerns": " | ".join(r.patient_perspective.concerns),
            "goals": " | ".join(r.patient_perspective.goals),
        })
    pd.DataFrame(rows).to_csv(os.path.join(out_dir, "patient_perspective.csv"), index=False)

    # entities.csv — one row per extracted entity (at most 3 evidence quotes)
    e_rows = []
    for r in conversation_reports:
        for cat in r.entities:
            for item in cat.items:
                quotes = " || ".join([e.quote for e in item.evidence][:3])
                e_rows.append({
                    "conversation_id": r.conversation_id,
                    "category": cat.category,
                    "entity": item.name,
                    "evidence_quotes": quotes,
                })
    pd.DataFrame(e_rows).to_csv(os.path.join(out_dir, "entities.csv"), index=False)

    # entity_presence_sentiment.csv — MAIN TABULATION (at most 2 evidence quotes)
    ps_rows = []
    for r in conversation_reports:
        for row in (r.entity_presence_sentiment or []):
            quotes = " || ".join([e.quote for e in row.evidence][:2])
            ps_rows.append({
                "conversation_id": r.conversation_id,
                "category": row.category,
                "entity": row.entity,
                "present": row.present,
                "sentiment": row.sentiment,
                "evidence_quotes": quotes,
            })
    pd.DataFrame(ps_rows).to_csv(os.path.join(out_dir, "entity_presence_sentiment.csv"), index=False)

    # global_questions.csv — deduped question templates across the whole batch
    pd.DataFrame({"question": global_questions}).to_csv(os.path.join(out_dir, "global_questions.csv"), index=False)

    return summary


def _generate_analysis(transcript: str, entity_list_json: str, llm_kwargs: dict) -> str:
    """Render the comprehensive prompt for one transcript (or chunk) and return the raw LLM reply."""
    prompt = COMPREHENSIVE_PROMPT_TEMPLATE.format(
        rules=SYSTEM_RULES,
        entity_list_json=entity_list_json,
        transcript=transcript,
    )
    return ollama_generate(prompt=prompt, **llm_kwargs)


def main(config_path: str = "config.yaml", only_ids: list[str] | None = None):
    """Run the extraction pipeline end to end.

    Loads config, groups PDF transcripts into conversations, runs one
    comprehensive LLM extraction per conversation (chunked if the transcript
    exceeds max_transcript_tokens), and writes per-conversation JSON plus
    batch-level CSV outputs.

    Args:
        config_path: Path to the YAML config file.
        only_ids: If given, process only these conversation IDs.

    Config keys used: input_dir, output_dir, model, ollama_host, num_ctx,
    temperature, conversation_timeout, max_transcript_tokens,
    predefined_entities, rebuild_only.
    """
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    input_dir = cfg.get("input_dir")
    out_dir = cfg.get("output_dir", "outputs")
    os.makedirs(out_dir, exist_ok=True)

    model = cfg.get("model", "llama3.1:latest")
    host = cfg.get("ollama_host", "http://localhost:11434")
    num_ctx = int(cfg.get("num_ctx", 12288))
    temperature = float(cfg.get("temperature", 0.1))
    timeout = int(cfg.get("conversation_timeout", 1800))
    max_transcript_tokens = int(cfg.get("max_transcript_tokens", 10000))
    llm_kwargs = dict(model=model, host=host, num_ctx=num_ctx, temperature=temperature, timeout=timeout)

    docs = load_folder(input_dir)
    groups = group_conversations(docs)

    print(f"\n{'='*60}")
    print(f"📊 EXTRACTION PIPELINE STARTING")
    print(f"{'='*60}")
    print(f"Model: {model}")
    print(f"Context window: {num_ctx} tokens")
    print(f"Temperature: {temperature}")
    print(f"Timeout per conversation: {timeout}s ({timeout//60} minutes)")
    print(f"Total conversations found: {len(groups)}")
    print(f"{'='*60}\n")

    # REBUILD MODE: regenerate summary/CSVs from existing JSONs, no LLM calls.
    if cfg.get("rebuild_only", False):
        print("🔄 REBUILD MODE: Loading existing JSONs and regenerating CSVs...\n")
        reports = _load_existing_reports(out_dir)
        summary = _write_summary_and_csvs(reports, out_dir)
        print("\n✅ REBUILD DONE (no LLM calls)")
        print(f"- Total conversations: {summary.total_unique_conversations}")
        print(f"- Outputs saved to: {out_dir}")
        return

    # MAIN PROCESSING LOOP — one comprehensive LLM extraction per conversation.
    conversation_reports: list[ConversationReport] = []
    entity_list_json = config_entity_list_json(cfg)

    for idx, (cid, parts) in enumerate(groups, 1):
        if only_ids and cid not in only_ids:
            continue

        # Resume support: a conversation with an existing JSON is loaded, not re-run.
        out_path = os.path.join(out_dir, f"{cid}.json")
        if os.path.exists(out_path):
            print(f"[{idx}/{len(groups)}] ⏭️  Skipping {cid} (already processed)")
            with open(out_path, "r", encoding="utf-8") as f:
                conversation_reports.append(ConversationReport.model_validate_json(f.read()))
            continue

        print(f"\n{'─'*60}")
        print(f"[{idx}/{len(groups)}] 📄 Processing: {cid}")
        print(f"{'─'*60}")
        print(f"Source files: {', '.join([p.filename for p in parts])}")

        start_time = time.time()
        merged = merge_conversation_text(parts)

        tokens = estimate_token_count(merged)
        print(f"Transcript size: ~{tokens:,} tokens")

        chunks = chunk_transcript_if_needed(merged, max_transcript_tokens)
        if len(chunks) > 1:
            print(f"⚠️  Long transcript! Split into {len(chunks)} chunks for processing")

        try:
            raw_path = os.path.join(out_dir, f"{cid}_RAW.txt")
            if len(chunks) == 1:
                # Single LLM call; keep the raw reply for debugging.
                resp = _generate_analysis(merged, entity_list_json, llm_kwargs)
                with open(raw_path, "w", encoding="utf-8") as f:
                    f.write(resp or "")
                data = safe_json_load(resp)
            else:
                # One LLM call per chunk, then merge; the merged dict is the "raw" artifact.
                chunk_results = []
                for chunk_idx, chunk in enumerate(chunks, 1):
                    print(f"  Processing chunk {chunk_idx}/{len(chunks)}...")
                    resp = _generate_analysis(chunk, entity_list_json, llm_kwargs)
                    chunk_results.append(safe_json_load(resp))
                data = merge_chunked_results(chunk_results)
                with open(raw_path, "w", encoding="utf-8") as f:
                    f.write(json.dumps(data, indent=2))

            # Normalize the loosely-structured LLM output into pydantic models.
            questions = dedupe_strings(data.get("questions_asked", []) or [], threshold=90)
            entities = normalize_entities(data.get("entities", []) or [])
            pp = normalize_patient_perspective(data.get("patient_perspective", {}) or {})
            presence_rows = normalize_presence_rows(data.get("entity_presence_sentiment", []))

            report = ConversationReport(
                conversation_id=cid,
                source_files=[p.filename for p in parts],
                questions_asked=questions,
                entities=entities,
                patient_perspective=pp,
                entity_presence_sentiment=presence_rows,
            )
            conversation_reports.append(report)

            with open(out_path, "w", encoding="utf-8") as f:
                f.write(report.model_dump_json(indent=2))

            elapsed = time.time() - start_time
            print(f"✅ Completed in {elapsed:.1f}s ({elapsed/60:.1f} minutes)")
            print(f"   - Questions found: {len(questions)}")
            print(f"   - Entities found: {sum(len(cat.items) for cat in entities)}")
            print(f"   - Predefined entities checked: {len(presence_rows)}")

        except Exception as e:
            # One bad conversation (LLM timeout, unparseable JSON, validation
            # error) must not abort the whole batch.
            print(f"❌ FAILED for conversation {cid}: {e}")
            print(f"   Continuing to next conversation...")
            continue

    # Batch-level summary and CSV outputs (shared with rebuild mode).
    print(f"\n{'='*60}")
    print(f"📊 GENERATING SUMMARY AND CSV OUTPUTS")
    print(f"{'='*60}\n")

    summary = _write_summary_and_csvs(conversation_reports, out_dir)

    print("\n" + "="*60)
    print("✅ ALL PROCESSING COMPLETE!")
    print("="*60)
    print(f"📁 Output directory: {out_dir}")
    print(f"📊 Total conversations processed: {summary.total_unique_conversations}")
    print(f"❓ Total unique questions: {len(summary.global_distinct_questions)}")
    print(f"\n📄 Generated files:")
    print(f"   1. batch_summary.json - Overall statistics")
    print(f"   2. patient_perspective.csv - Patient descriptions")
    print(f"   3. entities.csv - All extracted entities")
    print(f"   4. entity_presence_sentiment.csv - MAIN TABULATION (predefined entities)")
    print(f"   5. global_questions.csv - Distinct questions asked")
    print(f"   6. [ID].json files - Individual conversation reports")
    print("="*60 + "\n")

if __name__ == "__main__":
    main()


# To test on a single conversation instead of the whole batch, run e.g.:
# if __name__ == "__main__":
#     main(only_ids=["BN1103"])