"""
Inductive Thematic Analysis for Barriers & Facilitators
Discovers themes from patient interviews without predefined categories
"""
from __future__ import annotations
import os
import json
from typing import List, Dict, Any
from dataclasses import dataclass
import pandas as pd

from llm_ollama import ollama_generate


@dataclass
class Theme:
    """One emergent theme discovered by the inductive thematic analysis.

    Instances are built in discover_themes() from the LLM's JSON output and
    serialized to JSON/CSV/text by save_inductive_results().
    """
    theme_name: str  # short descriptive name, ideally in patient language
    theme_description: str  # 2-3 sentence explanation of the theme's meaning
    category_type: str  # "barrier" or "facilitator"
    patient_quotes: List[Dict[str, str]]  # [{"conversation_id": "BN1103", "quote": "...", "context": "..."}]
    frequency: int  # Number of patients mentioning this theme


# Prompt template for the cross-patient theme-discovery pass.
# {combined_excerpts} is filled in via str.format(); literal JSON braces are
# doubled ({{ }}) so .format() leaves them intact.
# NOTE(review): the excerpt/patient counts (233 excerpts, 36 patients) are
# hard-coded into the prompt text — update them if the corpus changes.
INDUCTIVE_ANALYSIS_PROMPT = """
You are an expert qualitative researcher conducting rigorous thematic analysis on diabetic retinopathy patient interviews.

CRITICAL TASK: Perform DEEP INDUCTIVE THEMATIC ANALYSIS to discover 15-25 HIGH-QUALITY emergent themes about barriers and facilitators to follow-up adherence.

QUALITY OVER QUANTITY: It is better to have 15 well-supported themes with unique evidence than 25 themes with recycled quotes.

YOUR APPROACH MUST BE:
1. COMPREHENSIVE - Identify ALL major patterns across the 36 patient conversations
2. NUANCED - Distinguish between related but distinct themes (e.g., "Fear of Needles/Injections" vs "Fear of Vision Loss")
3. DATA-DRIVEN - Theme names must reflect what patients actually say, not researcher assumptions
4. EVIDENCE-BASED - Each theme needs quotes from 3+ different patients minimum

INDUCTIVE DISCOVERY PROCESS:
Step 1: Read ALL 233 excerpts below carefully
Step 2: Identify recurring words, phrases, concepts, emotions, experiences
Step 3: Group similar excerpts into clusters
Step 4: Name each cluster based on the dominant pattern (this becomes your theme)
Step 5: For each theme, extract supporting quotes from DIFFERENT patients
Step 6: Calculate frequency = number of UNIQUE patients mentioning this theme

EXPECTED THEME CATEGORIES (discover what emerges, aim for 15-25 themes total):

BARRIERS (aim for 10-15 barrier themes):
- Transportation & mobility issues
- Financial concerns (cost, insurance, copays, medication prices)
- Healthcare system frustrations (wait times, scheduling, communication)
- Provider/staff interactions (negative experiences)
- Belief-based resistance (asymptomatic, denial, fatalism)
- Knowledge gaps (don't understand severity, consequences)
- Competing life demands (work, caregiving, family responsibilities)
- Psychological barriers (fear, anxiety, depression, overwhelm)
- Physical limitations or comorbidities
- Language or literacy barriers
- Trust issues with medical system
- Any other emergent barrier patterns

FACILITATORS (aim for 5-10 facilitator themes):
- Family/social support systems
- Provider relationship quality (trust, communication, empathy)
- Technology tools (reminders, MyChart, telehealth)
- Educational interventions
- Access conveniences (location, hours, ease of scheduling)
- Financial assistance or insurance
- Personal motivation factors
- Any other emergent facilitator patterns

PATIENT INTERVIEW EXCERPTS (233 excerpts from 36 patients):
<<<
{combined_excerpts}
>>>

OUTPUT FORMAT (JSON) - MUST INCLUDE 15-25 THEMES:
{{
  "discovered_themes": [
    {{
      "theme_name": "Specific, descriptive theme name from patient language",
      "theme_description": "Clear 2-3 sentence explanation of this theme's meaning and significance",
      "category_type": "barrier" or "facilitator",
      "supporting_quotes": [
        {{
          "conversation_id": "PATIENT_ID",
          "quote": "Direct patient quote (5-20 words max) - MUST BE UNIQUE, DO NOT REUSE THE SAME QUOTE FOR MULTIPLE THEMES",
          "context": "Why this quote exemplifies THIS SPECIFIC theme"
        }},
        ... (include 3-8 DIFFERENT quotes from DIFFERENT patients)
      ],
      "frequency": count_of_unique_patients_mentioning_this_theme
    }},
    ... (CONTINUE FOR ALL 15-25 THEMES - DO NOT STOP AT 6)
  ]
}}

CRITICAL QUOTE RULES:
- Each quote can only be used for ONE theme
- If a quote appears in Theme A, it CANNOT appear in Theme B
- Find different quotes that specifically match each theme
- If you cannot find 3+ unique quotes for a theme, DO NOT create that theme
"""

def extract_barrier_facilitator_mentions(transcript: str, conversation_id: str) -> List[Dict[str, str]]:
    """
    Extract any mentions of barriers or facilitators from a single transcript.

    Args:
        transcript: Full interview text for one conversation.
        conversation_id: Identifier attached to every returned excerpt.

    Returns:
        List of dicts with keys "conversation_id", "type", "quote", "topic".
        Returns an empty list if the LLM call or JSON parsing fails
        (best-effort: one bad transcript should not abort the whole batch).
    """
    # Literal JSON braces are doubled because this is an f-string.
    EXTRACTION_PROMPT = f"""
Extract SPECIFIC, CONCRETE statements where the patient discusses barriers or facilitators to eye care follow-up.

BARRIERS = Things that prevent/discourage follow-up (transportation, cost, fear, beliefs, time, etc.)
FACILITATORS = Things that help/encourage follow-up (family support, reminders, good experiences, etc.)

TRANSCRIPT:
<<<
{transcript}
>>>

EXTRACTION RULES:
1. Extract COMPLETE, MEANINGFUL quotes (10-30 words)
2. Include context: WHY is this a barrier/facilitator?
3. Extract SPECIFIC examples, not vague statements
4. Prioritize direct, emotional, or vivid language
5. Get quotes about DIFFERENT topics (don't extract 5 quotes all about cost)

Return JSON:
{{
  "excerpts": [
    {{
      "type": "barrier" or "facilitator",
      "quote": "Complete patient quote with context",
      "topic": "Specific topic (e.g., 'cannot afford medication', 'daughter drives me to appointments', 'afraid of injections')"
    }}
  ]
}}

Extract 5-10 diverse, meaningful excerpts. Return ONLY JSON.
"""

    try:
        response = ollama_generate(
            model="llama3.3:latest",
            prompt=EXTRACTION_PROMPT,
            num_ctx=8192,
            temperature=0.2,  # low temperature: extraction should be faithful, not creative
            timeout=600
        )

        # NOTE(review): assumes the model returns bare JSON with no markdown
        # fencing — confirmed only by the "Return ONLY JSON" instruction above.
        data = json.loads(response)

        excerpts = []
        for item in data.get("excerpts", []):
            # Skip malformed entries with no quote text; downstream formatting
            # in discover_themes() assumes every excerpt has a usable quote.
            if not item.get("quote"):
                continue
            excerpts.append({
                "conversation_id": conversation_id,
                "type": item.get("type"),
                "quote": item.get("quote"),
                "topic": item.get("topic")
            })

        return excerpts

    except Exception as e:
        # Best-effort: log and continue so one bad transcript doesn't stop the run.
        print(f"  ⚠️  Failed to extract from {conversation_id}: {e}")
        return []


def _theme_from_dict(theme_data: Dict[str, Any]) -> Theme:
    """Build a Theme from one LLM-returned theme dict, tolerating missing keys."""
    quotes = [
        {
            "conversation_id": q.get("conversation_id"),
            "quote": q.get("quote"),
            "context": q.get("context", "")
        }
        for q in theme_data.get("supporting_quotes", [])
    ]
    return Theme(
        theme_name=theme_data.get("theme_name", "Unnamed Theme"),
        theme_description=theme_data.get("theme_description", ""),
        category_type=theme_data.get("category_type", "unknown"),
        patient_quotes=quotes,
        # Fall back to quote count when the model omitted an explicit frequency.
        frequency=theme_data.get("frequency", len(quotes)),
    )


def _report_quote_reuse(themes: List[Theme]) -> None:
    """Print a warning when the same quote supports multiple themes.

    Quote reuse violates the prompt's evidence rules and suggests the model
    fabricated themes rather than grounding each one in distinct quotes.
    """
    quote_to_themes: Dict[str, List[str]] = {}
    for theme in themes:
        for q in theme.patient_quotes:
            quote_to_themes.setdefault(q['quote'], []).append(theme.theme_name)

    reused_quotes = {q: names for q, names in quote_to_themes.items() if len(names) > 1}

    if reused_quotes:
        print(f"\n⚠️  CRITICAL WARNING: {len(reused_quotes)} quotes are reused across multiple themes!")
        print(f"   This indicates the LLM is fabricating themes without proper evidence.")
        print(f"\n   Examples of reused quotes:")
        for quote, theme_names in list(reused_quotes.items())[:3]:
            print(f"     - \"{quote[:60]}...\"")
            print(f"       Used in: {', '.join(theme_names[:3])}")
        print(f"\n   ⚠️  WARNING: These results may not be reliable for publication.")
    else:
        print(f"   ✅ All quotes are unique - no reuse detected")


def discover_themes(all_excerpts: List[Dict[str, str]], output_dir: str) -> List[Theme]:
    """
    Perform thematic analysis on all collected excerpts.
    Discovers emergent themes across all patients.

    Args:
        all_excerpts: Excerpt dicts from extract_barrier_facilitator_mentions().
        output_dir: Directory where the raw LLM response is archived.

    Returns:
        Parsed Theme objects; empty list if the LLM call or parsing fails.
    """
    # One line per excerpt, tagged with its conversation and barrier/facilitator type.
    combined_excerpts = "\n".join(
        f"[{e['conversation_id']}] ({e['type']}) {e['quote']}"
        for e in all_excerpts
    )

    print("\n" + "="*80)
    print("DISCOVERING THEMES FROM PATIENT INTERVIEWS")
    print("="*80)
    print(f"Total excerpts to analyze: {len(all_excerpts)}")
    print(f"From {len(set(e['conversation_id'] for e in all_excerpts))} conversations")
    print("\nRunning thematic analysis (this may take 2-3 minutes)...")

    prompt = INDUCTIVE_ANALYSIS_PROMPT.format(combined_excerpts=combined_excerpts)

    try:
        response = ollama_generate(
            model="llama3.3:latest",
            prompt=prompt,
            num_ctx=32768,   # large context: all excerpts go into one prompt
            temperature=0.4, # some creativity helps theme discovery
            timeout=3600     # deep analysis over a big prompt can be slow
        )

        # Archive the raw response before parsing so failures can be debugged.
        # (The file keeps a .json name even though the model may return non-JSON.)
        with open(os.path.join(output_dir, "inductive_themes_RAW.json"), "w", encoding="utf-8") as f:
            f.write(response)

        data = json.loads(response)

        themes = [_theme_from_dict(td) for td in data.get("discovered_themes", [])]

        print(f"\n✅ Discovered {len(themes)} themes")
        print(f"   - Barriers: {sum(1 for t in themes if t.category_type == 'barrier')}")
        print(f"   - Facilitators: {sum(1 for t in themes if t.category_type == 'facilitator')}")

        # Validate that no quote is recycled across themes.
        print(f"\n🔍 Validating themes for quote reuse...")
        _report_quote_reuse(themes)

        # Warn if theme count is too low for a rigorous analysis.
        if len(themes) < 10:
            print(f"\n⚠️  WARNING: Only {len(themes)} themes discovered.")
            print(f"   Expected 15-25 themes for rigorous inductive analysis.")

        return themes

    except Exception as e:
        # Best-effort: report and return an empty result rather than crashing.
        print(f"\n❌ Thematic analysis failed: {e}")
        return []


def _write_theme_section(f, heading: str, section_themes: List[Theme]) -> None:
    """Write one report section: heading, then themes sorted by descending frequency."""
    f.write(f"{heading} ({len(section_themes)} themes)\n")
    f.write("-"*80 + "\n\n")

    for i, theme in enumerate(sorted(section_themes, key=lambda t: t.frequency, reverse=True), 1):
        f.write(f"{i}. {theme.theme_name} (n={theme.frequency} patients)\n")
        f.write(f"   {theme.theme_description}\n\n")
        f.write("   Supporting quotes:\n")
        for q in theme.patient_quotes[:5]:  # cap at 5 quotes per theme
            f.write(f"   - [{q['conversation_id']}] \"{q['quote']}\"\n")
        f.write("\n")


def save_inductive_results(themes: List[Theme], output_dir: str):
    """
    Save discovered themes in multiple formats:
      - inductive_themes.json         full structured data
      - inductive_themes_summary.csv  one row per theme, for spreadsheets
      - inductive_themes_REPORT.txt   human-readable report

    Args:
        themes: Parsed Theme objects from discover_themes().
        output_dir: Existing directory to write the three files into.
    """
    # 1. JSON with full quote detail.
    themes_json = [
        {
            "theme_name": t.theme_name,
            "theme_description": t.theme_description,
            "category_type": t.category_type,
            "frequency": t.frequency,
            "patient_quotes": t.patient_quotes,
        }
        for t in themes
    ]
    # Explicit UTF-8: patient quotes may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(os.path.join(output_dir, "inductive_themes.json"), "w", encoding="utf-8") as f:
        json.dump({"themes": themes_json}, f, indent=2)

    # 2. CSV summary (first 3 quotes joined for a quick preview column).
    themes_csv = [
        {
            "theme_name": t.theme_name,
            "category_type": t.category_type,
            "frequency": t.frequency,
            "description": t.theme_description,
            "sample_quotes": " || ".join(q["quote"] for q in t.patient_quotes[:3]),
        }
        for t in themes
    ]
    pd.DataFrame(themes_csv).to_csv(
        os.path.join(output_dir, "inductive_themes_summary.csv"),
        index=False
    )

    # 3. Human-readable report: barriers first, then facilitators.
    with open(os.path.join(output_dir, "inductive_themes_REPORT.txt"), "w", encoding="utf-8") as f:
        f.write("="*80 + "\n")
        f.write("INDUCTIVE THEMATIC ANALYSIS - BARRIERS & FACILITATORS\n")
        f.write("="*80 + "\n\n")

        barriers = [t for t in themes if t.category_type == "barrier"]
        _write_theme_section(f, "BARRIERS TO FOLLOW-UP ADHERENCE", barriers)

        facilitators = [t for t in themes if t.category_type == "facilitator"]
        f.write("\n" + "="*80 + "\n")
        _write_theme_section(f, "FACILITATORS TO FOLLOW-UP ADHERENCE", facilitators)

    print(f"\n✅ Inductive analysis results saved:")
    print(f"   - inductive_themes.json (detailed data)")
    print(f"   - inductive_themes_summary.csv (for spreadsheet analysis)")
    print(f"   - inductive_themes_REPORT.txt (human-readable report)")

def run_inductive_analysis(pdf_dir: str, output_dir: str):
    """
    Run the full inductive thematic analysis pipeline.

    Pipeline: load original PDF transcripts -> extract barrier/facilitator
    excerpts per conversation -> discover cross-patient themes -> save results.

    Args:
        pdf_dir: Directory containing original PDF transcripts
        output_dir: Where to save results
    """
    from pdf_text import load_folder, group_conversations, merge_conversation_text

    banner = "=" * 80
    print("\n" + banner)
    print("INDUCTIVE THEMATIC ANALYSIS - STARTING")
    print(banner)

    # Step 1: load and group the original PDF transcripts (not processed JSONs).
    print("\nStep 1: Loading original PDF transcripts...")
    documents = load_folder(pdf_dir)
    conversations = group_conversations(documents)

    print(f"✅ Loaded {len(conversations)} conversations from PDFs")

    # Step 2: pull barrier/facilitator excerpts out of each full transcript.
    print(f"\nStep 2: Extracting barrier/facilitator mentions from full transcripts...")

    collected_excerpts = []
    for conversation_id, segments in conversations:
        transcript_text = merge_conversation_text(segments)

        print(f"  📄 Processing {conversation_id} (~{len(transcript_text)} chars)...")

        mentions = extract_barrier_facilitator_mentions(transcript_text, conversation_id)
        collected_excerpts.extend(mentions)

        print(f"     ✅ Extracted {len(mentions)} relevant excerpts")

    print(f"\n✅ Total excerpts collected: {len(collected_excerpts)}")

    # Step 3: discover themes across all excerpts.
    themes = discover_themes(collected_excerpts, output_dir)

    # Step 4: persist results (skipped entirely when discovery failed).
    if not themes:
        print("\n❌ No themes discovered. Check error messages above.")
        return

    save_inductive_results(themes, output_dir)

    barrier_count = len([t for t in themes if t.category_type == 'barrier'])
    facilitator_count = len([t for t in themes if t.category_type == 'facilitator'])

    print("\n" + banner)
    print("INDUCTIVE ANALYSIS COMPLETE")
    print(banner)
    print(f"\nDiscovered {len(themes)} themes:")
    print(f"  - {barrier_count} barrier themes")
    print(f"  - {facilitator_count} facilitator themes")
    print("\nReview the generated files:")
    print("  1. inductive_themes_REPORT.txt - Human-readable thematic analysis")
    print("  2. inductive_themes_summary.csv - For spreadsheet/analysis")
    print("  3. inductive_themes.json - Detailed structured data")


if __name__ == "__main__":
    # Hard-coded local paths for this analysis environment.
    PDF_INPUT_DIR = "/home/sandhiya/dr-transcripts-inductive/data/pdfs"
    RESULTS_DIR = "/home/sandhiya/dr-transcripts-inductive/outputs"

    # Ensure the output directory exists before the pipeline writes into it.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    run_inductive_analysis(PDF_INPUT_DIR, RESULTS_DIR)