"""
utils/fuzzy_match.py
Fuzzy quote matching logic — ported from fixed 10_generate_llm_justifications.py.
Used by the justifier pipeline step to find evidence snippets.
"""
import re
from difflib import SequenceMatcher


def clean_text(s: str) -> str:
    """Return *s* with typographic punctuation ASCII-fied and whitespace collapsed.

    ``None`` yields an empty string; any other value is passed through
    ``str()`` first.  En/em dashes become ``-``, curly single quotes become
    ``'`` and curly double quotes become ``"``.  Every run of whitespace is
    then reduced to a single space and leading/trailing whitespace is dropped.
    """
    if s is None:
        return ""
    ascii_equivalents = str.maketrans({
        "\u2013": "-",
        "\u2014": "-",
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
    })
    normalised = str(s).translate(ascii_equivalents)
    return " ".join(normalised.split())


def strip_for_fuzzy(s: str) -> str:
    """Reduce *s* to lower-case, space-separated word tokens for loose matching.

    Applied in order: parenthesised asides are dropped, speaker labels such as
    ``Speaker 3:`` or ``S3:`` are dropped (case-insensitively), ellipses and
    every remaining non-word, non-space character become spaces, whitespace
    runs are collapsed, and the result is lower-cased.
    """
    stripped = s
    # Order matters: asides go first so a label split by one is still caught.
    for pattern, flags in (
        (r'\(.*?\)', 0),
        (r'\b(?:Speaker\s*\d+|S\d+)\s*:', re.IGNORECASE),
    ):
        stripped = re.sub(pattern, ' ', stripped, flags=flags)
    stripped = stripped.replace("...", " ")
    tokens = re.sub(r'[^\w\s]', ' ', stripped).split()
    return " ".join(tokens).lower()


# Patterns mirrored from strip_for_fuzzy(); keep the two in sync.  Here each
# one is applied as a same-length mask so that character offsets into the
# original text survive normalisation.
_PAREN_RE = re.compile(r'\(.*?\)')
_SPEAKER_RE = re.compile(r'\b(?:Speaker\s*\d+|S\d+)\s*:', re.IGNORECASE)
_PUNCT_RE = re.compile(r'[^\w\s]')
_COMMA_DASH_RE = re.compile(r'[,\-]')


def _blank(match):
    """Return spaces of the same length as *match* (offset-preserving removal)."""
    return " " * len(match.group())


def _fold_with_offsets(masked: str, collapse: bool = True):
    """Lower-case *masked* and optionally collapse its whitespace.

    Returns ``(folded, offsets)`` where ``offsets[i]`` is the index in
    *masked* of the character that produced ``folded[i]``.  Lower-casing is
    done per character because ``str.lower()`` may expand one code point into
    several, which would otherwise shift every later index.
    """
    out = []
    offsets = []
    gap = False  # a whitespace run is pending between two emitted tokens
    for pos, ch in enumerate(masked):
        if collapse and ch.isspace():
            gap = bool(out)  # leading whitespace is dropped entirely
            continue
        if gap:
            out.append(" ")
            offsets.append(pos - 1)  # masked[pos - 1] is part of the gap
            gap = False
        for low in ch.lower():
            out.append(low)
            offsets.append(pos)
    return "".join(out), offsets


def _strip_with_offsets(text: str):
    """Offset-tracking counterpart of strip_for_fuzzy(); returns (stripped, offsets)."""
    masked = _PAREN_RE.sub(_blank, text)
    masked = _SPEAKER_RE.sub(_blank, masked)
    # Masking all punctuation also covers the "..." replacement.
    masked = _PUNCT_RE.sub(" ", masked)
    return _fold_with_offsets(masked)


def _original_span(offsets, start: int, length: int):
    """Map a normalised match ``[start, start + length)`` to original (begin, end)."""
    if offsets is None:  # normalisation kept every character in place
        return start, start + length
    return offsets[start], offsets[start + length - 1] + 1


def find_evidence_snippet(cleaned_text: str, quote: str, window_chars: int = 280,
                          fuzzy_threshold: float = 0.70):
    """
    Locate *quote* inside *cleaned_text* and return surrounding context.

    Returns (found_bool, evidence_snippet, match_index).
    Tries 6 levels of matching from exact to fuzzy:

      1. case-insensitive substring
      2. quote with dashes turned into spaces
      3. quote with doubled commas collapsed
      4. commas and dashes removed from both quote and text
      5. both sides reduced to bare lower-case words (see strip_for_fuzzy)
      6. best sliding word-window whose SequenceMatcher ratio is at least
         *fuzzy_threshold*

    ``match_index`` is always an index into *cleaned_text* (the start of the
    match), and ``evidence_snippet`` is the matched span of *cleaned_text*
    extended by up to *window_chars* characters on each side.  When nothing
    matches the result is ``(False, "", -1)``.
    """
    text = cleaned_text or ""
    q = clean_text(quote)

    if not text or not q:
        return False, "", -1

    def hit(offsets, start, length):
        begin, end = _original_span(offsets, start, length)
        return True, text[max(0, begin - window_chars):end + window_chars], begin

    # Levels 1-3 search the lower-cased text directly.  A position map is only
    # needed in the rare case where lower-casing changed the text's length.
    low_text, low_q = text.lower(), q.lower()
    low_offsets = None
    if len(low_text) != len(text):
        low_text, low_offsets = _fold_with_offsets(text, collapse=False)
        low_q, _ = _fold_with_offsets(q, collapse=False)

    exact_variants = (
        low_q,                                                            # 1: direct
        " ".join(low_q.replace("--", " ").replace("-", " ").split()),     # 2: no dashes
        " ".join(low_q.replace(", ,", ",").replace(",,", ",").split()),   # 3: no double-commas
    )
    for variant in exact_variants:
        if not variant:
            # find("") is always 0; an empty variant must not count as a hit.
            continue
        pos = low_text.find(variant)
        if pos != -1:
            return hit(low_offsets, pos, len(variant))

    # Level 4: remove all commas and dashes from both sides.
    q4, _ = _fold_with_offsets(_COMMA_DASH_RE.sub(" ", q))
    if q4:
        text4, offsets4 = _fold_with_offsets(_COMMA_DASH_RE.sub(" ", text))
        pos = text4.find(q4)
        if pos != -1:
            return hit(offsets4, pos, len(q4))

    # Level 5: strip asides, speaker labels and all punctuation.
    q5, _ = _strip_with_offsets(q)
    text5, offsets5 = _strip_with_offsets(text)
    if not q5 or not text5:
        # Nothing left to compare; two empty strings would score ratio 1.0.
        return False, "", -1
    pos = text5.find(q5)
    if pos != -1:
        return hit(offsets5, pos, len(q5))

    # Level 6: fuzzy sliding window over the stripped words.
    words = text5.split()
    win_size = max(len(q5.split()), 5)

    # Start of every word inside text5 (words are single-space separated).
    word_starts = []
    cursor = 0
    for word in words:
        word_starts.append(cursor)
        cursor += len(word) + 1

    best_ratio, best_i = 0.0, -1
    for i in range(max(1, len(words) - win_size + 1)):
        window = " ".join(words[i:i + win_size])
        matcher = SequenceMatcher(None, q5, window)
        # quick_ratio() is an upper bound on ratio(): skip windows that can
        # neither reach the threshold nor beat the current best.
        upper = matcher.quick_ratio()
        if upper < fuzzy_threshold or upper <= best_ratio:
            continue
        ratio = matcher.ratio()
        if ratio > best_ratio:
            best_ratio, best_i = ratio, i

    if best_i != -1 and best_ratio >= fuzzy_threshold:
        last = min(best_i + win_size, len(words)) - 1
        norm_start = word_starts[best_i]
        norm_len = word_starts[last] + len(words[last]) - norm_start
        return hit(offsets5, norm_start, norm_len)

    return False, "", -1
