"""
models/transcript.py
Each uploaded file becomes one Transcript record.
raw_text      = extracted directly from PDF/DOCX (before cleaning)
cleaned_text  = after 02_preprocessing clean_transcript()
chunks        = JSONB array of text chunks (same as chunked JSON full_text split)
"""
import enum
import uuid
from datetime import datetime

from sqlalchemy import Column, String, Text, DateTime, ForeignKey, Enum as SAEnum
from sqlalchemy.dialects.postgresql import UUID, JSONB
from sqlalchemy.orm import relationship

from app.database import Base


class TranscriptStatus(str, enum.Enum):
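    """Processing status of a transcript as it moves through the pipeline; "failed" records an error (see error_message)."""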
    uploaded     = "uploaded"
    extracted    = "extracted"
    preprocessed = "preprocessed"
    coded        = "coded"
    complete     = "complete"
    failed       = "failed"


class Transcript(Base):
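    """One row per uploaded file, holding the raw, cleaned, and chunked text plus pipeline status."""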
    __tablename__ = "transcripts"

    id            = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    project_id    = Column(UUID(as_uuid=True), ForeignKey("projects.id"), nullable=False)
    original_filename = Column(String, nullable=False)
    file_path     = Column(String, nullable=False)       # path on DGX storage
    file_type     = Column(String, nullable=True)        # pdf, docx, txt, mp3, mp4
    raw_text      = Column(Text, nullable=True)
    cleaned_text  = Column(Text, nullable=True)
    chunks        = Column(JSONB, nullable=True)          # list of chunk strings
    status        = Column(SAEnum(TranscriptStatus), default=TranscriptStatus.uploaded)
    error_message = Column(Text, nullable=True)          # if failed, store reason
    created_at    = Column(DateTime, default=datetime.utcnow)
    updated_at    = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    project      = relationship("Project", back_populates="transcripts")
    code_results = relationship("CodeResult", back_populates="transcript", cascade="all, delete-orphan")

    def __repr__(self):
        status = self.status.value if self.status else "unset"
        return f"<Transcript {self.original_filename} [{status}]>"
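

# Example pipeline usage (a minimal, commented-out sketch). `SessionLocal`, the import
# path `app.models.transcript`, the hypothetical `extract_text()` helper, and the
# 02_preprocessing functions `clean_transcript()` / `chunk_text()` are assumptions drawn
# from the module docstring; the real names and signatures in this codebase may differ.
#
#   from app.database import SessionLocal
#   from app.models.transcript import Transcript, TranscriptStatus
#
#   with SessionLocal() as db:
#       t = Transcript(
#           project_id=project.id,                       # an existing Project row
#           original_filename="interview_01.docx",
#           file_path="/data/uploads/interview_01.docx",
#           file_type="docx",
#       )
#       db.add(t)
#       db.commit()                                      # status defaults to "uploaded"
#
#       t.raw_text = extract_text(t.file_path)           # hypothetical extractor
#       t.status = TranscriptStatus.extracted
#       t.cleaned_text = clean_transcript(t.raw_text)
#       t.chunks = chunk_text(t.cleaned_text)            # list[str] stored as JSONB
#       t.status = TranscriptStatus.preprocessed
#       db.commit()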
