#!/usr/bin/env python3
"""
Test pipeline with just 2 samples
"""
import sys
from pathlib import Path
from utils import setup_logging

# Module-wide logger used by the pipeline below; configured by utils.setup_logging().
logger = setup_logging()

def run_test_pipeline():
    """Drive the three-stage test pipeline over the test data directories.

    The stages run in order: PDF extraction, transcript preprocessing
    (chunking), and deductive coding. Progress and results are reported
    through the module-level ``logger``. The run is intended for a small
    (two-sample) test set; the function itself processes whatever is found
    in the raw test directory.

    Returns:
        The value returned by ``process_all_transcripts_deductive`` (the
        extracted codes), or ``None`` when preprocessing left no ``*.json``
        files in the chunked directory and the coding stage was skipped.
    """
    # Stage entry points are imported at call time rather than module import time.
    from scripts.import_01_pdf_extraction import process_all_pdfs
    from scripts.import_02_preprocessing import preprocess_all_transcripts
    from scripts.import_03_deductive_coding import process_all_transcripts_deductive

    # Test-only locations, kept separate from the real data/output trees.
    raw_dir = 'data/raw_test'
    processed_dir = 'data/processed_test'
    chunked_dir = 'data/processed_test/chunked'
    codes_dir = 'outputs_test/codes'

    # Make sure every location exists before any stage touches it.
    for folder in (raw_dir, processed_dir, chunked_dir, codes_dir):
        Path(folder).mkdir(parents=True, exist_ok=True)

    banner = "=" * 60
    logger.info(banner)
    logger.info("STARTING TEST PIPELINE - 2 SAMPLES")
    logger.info(banner)

    # Stage 1: pull text out of the PDFs.
    logger.info("\n[1/3] Extracting PDFs...")
    extracted = process_all_pdfs(raw_dir, processed_dir)
    logger.info(f"✓ Extracted {len(extracted)} PDFs")

    # Stage 2: preprocess the extracted transcripts into chunks.
    logger.info("\n[2/3] Preprocessing transcripts...")
    preprocess_all_transcripts(processed_dir, chunked_dir, chunk_size=20)

    # The coding stage needs at least one chunked JSON file to work on.
    chunk_paths = list(Path(chunked_dir).glob('*.json'))
    logger.info(f"✓ Created {len(chunk_paths)} chunked files")
    if not chunk_paths:
        logger.error("ERROR: No chunked files created!")
        return

    # Stage 3: deductive coding over the chunked transcripts.
    logger.info("\n[3/3] Running deductive coding...")
    codes = process_all_transcripts_deductive(
        chunked_dir,
        codes_dir,
        model='llama3.3',
    )

    logger.info(banner)
    logger.info("TEST PIPELINE COMPLETE")
    logger.info(banner)
    logger.info(f"Total codes extracted: {len(codes)}")

    if len(codes) == 0:
        logger.warning("WARNING: No codes extracted! Check LLM responses in logs.")
        return codes

    # Preview at most the first five codes.
    logger.info("\nSample codes:")
    for position, entry in enumerate(codes[:5], start=1):
        label = entry.get('code', 'N/A')
        category = entry.get('category', 'N/A')
        logger.info(f"{position}. {label} - {category}")

    return codes

# Script entry point: run the test pipeline and keep its result in a
# module-level name (None if the run stopped before the coding stage).
if __name__ == "__main__":
    codes = run_test_pipeline()