import pdfplumber
import os
from pathlib import Path
from utils import setup_logging

logger = setup_logging()

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Pages are processed in document order and each page's text is followed
    by a single newline.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page text, or ``None`` if opening the file or
        extracting any page raised an exception (the error is logged).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page with no extractable
            # text (e.g. a scanned image), depending on the pdfplumber
            # version. Treat that as an empty page so one blank page does
            # not cause the whole document to be reported as failed.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Deliberately broad: any failure for one file is logged and turned
        # into a None result so the caller can continue with other files.
        logger.error(f"Error extracting {pdf_path}: {str(e)}")
        return None

    return "".join(f"{page_text}\n" for page_text in page_texts)

def process_all_pdfs(input_dir, output_dir):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to a ``.txt`` file.

    The output directory is created if it does not exist. For each PDF the
    text is obtained with ``extract_text_from_pdf``; when that yields a
    non-empty string it is written to ``<output_dir>/<stem>.txt`` as UTF-8.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory that receives the generated text files.

    Returns:
        A dict keyed by each PDF's file stem. Successful entries hold
        ``'status': 'success'``, the text ``'length'`` and the ``'output'``
        path as a string; entries whose extraction returned ``None`` or an
        empty string hold only ``'status': 'failed'``.
    """
    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    sources = [*Path(input_dir).glob('*.pdf')]
    logger.info(f"Found {len(sources)} PDF files")

    summary = {}
    for source in sources:
        logger.info(f"Processing: {source.name}")
        extracted = extract_text_from_pdf(source)

        # None (extraction error) and "" (nothing extracted) both count as
        # a failure and produce no output file.
        if not extracted:
            summary[source.stem] = {'status': 'failed'}
            continue

        destination = target_root / f"{source.stem}.txt"
        destination.write_text(extracted, encoding='utf-8')

        summary[source.stem] = dict(
            status='success',
            length=len(extracted),
            output=str(destination),
        )

    return summary

if __name__ == "__main__":
    # Script entry point: convert the PDFs under data/raw into text files
    # under data/processed, then log how many were handled and how many
    # of those succeeded.
    input_dir, output_dir = "data/raw", "data/processed"

    results = process_all_pdfs(input_dir, output_dir)

    success_count = 0
    for outcome in results.values():
        if outcome['status'] == 'success':
            success_count += 1

    logger.info(f"Processed {len(results)} files")
    logger.info(f"Successful: {success_count}")