"""Tests for PDF processing functionality."""

import os

import fitz  # PyMuPDF
import pytest

from app.core.chunking import chunk_text
from app.core.pdf_processor import PDFProcessor

# Text written into the sample PDF; the extraction tests look for a
# fragment of it.
_SAMPLE_TEXT = "This is a test document. It contains some sample text for testing."


@pytest.fixture
def sample_pdf_path(tmp_path):
    """Create a one-page sample PDF for testing and return its path.

    The file is written under pytest's per-test ``tmp_path`` directory and
    the path is returned as a ``str``, which is the form the tests hand to
    ``PDFProcessor``.
    """
    # Convert once so both PyMuPDF and the tests receive a plain string.
    pdf_path = str(tmp_path / "test.pdf")
    doc = fitz.open()  # called without arguments to start a new document
    try:
        page = doc.new_page()
        page.insert_text((50, 50), _SAMPLE_TEXT)
        doc.save(pdf_path)
    finally:
        # Release the document even if building or saving it fails.
        doc.close()
    return pdf_path


def test_pdf_processor_extract_text(sample_pdf_path):
    """Test PDF text extraction."""
    processor = PDFProcessor()
    text = processor.extract_text(sample_pdf_path)

    assert text is not None
    assert len(text) > 0
    # Compare case-insensitively against a fragment of the inserted text.
    assert "test document" in text.lower()


def test_pdf_processor_extract_pages(sample_pdf_path):
    """Test PDF page extraction."""
    processor = PDFProcessor()
    pages = processor.extract_pages(sample_pdf_path)

    # The fixture creates exactly one page.
    assert len(pages) == 1

    first_page = pages[0]
    assert first_page["page_number"] == 1
    assert len(first_page["text"]) > 0
    assert "metadata" in first_page


def test_chunking():
    """Test text chunking."""
    text = "This is a test. " * 100  # Create a longer text
    max_tokens = 50
    chunks = chunk_text(text, max_tokens=max_tokens, overlap=10)

    # Guard against an empty result so the per-chunk checks below cannot
    # pass vacuously.
    assert len(chunks) > 0

    # Check chunks one at a time so a failure identifies the offending chunk.
    for chunk in chunks:
        assert "chunk_text" in chunk
        assert "token_count" in chunk
        assert chunk["token_count"] <= max_tokens


def test_empty_pdf(tmp_path):
    """Test that extracting text from a missing file yields ``None``.

    Despite its name, this test exercises a path that does not exist rather
    than an empty PDF.
    """
    processor = PDFProcessor()
    # Anchor the path under tmp_path so the file is guaranteed not to exist,
    # regardless of the directory the test run starts from.
    missing_path = tmp_path / "nonexistent.pdf"
    text = processor.extract_text(str(missing_path))

    assert text is None