diff --git a/README.md b/README.md
index 3309f26..70c5491 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# LLMLoader
+# SmartLLMLoader

-llm-loader is a lightweight yet powerful Python package that transforms any document into LLM-ready chunks. It handles the entire document processing pipeline:
+smart-llm-loader is a lightweight yet powerful Python package that transforms any document into LLM-ready chunks. It handles the entire document processing pipeline:

 - 📄 Converts documents to clean markdown
 - 🔍 Built-in OCR for scanned documents and images
@@ -9,9 +9,9 @@ llm-loader is a lightweight yet powerful Python package that transforms any docu
 - 📦 Ready for vector stores and LLM ingestion

 Spend less time on preprocessing headaches and more time building what matters. From RAG systems to chatbots to document Q&A,
-LLMLoader handles the heavy lifting so you can focus on creating exceptional AI applications.
+SmartLLMLoader handles the heavy lifting so you can focus on creating exceptional AI applications.

-LLMLoader's chunking approach has been benchmarked against traditional methods, showing superior performance particularly when paired with Google's Gemini Flash model. This combination offers an efficient and cost-effective solution for document chunking in RAG systems. View the detailed performance comparison [here](https://www.sergey.fyi/articles/gemini-flash-2).
+SmartLLMLoader's chunking approach has been benchmarked against traditional methods, showing superior performance, particularly when paired with Google's Gemini Flash model. This combination offers an efficient and cost-effective solution for document chunking in RAG systems. View the detailed performance comparison [here](https://www.sergey.fyi/articles/gemini-flash-2).

 ## Features
@@ -45,24 +45,24 @@ brew install poppler

 ### Package Installation

-You can install LLMLoader using pip:
+You can install SmartLLMLoader using pip:

 ```bash
-pip install llm-loader
+pip install smart-llm-loader
 ```

 Or using Poetry:

 ```bash
-poetry add llm-loader
+poetry add smart-llm-loader
 ```

 ## Quick Start

-llm-loader package uses litellm to call the LLM so any arguments supported by litellm can be used. You can find the litellm documentation [here](https://docs.litellm.ai/docs/providers).
+The smart-llm-loader package uses litellm to call the LLM, so any arguments supported by litellm can be used. You can find the litellm documentation [here](https://docs.litellm.ai/docs/providers).
 You can use any multi-modal model supported by litellm.

 ```python
-from llm_loader import LLMLoader
+from smart_llm_loader import SmartLLMLoader

 # Using Gemini Flash model
@@ -79,7 +79,7 @@ model = "anthropic/claude-3-5-sonnet"


 # Initialize the document loader
-loader = LLMLoader(
+loader = SmartLLMLoader(
     file_path="your_document.pdf",
     chunk_strategy="contextual",
     model=model,
@@ -91,7 +91,7 @@ documents = loader.load_and_split()
 ## Parameters

 ```python
-class LLMLoader(BaseLoader):
+class SmartLLMLoader(BaseLoader):
     """A flexible document loader that supports multiple input types."""

     def __init__(
@@ -110,7 +110,7 @@

 ## Comparison with Traditional Methods

-Let's see LLMLoader in action! We'll compare it with PyMuPDF (a popular traditional document loader) to demonstrate why LLMLoader's intelligent chunking makes such a difference in real-world applications.
+Let's see SmartLLMLoader in action! We'll compare it with PyMuPDF (a popular traditional document loader) to demonstrate why SmartLLMLoader's intelligent chunking makes such a difference in real-world applications.

 ### The Challenge: Processing an Invoice
 We'll process this sample invoice that includes headers, tables, and complex formatting:

@@ -119,8 +119,8 @@ We'll process this sample invoice that includes headers, tables, and complex for

 ### Head-to-Head Comparison

-#### 1. LLMLoader Output
-LLMLoader intelligently breaks down the document into semantic chunks, preserving structure and meaning (note that the json output below has been formatted for readability):
+#### 1. SmartLLMLoader Output
+SmartLLMLoader intelligently breaks down the document into semantic chunks, preserving structure and meaning (note that the JSON output below has been formatted for readability):

 ```json
 [
@@ -232,7 +232,7 @@ Let's see how this difference affects a real Question-Answering system:

 ```python
 question = "What is the total gross worth for item 1 and item 7?"

-# LLMLoader Result ✅
+# SmartLLMLoader Result ✅
 "The total gross worth for item 1 (Lilly Pulitzer dress) is $247.50 and for item 7
 (J.Crew Collection sweater dress) is $33.00.
 Total: $280.50"

@@ -242,7 +242,7 @@ Total: $78.00"
 ```

-**Why LLMLoader Won:**
+**Why SmartLLMLoader Won:**
 - 🎯 Maintained table structure
 - 💡 Preserved relationships between data
 - 📊 Accurate calculations
diff --git a/examples/data/test_ocr_doc.png b/examples/data/test_ocr_doc.png
new file mode 100644
index 0000000..1686db5
Binary files /dev/null and b/examples/data/test_ocr_doc.png differ
diff --git a/examples/ocr_example.py b/examples/ocr_example.py
index cfc519e..c542354 100644
--- a/examples/ocr_example.py
+++ b/examples/ocr_example.py
@@ -1,10 +1,10 @@
 """
-Example usage of different document loaders (llm_loader and PyMuPDF) for RAG applications.
+Example usage of different document loaders (smart-llm-loader and PyMuPDF) for RAG applications.
 """
 import os
 from dotenv import load_dotenv

-from llm_loader.document_loader import LLMLoader
+from smart_llm_loader import SmartLLMLoader

 # Load environment variables
 load_dotenv()
@@ -13,14 +13,14 @@
 os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

 # Gemini API key since we are using the gemini flash model
-os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI"
+os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY"


 def process_with_llmloader():
-    """Process documents using LLMLoader with Gemini Flash."""
+    """Process documents using SmartLLMLoader with Gemini Flash."""

-    # Initialize the loader from the llm_loader package
-    loader = LLMLoader(
+    # Initialize the loader from the smart-llm-loader package
+    loader = SmartLLMLoader(
         file_path="./data/test_ocr_doc.pdf",
         chunk_strategy="contextual",
         model="gemini/gemini-1.5-flash",
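Worth noting for downstream users of this rename: the examples now import the loader from the package root rather than from the `document_loader` module. A minimal before/after sketch of the migration (the old path is taken from the removed lines above; the file path is the illustrative one from the README):

```python
# Before the rename, as shown in the removed lines:
# from llm_loader.document_loader import LLMLoader
# loader = LLMLoader(file_path="your_document.pdf")

# After the rename, the loader is re-exported at the package root:
from smart_llm_loader import SmartLLMLoader

loader = SmartLLMLoader(file_path="your_document.pdf")
```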
""" import os from dotenv import load_dotenv from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.chat_models import ChatOpenAI from langchain_community.document_loaders import PyMuPDFLoader -from langchain_community.embeddings import OpenAIEmbeddings from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import PromptTemplate from langchain_core.runnables import RunnablePassthrough +from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_community.vectorstores import FAISS -from llm_loader.document_loader import LLMLoader -from langchain_core.prompts import PromptTemplate + +from smart_llm_loader import SmartLLMLoader # Load environment variables load_dotenv() @@ -20,7 +20,7 @@ os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" # Gemini API key since we are using the gemini flash model -os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI" +os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY" def create_rag_chain(retriever, llm): @@ -48,11 +48,11 @@ def format_docs(docs): def process_with_llmloader(): - """Process documents using LLMLoader with Gemini Flash.""" + """Process documents using SmartLLMLoader with Gemini Flash.""" llm = ChatOpenAI(model="gpt-4o-mini") - # Initialize the loader from the llm_loader package - loader = LLMLoader( + # Initialize the loader from the smart-llm-loader package + loader = SmartLLMLoader( file_path="./data/test_ocr_doc.pdf", chunk_strategy="contextual", model="gemini/gemini-1.5-flash", diff --git a/pyproject.toml b/pyproject.toml index e7cc94d..2c8c589 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [tool.poetry] -name = "llm-loader" +name = "smart-llm-loader" version = "0.1.0" description = "A powerful PDF processing toolkit that seamlessly integrates with LLMs for intelligent document chunking and RAG applications. Features smart context-aware segmentation, multi-LLM support, and optimized content extraction for enhanced RAG performance." 
authors = ["drmingler "] readme = "README.md" -packages = [{include = "llm_loader"}] +packages = [{include = "smart_llm_loader"}] license = "MIT" -repository = "https://github.com/drmingler/llm-loader" +repository = "https://github.com/drmingler/smart-llm-loader" keywords = ["pdf", "llm", "rag", "document-processing", "ai"] classifiers = [ "Development Status :: 4 - Beta", @@ -44,5 +44,5 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["test_*.py"] -addopts = "-v --cov=llm_loader --cov-report=term-missing" +addopts = "-v --cov=smart_llm_loader --cov-report=term-missing" asyncio_mode = "auto" \ No newline at end of file diff --git a/llm_loader/__init__.py b/smart_llm_loader/__init__.py similarity index 97% rename from llm_loader/__init__.py rename to smart_llm_loader/__init__.py index 3688e2a..0f7ea2a 100644 --- a/llm_loader/__init__.py +++ b/smart_llm_loader/__init__.py @@ -38,6 +38,6 @@ def _check_poppler_installation(): _check_poppler_installation() # Import main package components -from .document_loader import LLMLoader +from .document_loader import SmartLLMLoader __version__ = "0.1.0" diff --git a/llm_loader/document_loader.py b/smart_llm_loader/document_loader.py similarity index 97% rename from llm_loader/document_loader.py rename to smart_llm_loader/document_loader.py index 06a96ab..9310063 100644 --- a/llm_loader/document_loader.py +++ b/smart_llm_loader/document_loader.py @@ -9,11 +9,11 @@ from langchain_core.documents import Document import requests -from llm_loader.llm import ImageProcessor, LLMProcessing -from llm_loader.utils import copy_file, save_output_file, is_pdf +from smart_llm_loader.llm import ImageProcessor, LLMProcessing +from smart_llm_loader.utils import copy_file, save_output_file, is_pdf -class LLMLoader(BaseLoader): +class SmartLLMLoader(BaseLoader): """A flexible document loader that supports multiple input types.""" def __init__( diff --git a/llm_loader/llm.py b/smart_llm_loader/llm.py similarity index 97% rename from llm_loader/llm.py rename to smart_llm_loader/llm.py index 3a1a7ad..478dccc 100644 --- a/llm_loader/llm.py +++ b/smart_llm_loader/llm.py @@ -10,9 +10,9 @@ from pdf2image import convert_from_path from litellm import completion, validate_environment, supports_vision, check_valid_key, acompletion -from llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT -from llm_loader.schema import OCRResponse -from llm_loader.utils import save_output_file +from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT +from smart_llm_loader.schema import OCRResponse +from smart_llm_loader.utils import save_output_file class ImageProcessor: diff --git a/llm_loader/prompts.py b/smart_llm_loader/prompts.py similarity index 100% rename from llm_loader/prompts.py rename to smart_llm_loader/prompts.py diff --git a/llm_loader/schema.py b/smart_llm_loader/schema.py similarity index 100% rename from llm_loader/schema.py rename to smart_llm_loader/schema.py diff --git a/llm_loader/utils.py b/smart_llm_loader/utils.py similarity index 100% rename from llm_loader/utils.py rename to smart_llm_loader/utils.py diff --git a/tests/test_document_loader.py b/tests/test_document_loader.py index e7e7bdf..20c6dba 100644 --- a/tests/test_document_loader.py +++ b/tests/test_document_loader.py @@ -4,15 +4,15 @@ from unittest.mock import Mock from langchain_core.documents import Document -from llm_loader.document_loader import LLMLoader +from smart_llm_loader.document_loader import 


 @pytest.fixture(autouse=True)
 def mock_llm_validation(mocker):
     """Mock LLM validation for all tests."""
-    mocker.patch('llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
-    mocker.patch('llm_loader.llm.supports_vision', return_value=True)
-    mocker.patch('llm_loader.llm.check_valid_key', return_value=True)
+    mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
+    mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
+    mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)


 @pytest.fixture
@@ -31,7 +31,7 @@ def mock_response():

 def test_init_with_file_path(sample_pdf_path):
-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)

     assert str(loader.file_path) == str(sample_pdf_path)
     assert loader.output_dir is None
@@ -42,23 +42,23 @@ def test_init_with_url(mocker, mock_response):

     with tempfile.NamedTemporaryFile(suffix='.pdf') as temp_file:
         mocker.patch('tempfile.NamedTemporaryFile', return_value=temp_file)
-        loader = LLMLoader(url=url)
+        loader = SmartLLMLoader(url=url)

         assert isinstance(loader.file_path, Path)


 def test_init_with_both_file_and_url(sample_pdf_path):
     with pytest.raises(ValueError, match=r"Only one of file_path or url should be provided\."):
-        LLMLoader(file_path=sample_pdf_path, url="http://example.com/test.pdf")
+        SmartLLMLoader(file_path=sample_pdf_path, url="http://example.com/test.pdf")


 def test_init_with_neither_file_nor_url():
     with pytest.raises(ValueError, match=r"Either file_path or url must be provided\."):
-        LLMLoader()
+        SmartLLMLoader()


 def test_load_from_path_with_output_dir(sample_pdf_path, tmp_path):
     output_dir = tmp_path / "output"
-    loader = LLMLoader(file_path=sample_pdf_path, save_output=True, output_dir=output_dir)
+    loader = SmartLLMLoader(file_path=sample_pdf_path, save_output=True, output_dir=output_dir)

     assert loader.output_dir == output_dir
     assert (output_dir / sample_pdf_path.name).exists()
@@ -72,14 +72,14 @@ def test_load_from_url_invalid_content(mocker):
     mocker.patch('requests.get', return_value=mock_resp)

     with pytest.raises(ValueError, match=r"The URL does not point to a PDF file\."):
-        LLMLoader(url=url)
+        SmartLLMLoader(url=url)


 def test_load_method(mocker, sample_pdf_path):
     mock_documents = [Document(page_content="Test content")]
-    mocker.patch('llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
+    mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)

-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)
     documents = loader.load()

     assert len(documents) == 1
@@ -89,9 +89,9 @@ def test_load_method(mocker, sample_pdf_path):

 @pytest.mark.asyncio
 async def test_aload_method(mocker, sample_pdf_path):
     mock_documents = [Document(page_content="Test content")]
-    mocker.patch('llm_loader.llm.LLMProcessing.async_process_document_with_llm', return_value=mock_documents)
+    mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_document_with_llm', return_value=mock_documents)

-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)
     documents = await loader.aload()

     assert len(documents) == 1
@@ -100,9 +100,9 @@ async def test_aload_method(mocker, sample_pdf_path):

 def test_load_and_split_method(mocker, sample_pdf_path):
     mock_documents = [Document(page_content="Test content")]
-    mocker.patch('llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
+    mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)

-    loader = LLMLoader(file_path=sample_pdf_path, chunk_strategy="contextual")
+    loader = SmartLLMLoader(file_path=sample_pdf_path, chunk_strategy="contextual")
     documents = loader.load_and_split()

     assert len(documents) == 1
@@ -110,7 +110,7 @@ def test_load_and_split_method(mocker, sample_pdf_path):

 def test_create_document(sample_pdf_path):
-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)
     chunk = {"content": "Test content", "theme": "Test theme"}
     page_num = 1

@@ -128,10 +128,10 @@ def test_lazy_load(mocker, sample_pdf_path):
     mock_images = [Mock()]
     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}

-    mocker.patch('llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
-    mocker.patch('llm_loader.llm.LLMProcessing.process_image_with_llm', return_value=mock_result)
+    mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
+    mocker.patch('smart_llm_loader.llm.LLMProcessing.process_image_with_llm', return_value=mock_result)

-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)
     documents = list(loader.lazy_load())

     assert len(documents) == 1
@@ -145,10 +145,10 @@ async def test_alazy_load(mocker, sample_pdf_path):
     mock_images = [Mock()]
     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}

-    mocker.patch('llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
-    mocker.patch('llm_loader.llm.LLMProcessing.async_process_image_with_llm', return_value=mock_result)
+    mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
+    mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_image_with_llm', return_value=mock_result)

-    loader = LLMLoader(file_path=sample_pdf_path)
+    loader = SmartLLMLoader(file_path=sample_pdf_path)
     documents = [doc async for doc in loader.alazy_load()]

     assert len(documents) == 1
diff --git a/tests/test_image_processor.py b/tests/test_image_processor.py
index 79cfbbf..4edf658 100644
--- a/tests/test_image_processor.py
+++ b/tests/test_image_processor.py
@@ -3,7 +3,7 @@ import io
 import base64

-from llm_loader.llm import ImageProcessor
+from smart_llm_loader.llm import ImageProcessor


 @pytest.fixture
@@ -24,7 +24,7 @@ def sample_image():
 def test_pdf_to_images(sample_pdf_path, mocker):
     # Mock pdf2image.convert_from_path
     mock_images = [Image.new('RGB', (100, 100)) for _ in range(2)]
-    mocker.patch('llm_loader.llm.convert_from_path', return_value=mock_images)
+    mocker.patch('smart_llm_loader.llm.convert_from_path', return_value=mock_images)

     images = ImageProcessor.pdf_to_images(sample_pdf_path)
diff --git a/tests/test_llm_processing.py b/tests/test_llm_processing.py
index eaa9f02..5d59014 100644
--- a/tests/test_llm_processing.py
+++ b/tests/test_llm_processing.py
@@ -3,16 +3,16 @@ from langchain_core.documents import Document
 from unittest.mock import Mock

-from llm_loader.llm import LLMProcessing
-from llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT
+from smart_llm_loader.llm import LLMProcessing
+from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT


 @pytest.fixture
 def llm_processor(mocker):
     # Mock all validation functions
-    mocker.patch('llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
-    mocker.patch('llm_loader.llm.supports_vision', return_value=True)
-    mocker.patch('llm_loader.llm.check_valid_key', return_value=True)
+    mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
+    mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
+    mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
     return LLMProcessing(model="gemini/gemini-2.0-flash")


@@ -23,24 +23,24 @@ def sample_image():

 def test_validate_model_valid(mocker):
     # Mock the validation functions
-    mocker.patch('llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
-    mocker.patch('llm_loader.llm.supports_vision', return_value=True)
-    mocker.patch('llm_loader.llm.check_valid_key', return_value=True)
+    mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
+    mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
+    mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)

     # Should not raise any exceptions
     LLMProcessing(model="gemini/gemini-2.0-flash")


 def test_validate_model_missing_env_vars(mocker):
-    mocker.patch('llm_loader.llm.validate_environment', return_value={"keys_in_environment": False})
+    mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": False})

     with pytest.raises(ValueError, match="Missing environment variables"):
         LLMProcessing(model="gemini/gemini-2.0-flash")


 def test_validate_model_unsupported_vision(mocker):
-    mocker.patch('llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
-    mocker.patch('llm_loader.llm.supports_vision', return_value=False)
+    mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
+    mocker.patch('smart_llm_loader.llm.supports_vision', return_value=False)

     with pytest.raises(ValueError, match="not a supported vision model"):
         LLMProcessing(model="unsupported-model")

@@ -116,7 +116,7 @@ async def test_async_process_image_with_llm_success(llm_processor, sample_image,
             )
         )
     ]
-    mocker.patch('llm_loader.llm.acompletion', return_value=mock_response)
+    mocker.patch('smart_llm_loader.llm.acompletion', return_value=mock_response)

     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")

@@ -127,7 +127,7 @@ async def test_async_process_image_with_llm_success(llm_processor, sample_image,

 @pytest.mark.asyncio
 async def test_async_process_image_with_llm_error(llm_processor, sample_image, mocker):
-    mocker.patch('llm_loader.llm.acompletion', side_effect=Exception("Test error"))
+    mocker.patch('smart_llm_loader.llm.acompletion', side_effect=Exception("Test error"))

     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")
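Taken together, the public surface is unchanged apart from the names. A minimal end-to-end sketch of the renamed API, using only the constructor arguments and methods exercised in this diff (the file path and model string are illustrative values from the examples):

```python
import asyncio

from smart_llm_loader import SmartLLMLoader

# Constructor arguments mirror those used in the examples and tests above.
loader = SmartLLMLoader(
    file_path="./data/test_ocr_doc.pdf",
    chunk_strategy="contextual",
    model="gemini/gemini-1.5-flash",
)

# Synchronous chunking: returns a list of langchain Document objects.
documents = loader.load_and_split()

# Async counterparts exercised by the tests: aload() and alazy_load().
async def main():
    await loader.aload()
    async for doc in loader.alazy_load():
        print(doc.page_content)

asyncio.run(main())
```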