import json
import logging
import os
import re
from typing import Any, Dict, List

from bs4 import BeautifulSoup
from dotenv import load_dotenv

from haystack import Document, Pipeline, component
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import Secret
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

load_dotenv(".env")
open_ai_key = os.environ.get("OPENAI_API_KEY")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def read_jsonl_file(file_path):
    """
    Read a JSONL (JSON Lines) file and return a list of dictionaries,
    one per valid JSON object. Lines with JSON decoding errors are skipped.

    :param file_path: The path to the JSONL file.
    :return: A list of dictionaries, each representing a parsed JSON object.
    """
    data = []

    try:
        with open(file_path, "r") as file:
            for line in file:
                try:
                    # Attempt to load the JSON data from the current line
                    json_data = json.loads(line)
                    data.append(json_data)
                except json.JSONDecodeError as e:
                    # Log an error message for any line that can't be decoded
                    logger.error(f"Error decoding JSON on line: {line[:30]}... - {e}")
    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")

    return data
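
# For reference, the record shape this loader is expected to yield, inferred
# from the fields the pipeline below actually reads ("headline" and "content";
# any other keys simply pass through as document metadata). The values shown
# here are illustrative, not real data:
# {"headline": "Example headline", "content": "<p>Article body ...</p>"}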


@component
class BenzingaNews:

    @component.output_types(documents=List[Document])
    def run(self, sources: List[Dict[str, Any]]) -> Dict[str, List[Document]]:
        logger.info("Starting BenzingaNews.run with sources")
        documents = []
        try:
            for source in sources:
                logger.debug(f"Processing source: {source.get('headline', 'Unknown headline')}")
                # Clean every string field of the article in place
                for key in source:
                    if isinstance(source[key], str):
                        source[key] = self.clean_text(source[key])

                if source["content"] == "":
                    logger.warning(f"Skipping source due to empty content: {source.get('headline', 'Unknown headline')}")
                    continue

                # Create a Document with the cleaned content and the full article as metadata
                content = source["content"]
                document = Document(content=content, meta=source)
                documents.append(document)

            logger.info(f"Successfully processed {len(documents)} documents.")

        except Exception as e:
            logger.error(f"Error during BenzingaNews.run: {e}")

        return {"documents": documents}

    def clean_text(self, text):
        logger.debug("Cleaning text content.")
        try:
            # Remove HTML tags using BeautifulSoup
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()
            # Collapse extra whitespace into single spaces
            text = re.sub(r"\s+", " ", text).strip()
            logger.debug("Text cleaned successfully.")
        except Exception as e:
            logger.error(f"Error during text cleaning: {e}")
            raise
        return text


@component
class BenzingaEmbeder:

    def __init__(self):
        logger.info("Initializing BenzingaEmbeder pipeline.")
        try:
            get_news = BenzingaNews()
            document_store = ElasticsearchDocumentStore(
                embedding_similarity_function="cosine",
                hosts="http://localhost:9200",
            )
            document_cleaner = DocumentCleaner(
                remove_empty_lines=True,
                remove_extra_whitespaces=True,
                remove_repeated_substrings=False,
            )
            document_splitter = DocumentSplitter(split_by="passage", split_length=5)
            document_writer = DocumentWriter(
                document_store=document_store,
                policy=DuplicatePolicy.OVERWRITE,
            )
            embedding = OpenAIDocumentEmbedder(api_key=Secret.from_token(open_ai_key))

            self.pipeline = Pipeline()
            self.pipeline.add_component("get_news", get_news)
            self.pipeline.add_component("document_cleaner", document_cleaner)
            self.pipeline.add_component("document_splitter", document_splitter)
            self.pipeline.add_component("embedding", embedding)
            self.pipeline.add_component("document_writer", document_writer)

            self.pipeline.connect("get_news", "document_cleaner")
            self.pipeline.connect("document_cleaner", "document_splitter")
            self.pipeline.connect("document_splitter", "embedding")
            self.pipeline.connect("embedding", "document_writer")

            logger.info("Pipeline initialized successfully.")
        except Exception as e:
            logger.error(f"Error during BenzingaEmbeder initialization: {e}")
            raise

    @component.output_types(documents=List[Document])
    def run(self, event: Dict[str, Any]):
        logger.info(f"Running BenzingaEmbeder with event: {event}")
        try:
            # BenzingaNews expects a list of article dicts, so the single
            # event is wrapped in a list here.
            documents = self.pipeline.run({"get_news": {"sources": [event]}})
            self.pipeline.draw("benzinga_pipeline.png")
            logger.info("Pipeline executed successfully; graph saved to benzinga_pipeline.png.")
            return documents
        except Exception as e:
            logger.error(f"Error during pipeline execution: {e}")
            raise
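

# --- Usage sketch ---
# A minimal example of how these pieces fit together. The file name
# "news_out.jsonl" is a hypothetical placeholder; the sketch also assumes a
# reachable Elasticsearch instance at http://localhost:9200 and a valid
# OPENAI_API_KEY in .env, as the code above already requires.
if __name__ == "__main__":
    embedder = BenzingaEmbeder()
    articles = read_jsonl_file("news_out.jsonl")  # hypothetical path
    for article in articles:
        # Each record is one article dict; BenzingaEmbeder.run wraps it in a
        # list before handing it to BenzingaNews.
        embedder.run(article)
        logger.info(f"Indexed article: {article.get('headline', 'Unknown headline')}")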