Commit d4fdb34

init chapter 7 material
1 parent 3e82dac commit d4fdb34

6 files changed, +776 -0 lines changed

ch6/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ dependencies = [
     "trafilatura",
     "wandb",
     "ragas-haystack",
+    "bytewax>=0.21"
 ]

 [build-system]
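
The new bytewax>=0.21 dependency points at streaming ingestion for the chapter 7 material. As rough orientation only (nothing below is from this commit; the source items and step names are invented), a minimal bytewax 0.21 dataflow looks like this:

# Minimal bytewax 0.21 dataflow sketch: read items from a test source,
# transform them, and print the results. Step names are arbitrary.
import bytewax.operators as op
from bytewax.connectors.stdio import StdOutSink
from bytewax.dataflow import Dataflow
from bytewax.testing import TestingSource, run_main

flow = Dataflow("news_demo")
# In a real setup this would presumably be a stream of news records;
# here we use a static in-memory test source.
articles = op.input("articles", flow, TestingSource([{"headline": "demo"}]))
headlines = op.map("extract_headline", articles, lambda a: a["headline"])
op.output("stdout", headlines, StdOutSink())

if __name__ == "__main__":
    run_main(flow)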

ch6/requirements/doc.txt

Whitespace-only changes.

ch7/api-dockerization/app.py

Whitespace-only changes.
Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import Secret
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore

from haystack import component, Document
from typing import Any, Dict, List, Union
from haystack.dataclasses import ByteStream

import json
from dotenv import load_dotenv
import os

import re
from bs4 import BeautifulSoup
from pathlib import Path

import logging

load_dotenv(".env")
open_ai_key = os.environ.get("OPENAI_API_KEY")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def read_jsonl_file(file_path):
    """
    Reads a JSONL (JSON Lines) file and returns a list of dictionaries,
    one per valid JSON object. Lines with JSON decoding errors are skipped.

    :param file_path: The path to the JSONL file.
    :return: A list of dictionaries, each representing a parsed JSON object.
    """
    data = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                try:
                    # Attempt to load the JSON data from the current line
                    json_data = json.loads(line)
                    data.append(json_data)
                except json.JSONDecodeError as e:
                    # Log an error message for any lines that can't be decoded
                    logger.error(f"Error decoding JSON on line: {line[:30]}... - {e}")
    except FileNotFoundError as e:
        logger.error(f"File not found: {e}")

    return data


@component
class BenzingaNews:

    @component.output_types(documents=List[Document])
    def run(self, sources: List[Dict[str, Any]]) -> Dict[str, List[Document]]:
        logger.info("Starting BenzingaNews.run with sources")
        documents = []
        try:
            for source in sources:
                logger.debug(f"Processing source: {source.get('headline', 'Unknown headline')}")
                # Clean every string field in the record
                for key in source:
                    if isinstance(source[key], str):
                        source[key] = self.clean_text(source[key])

                if source['content'] == "":
                    logger.warning(f"Skipping source due to empty content: {source.get('headline', 'Unknown headline')}")
                    continue

                # Create a Document with the cleaned content and metadata
                content = source['content']
                document = Document(content=content, meta=source)
                documents.append(document)

            logger.info(f"Successfully processed {len(documents)} documents.")

        except Exception as e:
            logger.error(f"Error during BenzingaNews.run: {e}")

        return {"documents": documents}

    def clean_text(self, text):
        logger.debug("Cleaning text content.")
        try:
            # Remove HTML tags using BeautifulSoup
            soup = BeautifulSoup(text, "html.parser")
            text = soup.get_text()
            # Collapse extra whitespace
            text = re.sub(r'\s+', ' ', text).strip()
            logger.debug("Text cleaned successfully.")
        except Exception as e:
            logger.error(f"Error during text cleaning: {e}")
            raise
        return text


@component
class BenzingaEmbeder:

    def __init__(self):
        logger.info("Initializing BenzingaEmbeder pipeline.")
        try:
            get_news = BenzingaNews()
            document_store = ElasticsearchDocumentStore(embedding_similarity_function="cosine", hosts="http://localhost:9200")
            document_cleaner = DocumentCleaner(
                remove_empty_lines=True,
                remove_extra_whitespaces=True,
                remove_repeated_substrings=False
            )
            document_splitter = DocumentSplitter(split_by="passage", split_length=5)
            document_writer = DocumentWriter(document_store=document_store,
                                             policy=DuplicatePolicy.OVERWRITE)
            embedding = OpenAIDocumentEmbedder(api_key=Secret.from_token(open_ai_key))

            self.pipeline = Pipeline()
            self.pipeline.add_component("get_news", get_news)
            self.pipeline.add_component("document_cleaner", document_cleaner)
            self.pipeline.add_component("document_splitter", document_splitter)
            self.pipeline.add_component("embedding", embedding)
            self.pipeline.add_component("document_writer", document_writer)

            self.pipeline.connect("get_news", "document_cleaner")
            self.pipeline.connect("document_cleaner", "document_splitter")
            self.pipeline.connect("document_splitter", "embedding")
            self.pipeline.connect("embedding", "document_writer")

            logger.info("Pipeline initialized successfully.")
        except Exception as e:
            logger.error(f"Error during BenzingaEmbeder initialization: {e}")
            raise

    @component.output_types(documents=List[Document])
    def run(self, event: Dict[str, Any]):
        # `event` is a single news record; it is wrapped in a list before
        # being handed to BenzingaNews, which expects a list of sources.
        logger.info(f"Running BenzingaEmbeder with event: {event}")
        try:
            documents = self.pipeline.run({"get_news": {"sources": [event]}})
            # Render the pipeline graph for inspection
            self.pipeline.draw("benzinga_pipeline.png")
            logger.info("Pipeline executed successfully; pipeline graph drawn.")
            return documents
        except Exception as e:
            logger.error(f"Error during pipeline execution: {e}")
            raise
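
A short driver for this module might look as follows. This is a sketch, assuming it runs in the same module as BenzingaEmbeder and read_jsonl_file above; the news.jsonl path is an assumption, not part of this commit:

# Hypothetical driver: the news.jsonl path is invented for illustration.
embedder = BenzingaEmbeder()
for record in read_jsonl_file("news.jsonl"):
    # Each JSONL record is cleaned, split, embedded, and indexed
    # into the local Elasticsearch instance configured above.
    embedder.run(record)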
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack import Pipeline
from haystack.components.embedders import OpenAITextEmbedder
from haystack.utils import Secret
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

from dotenv import load_dotenv
import os
import wandb  # imported for experiment tracking; not used in this script yet
import time

load_dotenv(".env")
open_ai_key = os.environ.get("OPENAI_API_KEY")

# Initialize ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(hosts="http://localhost:9200")

# Initialize a text embedder to create an embedding for the user query.
text_embedder = OpenAITextEmbedder(api_key=Secret.from_token(open_ai_key))

# Initialize retriever
retriever = ElasticsearchEmbeddingRetriever(document_store=document_store)

# Define the prompt template
template = """
Given the following information, answer the question.
Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}
Question: {{question}}
Answer:
"""
prompt_builder = PromptBuilder(template=template)

# Initialize the generator; the API key must be passed as a Secret at
# construction time (assigning generator.api_key after init has no effect)
generator = OpenAIGenerator(model="gpt-4o-mini", api_key=Secret.from_token(open_ai_key))

# Build the pipeline
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.add_component("prompt_builder", prompt_builder)
query_pipeline.add_component("llm", generator)
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever", "prompt_builder.documents")
query_pipeline.connect("prompt_builder", "llm")

if __name__ == "__main__":
    query_pipeline.draw(path="query_pipeline.png")

    # Run the pipeline and time the round trip
    question = "Tell me about what you know"
    start_time = time.time()
    response = query_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})
    end_time = time.time()

    print(response["llm"]["replies"][0])
    print(f"Query answered in {end_time - start_time:.2f} seconds")
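
Given the ch7/api-dockerization/ directory introduced above, app.py (whitespace-only in this commit) will presumably serve this query pipeline over HTTP. A minimal sketch of what that could look like with FastAPI; the module name, route, and request model are invented here, not defined anywhere in this commit:

# Hypothetical app.py contents; nothing in this commit defines these names.
from fastapi import FastAPI
from pydantic import BaseModel

from retriever import query_pipeline  # hypothetical module holding the pipeline above

app = FastAPI()


class Query(BaseModel):
    question: str


@app.post("/ask")
def ask(query: Query):
    # Run the full embed -> retrieve -> prompt -> generate pipeline
    # for a single question and return the first reply.
    result = query_pipeline.run({
        "text_embedder": {"text": query.question},
        "prompt_builder": {"question": query.question},
    })
    return {"answer": result["llm"]["replies"][0]}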
