Skip to content

Commit 686e499

Browse files
committed
add reproducible project
1 parent 17748ae commit 686e499

File tree

13 files changed

+4016
-0
lines changed

13 files changed

+4016
-0
lines changed

ch6/README.md

Whitespace-only changes.

ch6/benzinga_pipeline.png

72 KB
Loading

ch6/docker-compose.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Single-node Elasticsearch for local development (used by indexingpipeline.py
# via hosts="http://localhost:9200").
services:
  elasticsearch:
    image: "docker.elastic.co/elasticsearch/elasticsearch:8.11.1"
    ports:
      - 9200:9200
    restart: on-failure
    environment:
      # Dev-only settings: no cluster discovery, security disabled
      # (plain HTTP, no auth), fixed 1 GiB JVM heap.
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
    healthcheck:
      # Healthy once the cluster-health endpoint responds.
      test: curl --fail http://localhost:9200/_cat/health || exit 1
      interval: 10s
      timeout: 1s
      retries: 10

ch6/indexingpipeline.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from haystack.components.preprocessors import DocumentCleaner
2+
from haystack.components.embedders import OpenAIDocumentEmbedder
3+
from haystack import Pipeline
4+
from haystack.components.embedders import OpenAIDocumentEmbedder
5+
from haystack.components.preprocessors import DocumentCleaner
6+
from haystack.components.preprocessors import DocumentSplitter
7+
from haystack.components.writers import DocumentWriter
8+
from haystack.document_stores.types import DuplicatePolicy
9+
from haystack.document_stores.in_memory import InMemoryDocumentStore
10+
from haystack.utils import Secret
11+
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
12+
13+
14+
from haystack import component, Document
15+
from typing import Any, Dict, List, Optional, Union
16+
from haystack.dataclasses import ByteStream
17+
18+
import json
19+
from dotenv import load_dotenv
20+
import os
21+
22+
import re
23+
from bs4 import BeautifulSoup
24+
from pathlib import Path
25+
26+
import logging
27+
28+
load_dotenv(".env")
29+
open_ai_key = os.environ.get("OPENAI_API_KEY")
30+
31+
logging.basicConfig(level=logging.INFO)
32+
logger = logging.getLogger(__name__)
33+
34+
import json
35+
36+
def read_jsonl_file(file_path):
    """
    Read a JSONL (JSON Lines) file and return a list of dictionaries,
    one per valid JSON object.

    Lines with JSON decoding errors are reported and skipped; blank
    lines are skipped silently. A missing file is reported and yields
    an empty list.

    :param file_path: The path to the JSONL file.
    :return: A list of dictionaries, each representing a parsed JSON object.
    """
    data = []

    try:
        # Explicit encoding: JSONL is UTF-8 by convention; relying on the
        # platform default locale can corrupt reads (e.g. on Windows).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # A trailing newline / blank line is not a bad record;
                # skip it without reporting a decode error.
                if not line.strip():
                    continue
                try:
                    # Attempt to load the JSON data from the current line
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Report but keep going: one bad record should not
                    # abort the whole ingest.
                    print(f"Error decoding JSON on line: {line[:30]}... - {e}")
    except FileNotFoundError as e:
        print(f"File not found: {e}")

    return data
@component
class BenzingaNews:
    """Haystack component that converts raw Benzinga news records into Documents.

    Every string field of each record is cleaned (HTML stripped, whitespace
    collapsed); records whose cleaned 'content' is empty are dropped.
    """

    @component.output_types(documents=List[Document])
    def run(self, sources: Dict[str, Any]) -> Dict[str, List[Document]]:
        """Build one Document per usable source record.

        :param sources: iterable of news-record dicts; each is expected to
            carry a 'content' key — TODO confirm schema against the feed.
        :return: {"documents": [...]} matching the declared output_types.
            (The original annotated ``-> None``, which contradicted the
            actual return value.)
        """
        documents = []
        for source in sources:
            # Clean every string field in place; non-string values
            # (timestamps, ids, ...) pass through untouched.
            # isinstance (not type ==) so str subclasses are cleaned too.
            for key in source:
                if isinstance(source[key], str):
                    source[key] = self.clean_text(source[key])

            # Skip records with no usable body text after cleaning.
            if source['content'] == "":
                continue

            content = source['content']
            # The full record (content included) rides along as metadata.
            document = Document(content=content, meta=source)
            documents.append(document)

        return {"documents": documents}

    def clean_text(self, text):
        """Strip HTML tags and collapse whitespace runs to single spaces."""
        # Remove HTML tags using BeautifulSoup
        soup = BeautifulSoup(text, "html.parser")
        text = soup.get_text()
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
@component
class BenzingaEmbeder:
    """Indexing pipeline wrapped as a Haystack component.

    Construction wires: BenzingaNews -> DocumentCleaner -> DocumentSplitter
    -> OpenAIDocumentEmbedder -> DocumentWriter (Elasticsearch sink,
    overwriting duplicates).
    """

    def __init__(self):
        # Sink: local Elasticsearch (see docker-compose.yml).
        store = ElasticsearchDocumentStore(
            embedding_similarity_function="cosine",
            hosts="http://localhost:9200",
        )

        news_converter = BenzingaNews()
        cleaner = DocumentCleaner(
            remove_empty_lines=True,
            remove_extra_whitespaces=True,
            remove_repeated_substrings=False,
        )
        splitter = DocumentSplitter(split_by="passage", split_length=5)
        embedder = OpenAIDocumentEmbedder(api_key=Secret.from_token(open_ai_key))
        writer = DocumentWriter(document_store=store, policy=DuplicatePolicy.OVERWRITE)

        self.pipeline = Pipeline()
        # Register every stage under the name its connections refer to.
        for stage_name, stage in (
            ("get_news", news_converter),
            ("document_cleaner", cleaner),
            ("document_splitter", splitter),
            ("embedding", embedder),
            ("document_writer", writer),
        ):
            self.pipeline.add_component(stage_name, stage)

        self.pipeline.connect("get_news", "document_cleaner")
        self.pipeline.connect("document_cleaner", "document_splitter")
        self.pipeline.connect("document_splitter", "embedding")
        self.pipeline.connect("embedding", "document_writer")

    @component.output_types(documents=List[Document])
    def run(self, event: List[Union[str, Path, ByteStream]]):
        """Push one raw news record through the indexing pipeline.

        :param event: a single raw record; wrapped in a list because
            BenzingaNews.run iterates over its ``sources`` argument.
        :return: the pipeline's run output.
        """
        documents = self.pipeline.run({"get_news": {"sources": [event]}})
        # NOTE(review): this re-renders the same diagram on every call —
        # presumably a debugging aid; confirm whether it belongs in __init__.
        self.pipeline.draw("benzinga_pipeline.png")
        return documents
def main():
    """Index every record from ./news_out.jsonl through the Benzinga pipeline."""
    document_embedder = BenzingaEmbeder()
    data = read_jsonl_file("./news_out.jsonl")

    for ite in data:
        print(document_embedder.run(ite))


# Guard so importing this module does not trigger a full re-index
# (the original ran the whole ingest at import time).
if __name__ == "__main__":
    main()

ch6/justfile

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Developer task runner (https://just.systems/): venv bootstrap and
# dependency compilation via `uv`.

# Show this help list
help:
    @echo 'Run `just get-started` to init a development env.'
    @just --list

# Init a development env
get-started:
    @echo 'Checking that you have `uv` installed'
    @echo 'If you need it, I recommend installing `pipx` from https://pipx.pypa.io/stable/ then `pipx install uv`'
    uv --version
    @echo 'Checking that you have Python 3.12 installed'
    @echo 'If you need it, I recommend installing `pyenv` from https://github.com/pyenv/pyenv then `pyenv install 3.12`'
    @echo 'You also might need to activate the global shim with `pyenv global system 3.12`'
    python3.12 --version
    @echo 'Creating the development virtual env in `venvs/dev/`'
    mkdir -p venvs
    test -d venvs/dev/ || uv venv -p 3.12 venvs/dev/
    @echo 'Compiling all dependencies'
    just venv-compile-all
    @echo 'Installing all the tools and dependencies'
    just venv-sync dev
    @echo 'All done!'
    @echo 'Each time before you do any work in this repo you should run `. venvs/dev/bin/activate`'
    @echo 'Once the `dev` venv is activated, run:'
    @echo
    @echo '`just develop` to re-build Bytewax and install it in the venv'
    @echo '`just test-py` to run the Python test suite'
    @echo '`just lint` to lint the source code'
    @echo '`just --list` to show more advanced recipes'

# Assert we are in a venv.
_assert-venv:
    #!/usr/bin/env python
    import sys
    p = sys.prefix
    if not (p.endswith("venvs/dev") or p.endswith("venv")):
        print("You must activate the `dev` venv with `. venvs/dev/bin/activate` before running this command", file=sys.stderr)
        sys.exit(1)

# Install the library locally in an editable state
develop: _assert-venv
    @# You never need to run with `-E` / `--extras`; the `dev` and test
    @# virtualenvs already have the optional dependencies pinned.
    uv pip install -e .

# Sync the named venv from its pinned requirements file
venv-sync venv:
    VIRTUAL_ENV={{justfile_directory()}}/venvs/{{venv}} uv pip sync --strict requirements/{{venv}}.txt

# Sync all venvs
venv-sync-all: (venv-sync "doc") (venv-sync "dev")

# Pin dependencies (with hashes) for every supported Python version
venv-compile-all:
    uv pip compile --generate-hashes -p 3.8 --all-extras pyproject.toml -o requirements/lib-py3.8.txt
    uv pip compile --generate-hashes -p 3.9 --all-extras pyproject.toml -o requirements/lib-py3.9.txt
    uv pip compile --generate-hashes -p 3.10 --all-extras pyproject.toml -o requirements/lib-py3.10.txt
    uv pip compile --generate-hashes -p 3.11 --all-extras pyproject.toml -o requirements/lib-py3.11.txt
    uv pip compile --generate-hashes -p 3.12 --all-extras pyproject.toml -o requirements/lib-py3.12.txt

0 commit comments

Comments
 (0)