PacktPublishing
diff --git a/‎ch6/README.md‎ b/‎ch6/README.md‎
diff --git a/‎ch6/benzinga_pipeline.png‎
72 KB b/‎ch6/benzinga_pipeline.png‎
72 KB
diff --git a/‎ch6/docker-compose.yml‎
Lines changed: 15 additions & 0 deletions b/‎ch6/docker-compose.yml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎ch6/indexingpipeline.py‎
Lines changed: 137 additions & 0 deletions b/‎ch6/indexingpipeline.py‎
Lines changed: 137 additions & 0 deletions
diff --git a/‎ch6/justfile‎
Lines changed: 60 additions & 0 deletions b/‎ch6/justfile‎
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,15 @@
+# Single-node Elasticsearch used by the ch6 indexing pipeline, which connects
+# to http://localhost:9200.
+services:
+  elasticsearch:
+    image: "docker.elastic.co/elasticsearch/elasticsearch:8.11.1"
+    ports:
+      # Quoted so YAML always reads the HOST:CONTAINER mapping as a string.
+      - "9200:9200"
+    restart: on-failure
+    environment:
+      - discovery.type=single-node
+      # Security is switched off here, so do not expose this port beyond
+      # a local development machine.
+      - xpack.security.enabled=false
+      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
+    healthcheck:
+      test: curl --fail http://localhost:9200/_cat/health || exit 1
+      interval: 10s
+      timeout: 1s
+      retries: 10
@@ -0,0 +1,137 @@
+from haystack.components.preprocessors import DocumentCleaner
+from haystack.components.embedders import OpenAIDocumentEmbedder
+from haystack import Pipeline
+from haystack.components.embedders import OpenAIDocumentEmbedder
+from haystack.components.preprocessors import DocumentCleaner
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack.document_stores.types import DuplicatePolicy
+from haystack.document_stores.in_memory import InMemoryDocumentStore
+from haystack.utils import Secret
+from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
+
+
+from haystack import component, Document
+from typing import Any, Dict, List, Optional, Union
+from haystack.dataclasses import ByteStream
+
+import json
+from dotenv import load_dotenv
+import os
+
+import re
+from bs4 import BeautifulSoup
+from pathlib import Path
+
+import logging
+
+load_dotenv(".env")
+open_ai_key = os.environ.get("OPENAI_API_KEY")
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+import json
+
+def read_jsonl_file(file_path):
+    """
+    Reads a JSONL (JSON Lines) file and returns a list of dictionaries representing each valid JSON object.
+    Lines with JSON decoding errors are skipped.
+    
+    :param file_path: The path to the JSONL file.
+    :return: A list of dictionaries, each representing a parsed JSON object.
+    """
+    data = []
+    
+    try:
+        with open(file_path, 'r') as file:
+            for line in file:
+                try:
+                    # Attempt to load the JSON data from the current line
+                    json_data = json.loads(line)
+                    data.append(json_data)
+                except json.JSONDecodeError as e:
+                    # Print an error message for any lines that can't be decoded
+                    print(f"Error decoding JSON on line: {line[:30]}... - {e}")
+    except FileNotFoundError as e:
+        print(f"File not found: {e}")
+    
+    return data
+
+        
+@component
+class BenzingaNews:
+    
+    @component.output_types(documents=List[Document])
+    def run(self, sources: Dict[str, Any]) -> None:
+             
+        documents = []
+        for source in sources:
+        
+            for key in source:
+                if type(source[key]) == str:
+                    source[key] = self.clean_text(source[key])
+                    
+            if source['content'] == "":
+                continue
+
+            #drop content from source dictionary
+            content = source['content']
+            document = Document(content=content, meta=source) 
+            
+            documents.append(document)
+         
+        return {"documents": documents}
+               
+    def clean_text(self, text):
+        # Remove HTML tags using BeautifulSoup
+        soup = BeautifulSoup(text, "html.parser")
+        text = soup.get_text()
+        # Remove extra whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+    
+@component
+class BenzingaEmbeder:
+    
+    def __init__(self):
+        get_news = BenzingaNews()
+        document_store = ElasticsearchDocumentStore(embedding_similarity_function="cosine", hosts = "http://localhost:9200")
+        document_cleaner = DocumentCleaner(
+                            remove_empty_lines=True,
+                            remove_extra_whitespaces=True,
+                            remove_repeated_substrings=False
+                        )
+        document_splitter = DocumentSplitter(split_by="passage", split_length=5)
+        document_writer = DocumentWriter(document_store=document_store,
+                                        policy = DuplicatePolicy.OVERWRITE)
+        embedding = OpenAIDocumentEmbedder(api_key=Secret.from_token(open_ai_key))
+
+        self.pipeline = Pipeline()
+        self.pipeline.add_component("get_news", get_news)
+        self.pipeline.add_component("document_cleaner", document_cleaner)
+        self.pipeline.add_component("document_splitter", document_splitter)
+        self.pipeline.add_component("embedding", embedding)
+        self.pipeline.add_component("document_writer", document_writer)
+
+        self.pipeline.connect("get_news", "document_cleaner")
+        self.pipeline.connect("document_cleaner", "document_splitter")
+        self.pipeline.connect("document_splitter", "embedding")
+        self.pipeline.connect("embedding", "document_writer")
+        
+        
+    @component.output_types(documents=List[Document])
+    def run(self, event: List[Union[str, Path, ByteStream]]):
+        
+        documents = self.pipeline.run({"get_news": {"sources": [event]}})
+        
+        self.pipeline.draw("benzinga_pipeline.png")
+        return documents
+    
+
+document_embedder = BenzingaEmbeder()
+data = read_jsonl_file("./news_out.jsonl")
+
+
+for ite in data:
+    print(document_embedder.run(ite))
@@ -0,0 +1,60 @@
+# First recipe in the file: prints a getting-started hint, then lists recipes.
+# Show this help list
+help:
+    @echo 'Run `just get-started` to init a development env.'
+    @just --list
+
+# Checks for `uv` and Python 3.12, creates venvs/dev if it is missing, compiles
+# the pinned requirements and syncs the dev venv.
+# Init a development env
+get-started:
+    @echo 'Checking that you have `uv` installed'
+    @echo 'If you need it, I recommend installing `pipx` from https://pipx.pypa.io/stable/ then `pipx install uv`'
+    uv --version
+    @echo 'Checking that you have Python 3.12 installed'
+    @echo 'If you need it, I recommend installing `pyenv` from https://github.com/pyenv/pyenv then `pyenv install 3.12`'
+    @echo 'You also might need to activate the global shim with `pyenv global system 3.12`'
+    python3.12 --version
+    @echo 'Creating the development virtual env in `venvs/dev/`'
+    mkdir -p venvs
+    test -d venvs/dev/ || uv venv -p 3.12 venvs/dev/
+    @echo 'Compiling all dependencies'
+    just venv-compile-all
+    @echo 'Installing all the tools and dependencies'
+    just venv-sync dev
+    @echo 'All done!'
+    @echo 'Each time before you do any work in this repo you should run `. venvs/dev/bin/activate`'
+    @echo 'Once the `dev` venv is activated, run:'
+    @echo
+    @echo '`just develop` to install this project into the venv in editable mode'
+    @echo '`just --list` to show more advanced recipes'
+
+# Assert we are in a venv.
+_assert-venv:
+    #!/usr/bin/env python
+    # Exit with status 1 unless the interpreter prefix ends with "venvs/dev" or "venv".
+    import sys
+    if not sys.prefix.endswith(("venvs/dev", "venv")):
+        print("You must activate the `dev` venv with `. venvs/dev/bin/activate` before running this command", file=sys.stderr)
+        sys.exit(1)
+
+
+# Depends on `_assert-venv`, so it aborts unless a project venv is active.
+# Install the library locally in an editable state
+develop: _assert-venv
+    @# You never need to run with `-E` / `--extras`; the `dev` and test
+    @# virtualenvs already have the optional dependencies pinned.
+    uv pip install -e .
+
+# Sync venvs/<venv> to the pinned packages in requirements/<venv>.txt
+venv-sync venv:
+    VIRTUAL_ENV={{justfile_directory()}}/venvs/{{venv}} uv pip sync --strict requirements/{{venv}}.txt
+
+# NOTE(review): `get-started` only creates venvs/dev; the "doc" venv and
+# requirements/doc.txt must come from elsewhere -- confirm they exist.
+# Sync all venvs
+venv-sync-all: (venv-sync "doc") (venv-sync "dev")
+
+
+# NOTE(review): the outputs are requirements/lib-py<version>.txt, whereas
+# `venv-sync` reads requirements/<venv>.txt -- confirm where dev.txt comes from.
+# Compile hashed requirement pins from pyproject.toml for Python 3.8 to 3.12
+venv-compile-all:
+    uv pip compile --generate-hashes -p 3.8 --all-extras pyproject.toml -o requirements/lib-py3.8.txt
+    uv pip compile --generate-hashes -p 3.9 --all-extras pyproject.toml -o requirements/lib-py3.9.txt
+    uv pip compile --generate-hashes -p 3.10 --all-extras pyproject.toml -o requirements/lib-py3.10.txt
+    uv pip compile --generate-hashes -p 3.11 --all-extras pyproject.toml -o requirements/lib-py3.11.txt
+    uv pip compile --generate-hashes -p 3.12 --all-extras pyproject.toml -o requirements/lib-py3.12.txt
+