solygambas
diff --git a/‎06-qa-bot/F1_QA_Assistant.ipynb
+944 b/‎06-qa-bot/F1_QA_Assistant.ipynb
+944
diff --git a/‎06-qa-bot/README.md
+29 b/‎06-qa-bot/README.md
+29
diff --git a/‎06-qa-bot/cache.db
2.48 MB b/‎06-qa-bot/cache.db
2.48 MB
diff --git a/‎06-qa-bot/embeddings.db
46.6 MB b/‎06-qa-bot/embeddings.db
46.6 MB
diff --git a/‎06-qa-bot/f1_2022.csv
+24 b/‎06-qa-bot/f1_2022.csv
+24
diff --git a/‎06-qa-bot/f1_utilities.py
+86 b/‎06-qa-bot/f1_utilities.py
+86
diff --git a/‎06-qa-bot/screenshot.png
72.7 KB b/‎06-qa-bot/screenshot.png
72.7 KB
diff --git a/‎06-qa-bot/utilities.py
+152 b/‎06-qa-bot/utilities.py
+152
diff --git a/‎README.md
+21-1 b/‎README.md
+21-1
@@ -0,0 +1,29 @@
+# Q&A Bot
+
+A dynamic Q&A Bot using GPT-4.
+
+<p align="center">
+    <img src="screenshot.png">
+</p>
+
+## Setup
+
+You need to create a virtual env and install the packages listed in `requirements.txt`. You can then run Jupyter Notebooks in VS Code.
+
+Follow these steps: [How to Work with Python Virtual Environments, Jupyter Notebooks and VS Code](https://python.plainenglish.io/how-to-work-with-python-virtual-environments-jupyter-notebooks-and-vs-code-536fac3d93a1).
+
+You need to create a `.env` file with your `OPENAI_API_KEY`.
+
+## Usage
+
+Open `F1_QA_Assistant.ipynb`.
+
+## Features
+
+- scraping data from Wikipedia.
+- generating a bunch of embeddings on the 2022 Formula One season.
+- turning the questions from users into embeddings.
+- finding the K nearest neighbors to that embedding.
+- including the matching texts in the prompt to expand GPT-4 knowledge.
+
+Based on [Mastering OpenAI Python APIs: Unleash the Power of GPT4](https://www.udemy.com/course/mastering-openai/) by Colt Steele (2023).
@@ -0,0 +1,24 @@
+Link
+2022_Formula_One_World_Championship
+2022_Abu_Dhabi_Grand_Prix
+2022_Sao_Paulo_Grand_Prix
+2022_Mexico_City_Grand_Prix
+2022_United_States_Grand_Prix
+2022_Japanese_Grand_Prix
+2022_Singapore_Grand_Prix
+2022_Italian_Grand_Prix
+2022_Dutch_Grand_Prix
+2022_Belgian_Grand_Prix
+2022_Hungarian_Grand_Prix
+2022_French_Grand_Prix
+2022_Austrian_Grand_Prix
+2022_British_Grand_Prix
+2022_Canadian_Grand_Prix
+2022_Azerbaijan_Grand_Prix
+2022_Monaco_Grand_Prix
+2022_Spanish_Grand_Prix
+2022_Miami_Grand_Prix
+2022_Emilia_Romagna_Grand_Prix
+2022_Australian_Grand_Prix
+2022_Saudi_Arabian_Grand_Prix
+2022_Bahrain_Grand_Prix
@@ -0,0 +1,86 @@
+import fnmatch
+import os
+import re
+from dataclasses import dataclass
+from typing import Generator, Iterable, List
+
+import openai
+import pandas as pd
+import tiktoken
+from dotenv import load_dotenv
+from openai.embeddings_utils import cosine_similarity
+from utilities import get_embedding, num_tokens_from_messages
+
+# Thanks to http://www.oldmanumby.com/ for his remaster and conversion of the Dungeons
+# and Dragons 5th Edition SRD (Systems Reference Document)
+# https://github.com/OldManUmby/DND.SRD.Wiki
+
+# Thanks to Wizards of the Coast for DnD and preserving its openness with the Open Gaming License.
+
+
+@dataclass(frozen=True, repr=True)
+class WikipediaPath:
+    article: str
+    header: str
+
+    def __str__(self):
+        return f"{self.article} - {self.header}"
+
+
+@dataclass(frozen=True, repr=True)
+class Section:
+    """
+    A segment is defined by anything that follows an h1 header (# ...) or
+    an entire document if the file has no h1 headers.
+    """
+
+    location: WikipediaPath
+    text: str
+
+    def __str__(self):
+        return f"{self.location}:\n{self.text}"
+
+
+def wikipedia_splitter(contents: str, article_title: str, token_limit: int, split_point_regexes: List[str]) -> Iterable[Section]:
+    # Take a markdown file and the article split on `==` sections.
+    """
+    Generate sections of Wikipedia pages.
+    :param contents: The contents of the wikipedia page
+    :param article_title: The title of the article, to be included in the emitted section object
+    :param token_limit: The maximum number of tokens to allow in a section
+    :param split_point_regexes: A list of regexes to split on. The first one is the highest precedence.
+        If we can't fit a section into the token limit, we'll split on the next lower regex.
+    """
+    split_point_regex = split_point_regexes[0]
+    sections = re.split(split_point_regex, contents)
+
+    if not sections[0].strip():
+        # Remove the first section if it's empty (this happens when the file starts with a "#" line)
+        sections.pop(0)
+    else:
+        # Otherwise: Wikipedia articles often begin with a section that has no `==` header.
+        first_section = sections.pop(0)
+        yield Section(location=WikipediaPath(article=article_title, header=article_title), text=first_section)
+
+    # And now proceed into splitting sections based on the `==` header
+    for section in sections:
+        if not section.strip():
+            # Remove trailing empty sections.
+            continue
+
+        header = section.splitlines()[0].strip()
+        if "=" in split_point_regex:
+            # If we're splitting on equal-sign headers, then we need to remove the trailing equal signs
+            header = re.sub(r"=+$", "", header).strip()
+
+        # To be better steer embeddings, we include the article's title and section name with one another above the text.
+        emit = Section(location=WikipediaPath(article=article_title, header=header), text=f"{article_title}: {section}")
+
+        if len(str(section).replace("\n", " ")) > token_limit:
+            print(f"Section is too long: {emit.location}, splitting")
+            subtitle = f"{article_title} - {header}"
+            # If the section is too long, split it on a lower precedence split point
+
+            yield from wikipedia_splitter(section, subtitle, token_limit, split_point_regexes[1:])
+        else:
+            yield emit
@@ -0,0 +1,152 @@
+import hashlib
+import json
+import os
+import sqlite3
+import zipfile
+from typing import Dict, List, Tuple, TypeVar
+
+import numpy as np
+import openai
+import tiktoken
+from openai.embeddings_utils import cosine_similarity
+from openai.error import APIConnectionError, APIError, RateLimitError
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
+
+
+def get_file_with_zip_fallback(file_name: str, zip_file_name: str) -> str:
+    # Check if the CSV file exists
+    if not os.path.exists(file_name):
+        # If not, check if the ZIP file exists and unzip it
+        if os.path.exists(zip_file_name):
+            with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
+                zip_ref.extractall()
+        else:
+            raise ValueError(f"Neither {file_name} nor {zip_file_name} were found in the current directory.")
+
+    # Read the contents of the CSV file
+    with open(file_name, "r", encoding="utf-8") as file:
+        contents = file.read()
+
+    return contents
+
+
+# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
+def num_tokens_from_messages(messages: List[Dict], model: str) -> int:
+    """Returns the number of tokens used by a list of messages."""
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model == "gpt-3.5-turbo":
+        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
+    elif model == "gpt-4":
+        return num_tokens_from_messages(messages, model="gpt-4-0314")
+    elif model == "gpt-3.5-turbo-0301":
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1  # if there's a name, the role is omitted
+    elif model == "gpt-4-0314":
+        tokens_per_message = 3
+        tokens_per_name = 1
+    else:
+        raise NotImplementedError(
+            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
+        )
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+        for key, value in message.items():
+            num_tokens += len(encoding.encode(value))
+            if key == "name":
+                num_tokens += tokens_per_name
+    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
+    return num_tokens
+
+
+def memoize_to_sqlite(filename: str = "cache.db"):
+    """
+    Memoization decorator that caches the output of a method in a SQLite database.
+    The database connection is persisted across calls.
+    """
+    db_conn = sqlite3.connect(filename)
+    db_conn.execute("CREATE TABLE IF NOT EXISTS cache (hash TEXT PRIMARY KEY, result TEXT)")
+
+    def memoize(func):
+        def wrapped(*args):
+            # Compute the hash of the argument
+            arg_hash = hashlib.sha256(repr(tuple(args)).encode("utf-8")).hexdigest()
+
+            # Check if the result is already cached
+            cursor = db_conn.cursor()
+            cursor.execute("SELECT result FROM cache WHERE hash = ?", (arg_hash,))
+            row = cursor.fetchone()
+            if row is not None:
+                print(f"Cached result found for {arg_hash}. Returning it.")
+                return json.loads(row[0])
+
+            # Compute the result and cache it
+            result = func(*args)
+            cursor.execute("INSERT INTO cache (hash, result) VALUES (?, ?)", (arg_hash, json.dumps(result)))
+            db_conn.commit()
+
+            return result
+
+        return wrapped
+
+    return memoize
+
+
+# This is not optimized for massive reads and writes, but it's good enough for this example
+@memoize_to_sqlite(filename="embeddings.db")
+@retry(
+    wait=wait_random_exponential(multiplier=1, max=30),
+    stop=stop_after_attempt(3),
+    retry=retry_if_exception_type(APIConnectionError) | retry_if_exception_type(APIError) | retry_if_exception_type(RateLimitError),
+)
+def get_embedding(text: str) -> List[float]:
+    """
+    :param text: The text to compute an embedding for
+    :return: The embedding for the text
+    """
+    # replace newlines, which can negatively affect performance.
+    text_no_newlines = text.replace("\n", " ")
+    print(f"Computing embedding for {text_no_newlines[:50]}")
+    response = openai.Embedding.create(input=text_no_newlines, model="text-embedding-ada-002")
+    embeddings = response["data"][0]["embedding"]
+    return embeddings
+
+
+T = TypeVar("T")  # Declare type variable
+
+
+def get_n_nearest_neighbors(query_embedding: List[float], embeddings: Dict[T, List[float]], n: int) -> List[Tuple[T, float]]:
+    """
+    :param query_embedding: The embedding to find the nearest neighbors for
+    :param embeddings: A dictionary of embeddings, where the keys are the entity type (e.g. Movie, Segment)
+        and the values are the that entity's embeddings
+    :param n: The number of nearest neighbors to return
+    :return: A list of tuples, where the first element is the entity and the second element is the cosine
+        similarity between -1 and 1
+    """
+
+    # This is not optimized for rapid indexing, but it's good enough for this example
+    # If you're using this in production, you should use a more efficient vector datastore such as
+    # those mentioned specifically by OpenAI here
+    #
+    #  https://platform.openai.com/docs/guides/embeddings/how-can-i-retrieve-k-nearest-embedding-vectors-quickly
+    #
+    #  * Pinecone, a fully managed vector database
+    #  * Weaviate, an open-source vector search engine
+    #  * Redis as a vector database
+    #  * Qdrant, a vector search engine
+    #  * Milvus, a vector database built for scalable similarity search
+    #  * Chroma, an open-source embeddings store
+    #
+
+    target_embedding = np.array(query_embedding)
+
+    similarities = [(segment, cosine_similarity(target_embedding, np.array(embedding))) for segment, embedding in embeddings.items()]
+
+    # Sort by similarity and get the top n results
+    nearest_neighbors = sorted(similarities, key=lambda x: x[1], reverse=True)[:n]
+
+    return nearest_neighbors
@@ -1,6 +1,6 @@
 # OpenAI Projects
 
-5 projects using OpenAI APIs with Python.
+6 projects using OpenAI APIs with Python.
 
 ## Setup
 
@@ -112,6 +112,26 @@ An embedding-powered movie recommendation algorithm using Nomic Atlas.
 - visualizing our embeddings with Atlas.
 - recommending movies using our embeddings.
 
+## Q&A Bot
+
+A dynamic Q&A Bot using GPT-4.
+
+[Check the 06-qa-bot folder](06-qa-bot)
+
+<p align="center">
+    <a href="06-qa-bot">
+        <img src="06-qa-bot/screenshot.png">
+    </a>
+</p>
+
+### Features
+
+- scraping data from Wikipedia.
+- generating a bunch of embeddings on the 2022 Formula One season.
+- turning the questions from users into embeddings.
+- finding the K nearest neighbors to that embedding.
+- including the matching texts in the prompt to expand GPT-4 knowledge.
+
 ## Playground
 
 [Check the playground](playground/) to understand the basics.