solygambas
diff --git a/‎05-movie-recommendation-system/README.md
+27 b/‎05-movie-recommendation-system/README.md
+27
diff --git a/‎playground/datasets/movie_plots.csv renamed to ‎05-movie-recommendation-system/datasets/movie_plots.csv b/‎playground/datasets/movie_plots.csv renamed to ‎05-movie-recommendation-system/datasets/movie_plots.csv
diff --git a/‎playground/embeddings/movie_embeddings_cache.pkl renamed to ‎05-movie-recommendation-system/embeddings/movie_embeddings_cache.pkl b/‎playground/embeddings/movie_embeddings_cache.pkl renamed to ‎05-movie-recommendation-system/embeddings/movie_embeddings_cache.pkl
diff --git a/‎05-movie-recommendation-system/movie-recommendation.ipynb
+327 b/‎05-movie-recommendation-system/movie-recommendation.ipynb
+327
diff --git a/‎playground/embeddings/screenshot.png renamed to ‎05-movie-recommendation-system/screenshot.png b/‎playground/embeddings/screenshot.png renamed to ‎05-movie-recommendation-system/screenshot.png
diff --git a/‎README.md
+21-1 b/‎README.md
+21-1
@@ -0,0 +1,27 @@
+# Movie Recommendation System
+
+An embedding-powered movie recommendation algorithm using Nomic Atlas.
+
+<p align="center">
+    <img src="screenshot.png">
+</p>
+
+## Setup
+
+You need to create a virtual environment and install the packages listed in `requirements.txt`. You can then run the Jupyter notebooks in VS Code.
+
+Follow these steps: [How to Work with Python Virtual Environments, Jupyter Notebooks and VS Code](https://python.plainenglish.io/how-to-work-with-python-virtual-environments-jupyter-notebooks-and-vs-code-536fac3d93a1).
+
+You need to create a `.env` file with your `OPENAI_API_KEY`.
+
+You also need to create an account on [Nomic](https://atlas.nomic.ai/cli-login) and authenticate from your terminal: run `nomic login` to get your API token, then run `nomic login [token]` with that token.
+
+## Features
+
+- getting our movie data.
+- getting our movie data ready.
+- generating embeddings for 50 movies.
+- visualizing our embeddings with Atlas.
+- recommending movies using our embeddings.
+
+Based on [Mastering OpenAI Python APIs: Unleash the Power of GPT4](https://www.udemy.com/course/mastering-openai/) by Colt Steele (2023).
@@ -0,0 +1,327 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dotenv import dotenv_values\n",
+    "config = dotenv_values(\".env\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "openai.api_key = config[\"OPENAI_API_KEY\"]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Getting our movie data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_path = \"./datasets/movie_plots.csv\"\n",
+    "df = pd.read_csv(dataset_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "movie_plots = movies[\"Plot\"].values"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Generating the embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
+    "import pickle\n",
+    "import tiktoken"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
+    "def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
+    "\n",
+    "    # replace newlines, which can negatively affect performance.\n",
+    "    text = text.replace(\"\\n\", \" \")\n",
+    "\n",
+    "    return openai.Embedding.create(input=text, model=model)[\"data\"][0][\"embedding\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "enc = tiktoken.encoding_for_model(\"text-embedding-ada-002\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_tokens = sum([len(enc.encode(plot)) for plot in movie_plots])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "16751"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "total_tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Estimated cost $0.01\n"
+     ]
+    }
+   ],
+   "source": [
+    "cost = total_tokens * (.0004 / 1000)\n",
+    "print(f\"Estimated cost ${cost:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# establish a cache of embeddings to avoid recomputing\n",
+    "# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file\n",
+    "\n",
+    "# set path to embedding cache\n",
+    "embedding_cache_path = \"./embeddings/movie_embeddings_cache.pkl\"\n",
+    "\n",
+    "# load the cache if it exists, and save a copy to disk\n",
+    "try:\n",
+    "    embedding_cache = pd.read_pickle(embedding_cache_path)\n",
+    "except FileNotFoundError:\n",
+    "    embedding_cache = {}\n",
+    "with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
+    "    pickle.dump(embedding_cache, embedding_cache_file)\n",
+    "\n",
+    "# define a function to retrieve embeddings from the cache if present, and otherwise request via the API\n",
+    "def embedding_from_string(\n",
+    "    string,\n",
+    "    model=\"text-embedding-ada-002\",\n",
+    "    embedding_cache=embedding_cache\n",
+    "):\n",
+    "    \"\"\"Return embedding of given string, using a cache to avoid recomputing.\"\"\"\n",
+    "    if (string, model) not in embedding_cache.keys():\n",
+    "        embedding_cache[(string, model)] = get_embedding(string, model)\n",
+    "        print(f\"GOT EMBEDDING FROM OPENAI FOR {string[:20]}\")\n",
+    "        with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
+    "            pickle.dump(embedding_cache, embedding_cache_file)\n",
+    "    return embedding_cache[(string, model)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam\n",
+      "GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle\n",
+      "GOT EMBEDDING FROM OPENAI FOR A card sharp steps i\n",
+      "GOT EMBEDDING FROM OPENAI FOR Template:Section Edi\n",
+      "GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h\n",
+      "GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r\n",
+      "GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn\n",
+      "GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d\n",
+      "GOT EMBEDDING FROM OPENAI FOR The film centers on \n",
+      "GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi\n",
+      "GOT EMBEDDING FROM OPENAI FOR One year after gradu\n",
+      "GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus\n",
+      "GOT EMBEDDING FROM OPENAI FOR California gubernato\n",
+      "GOT EMBEDDING FROM OPENAI FOR In San Francisco in \n",
+      "GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma\n",
+      "GOT EMBEDDING FROM OPENAI FOR A radical campus gro\n",
+      "GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li\n",
+      "GOT EMBEDDING FROM OPENAI FOR Broadway star Al How\n",
+      "GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd\n",
+      "GOT EMBEDDING FROM OPENAI FOR When Mary Beekman (I\n",
+      "GOT EMBEDDING FROM OPENAI FOR Set somewhere in Vie\n",
+      "GOT EMBEDDING FROM OPENAI FOR At Hampstead Court H\n",
+      "GOT EMBEDDING FROM OPENAI FOR When top Broadway bo\n",
+      "GOT EMBEDDING FROM OPENAI FOR Diamond Jim Brady (E\n",
+      "GOT EMBEDDING FROM OPENAI FOR Lieut. Bill Branniga\n",
+      "GOT EMBEDDING FROM OPENAI FOR Rodeo star John Scot\n",
+      "GOT EMBEDDING FROM OPENAI FOR Paul Madvig (Edward \n",
+      "GOT EMBEDDING FROM OPENAI FOR Luisa Ginglebusher (\n",
+      "GOT EMBEDDING FROM OPENAI FOR In the resort of Lak\n",
+      "GOT EMBEDDING FROM OPENAI FOR John Mason chases af\n",
+      "GOT EMBEDDING FROM OPENAI FOR In the time of Jesus\n",
+      "GOT EMBEDDING FROM OPENAI FOR In New York City, Dr\n",
+      "GOT EMBEDDING FROM OPENAI FOR Don Phelan, the ace \n",
+      "GOT EMBEDDING FROM OPENAI FOR Wealthy and charitab\n",
+      "GOT EMBEDDING FROM OPENAI FOR In Manhattan's lower\n",
+      "GOT EMBEDDING FROM OPENAI FOR In Dublin in 1922, G\n",
+      "GOT EMBEDDING FROM OPENAI FOR Lawrence (Pat O'Brie\n",
+      "GOT EMBEDDING FROM OPENAI FOR Jim Buchanan (Marsha\n",
+      "GOT EMBEDDING FROM OPENAI FOR Kay Bentley (Joan Cr\n",
+      "GOT EMBEDDING FROM OPENAI FOR In London, Stella Pa\n",
+      "GOT EMBEDDING FROM OPENAI FOR Annette Monard Stree\n",
+      "GOT EMBEDDING FROM OPENAI FOR Belle McGill is unaw\n",
+      "GOT EMBEDDING FROM OPENAI FOR A ranch foreman trie\n",
+      "GOT EMBEDDING FROM OPENAI FOR A publisher bets an \n",
+      "GOT EMBEDDING FROM OPENAI FOR A racketeer known as\n",
+      "GOT EMBEDDING FROM OPENAI FOR Dr. Holden (Ralph Be\n",
+      "GOT EMBEDDING FROM OPENAI FOR The life and loves o\n",
+      "GOT EMBEDDING FROM OPENAI FOR Brought up in povert\n",
+      "GOT EMBEDDING FROM OPENAI FOR Before the First Wor\n",
+      "GOT EMBEDDING FROM OPENAI FOR Laura Bayles has bee\n"
+     ]
+    }
+   ],
+   "source": [
+    "# This line actually generates the embeddings\n",
+    "plot_embeddings = [embedding_from_string(plot, model=\"text-embedding-ada-002\") for plot in movie_plots]"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Visualizing our embeddings with Atlas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nomic import atlas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = movies[[\"Title\", \"Genre\"]].to_dict(\"records\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "atlas.map_embeddings(embeddings=np.array(plot_embeddings), data=data)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -1,6 +1,6 @@
 # OpenAI Projects
 
-4 projects using OpenAI APIs with Python.
+5 projects using OpenAI APIs with Python.
 
 ## Setup
 
@@ -92,6 +92,26 @@ A playlist generator for Spotify with OpenAI's GPT-4.
 - adding in OpenAI.
 - accepting command line arguments.
 
+## Movie Recommendation System
+
+An embedding-powered movie recommendation algorithm using Nomic Atlas.
+
+[Check the 05-movie-recommendation-system folder](05-movie-recommendation-system)
+
+<p align="center">
+    <a href="05-movie-recommendation-system">
+        <img src="05-movie-recommendation-system/screenshot.png">
+    </a>
+</p>
+
+### Features
+
+- getting our movie data.
+- getting our movie data ready.
+- generating embeddings for 50 movies.
+- visualizing our embeddings with Atlas.
+- recommending movies using our embeddings.
+
 ## Playground
 
 [Check the playground](playground/) to understand the basics.