Skip to content

Commit 2076db0

Browse files
committed
docs: add movie recommendation system
1 parent c71f016 commit 2076db0

File tree

8 files changed

+375
-279
lines changed

8 files changed

+375
-279
lines changed
+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Movie Recommendation System
2+
3+
An embedding-powered movie recommendation algorithm using Nomic Atlas.
4+
5+
<p align="center">
6+
<img src="screenshot.png">
7+
</p>
8+
9+
## Setup
10+
11+
You need to create a virtual env and install the packages listed in `requirements.txt`. You can then run Jupyter Notebooks in VS Code.
12+
13+
Follow these steps: [How to Work with Python Virtual Environments, Jupyter Notebooks and VS Code](https://python.plainenglish.io/how-to-work-with-python-virtual-environments-jupyter-notebooks-and-vs-code-536fac3d93a1).
14+
15+
You need to create a `.env` file with your `OPENAI_API_KEY`.
16+
17+
You also need to create an account on [Nomic](https://atlas.nomic.ai/cli-login) and authenticate by running `nomic login` in your terminal, then `nomic login [token]` with the token you receive.
18+
19+
## Features
20+
21+
- getting our movie data.
22+
- getting our movie data ready.
23+
- generating embeddings for 50 movies.
24+
- visualizing our embeddings with Atlas.
25+
- recommending movies using our embeddings.
26+
27+
Based on [Mastering OpenAI Python APIs: Unleash the Power of GPT4](https://www.udemy.com/course/mastering-openai/) by Colt Steele (2023).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import openai"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 2,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"from dotenv import dotenv_values\n",
19+
"config = dotenv_values(\".env\")"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 3,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"openai.api_key = config[\"OPENAI_API_KEY\"]"
29+
]
30+
},
31+
{
32+
"attachments": {},
33+
"cell_type": "markdown",
34+
"metadata": {},
35+
"source": [
36+
"## Plotting movies with Atlas"
37+
]
38+
},
39+
{
40+
"cell_type": "code",
41+
"execution_count": 4,
42+
"metadata": {},
43+
"outputs": [],
44+
"source": [
45+
"import pandas as pd\n",
46+
"import numpy as np"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 5,
52+
"metadata": {},
53+
"outputs": [],
54+
"source": [
55+
"dataset_path = \"./datasets/movie_plots.csv\"\n",
56+
"df = pd.read_csv(dataset_path)"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": 6,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"movies = df[df[\"Origin/Ethnicity\"] == \"American\"].sort_values(\"Release Year\", ascending=False).head(50)"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 7,
71+
"metadata": {},
72+
"outputs": [],
73+
"source": [
74+
"movie_plots = movies[\"Plot\"].values"
75+
]
76+
},
77+
{
78+
"attachments": {},
79+
"cell_type": "markdown",
80+
"metadata": {},
81+
"source": [
82+
"## Generating the embeddings"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": 8,
88+
"metadata": {},
89+
"outputs": [],
90+
"source": [
91+
"from tenacity import retry, wait_random_exponential, stop_after_attempt\n",
92+
"import pickle\n",
93+
"import tiktoken"
94+
]
95+
},
96+
{
97+
"cell_type": "code",
98+
"execution_count": 9,
99+
"metadata": {},
100+
"outputs": [],
101+
"source": [
102+
"@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))\n",
103+
"def get_embedding(text, model=\"text-embedding-ada-002\"):\n",
104+
"\n",
105+
" # replace newlines, which can negatively affect performance.\n",
106+
" text = text.replace(\"\\n\", \" \")\n",
107+
"\n",
108+
" return openai.Embedding.create(input=text, model=model)[\"data\"][0][\"embedding\"]"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 10,
114+
"metadata": {},
115+
"outputs": [],
116+
"source": [
117+
"enc = tiktoken.encoding_for_model(\"text-embedding-ada-002\")"
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": 11,
123+
"metadata": {},
124+
"outputs": [],
125+
"source": [
126+
"total_tokens = sum([len(enc.encode(plot)) for plot in movie_plots])"
127+
]
128+
},
129+
{
130+
"cell_type": "code",
131+
"execution_count": 12,
132+
"metadata": {},
133+
"outputs": [
134+
{
135+
"data": {
136+
"text/plain": [
137+
"16751"
138+
]
139+
},
140+
"execution_count": 12,
141+
"metadata": {},
142+
"output_type": "execute_result"
143+
}
144+
],
145+
"source": [
146+
"total_tokens"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 13,
152+
"metadata": {},
153+
"outputs": [
154+
{
155+
"name": "stdout",
156+
"output_type": "stream",
157+
"text": [
158+
"Estimated cost $0.01\n"
159+
]
160+
}
161+
],
162+
"source": [
163+
"cost = total_tokens * (.0004 / 1000)\n",
164+
"print(f\"Estimated cost ${cost:.2f}\")"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 16,
170+
"metadata": {},
171+
"outputs": [],
172+
"source": [
173+
"# establish a cache of embeddings to avoid recomputing\n",
174+
"# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file\n",
175+
"\n",
176+
"# set path to embedding cache\n",
177+
"embedding_cache_path = \"./embeddings/movie_embeddings_cache.pkl\"\n",
178+
"\n",
179+
"# load the cache if it exists, and save a copy to disk\n",
180+
"try:\n",
181+
" embedding_cache = pd.read_pickle(embedding_cache_path)\n",
182+
"except FileNotFoundError:\n",
183+
" embedding_cache = {}\n",
184+
"with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
185+
" pickle.dump(embedding_cache, embedding_cache_file)\n",
186+
"\n",
187+
"# define a function to retrieve embeddings from the cache if present, and otherwise request via the API\n",
188+
"def embedding_from_string(\n",
189+
" string,\n",
190+
" model=\"text-embedding-ada-002\",\n",
191+
" embedding_cache=embedding_cache\n",
192+
"):\n",
193+
" \"\"\"Return embedding of given string, using a cache to avoid recomputing.\"\"\"\n",
194+
" if (string, model) not in embedding_cache.keys():\n",
195+
" embedding_cache[(string, model)] = get_embedding(string, model)\n",
196+
" print(f\"GOT EMBEDDING FROM OPENAI FOR {string[:20]}\")\n",
197+
" with open(embedding_cache_path, \"wb\") as embedding_cache_file:\n",
198+
" pickle.dump(embedding_cache, embedding_cache_file)\n",
199+
" return embedding_cache[(string, model)]"
200+
]
201+
},
202+
{
203+
"cell_type": "code",
204+
"execution_count": 15,
205+
"metadata": {},
206+
"outputs": [
207+
{
208+
"name": "stdout",
209+
"output_type": "stream",
210+
"text": [
211+
"GOT EMBEDDING FROM OPENAI FOR Meek clerk Elmer Lam\n",
212+
"GOT EMBEDDING FROM OPENAI FOR Nick and Nora Charle\n",
213+
"GOT EMBEDDING FROM OPENAI FOR A card sharp steps i\n",
214+
"GOT EMBEDDING FROM OPENAI FOR Template:Section Edi\n",
215+
"GOT EMBEDDING FROM OPENAI FOR Taft, a policeman, h\n",
216+
"GOT EMBEDDING FROM OPENAI FOR Geoffrey Sherwood, r\n",
217+
"GOT EMBEDDING FROM OPENAI FOR Stenographer Marilyn\n",
218+
"GOT EMBEDDING FROM OPENAI FOR Kay Parrish is the d\n",
219+
"GOT EMBEDDING FROM OPENAI FOR The film centers on \n",
220+
"GOT EMBEDDING FROM OPENAI FOR Secretary Mirabel Mi\n",
221+
"GOT EMBEDDING FROM OPENAI FOR One year after gradu\n",
222+
"GOT EMBEDDING FROM OPENAI FOR Ellen Garfield refus\n",
223+
"GOT EMBEDDING FROM OPENAI FOR California gubernato\n",
224+
"GOT EMBEDDING FROM OPENAI FOR In San Francisco in \n",
225+
"GOT EMBEDDING FROM OPENAI FOR Freckles, a young ma\n",
226+
"GOT EMBEDDING FROM OPENAI FOR A radical campus gro\n",
227+
"GOT EMBEDDING FROM OPENAI FOR A suicidal woman, Li\n",
228+
"GOT EMBEDDING FROM OPENAI FOR Broadway star Al How\n",
229+
"GOT EMBEDDING FROM OPENAI FOR In 1925 London, midd\n",
230+
"GOT EMBEDDING FROM OPENAI FOR When Mary Beekman (I\n",
231+
"GOT EMBEDDING FROM OPENAI FOR Set somewhere in Vie\n",
232+
"GOT EMBEDDING FROM OPENAI FOR At Hampstead Court H\n",
233+
"GOT EMBEDDING FROM OPENAI FOR When top Broadway bo\n",
234+
"GOT EMBEDDING FROM OPENAI FOR Diamond Jim Brady (E\n",
235+
"GOT EMBEDDING FROM OPENAI FOR Lieut. Bill Branniga\n",
236+
"GOT EMBEDDING FROM OPENAI FOR Rodeo star John Scot\n",
237+
"GOT EMBEDDING FROM OPENAI FOR Paul Madvig (Edward \n",
238+
"GOT EMBEDDING FROM OPENAI FOR Luisa Ginglebusher (\n",
239+
"GOT EMBEDDING FROM OPENAI FOR In the resort of Lak\n",
240+
"GOT EMBEDDING FROM OPENAI FOR John Mason chases af\n",
241+
"GOT EMBEDDING FROM OPENAI FOR In the time of Jesus\n",
242+
"GOT EMBEDDING FROM OPENAI FOR In New York City, Dr\n",
243+
"GOT EMBEDDING FROM OPENAI FOR Don Phelan, the ace \n",
244+
"GOT EMBEDDING FROM OPENAI FOR Wealthy and charitab\n",
245+
"GOT EMBEDDING FROM OPENAI FOR In Manhattan's lower\n",
246+
"GOT EMBEDDING FROM OPENAI FOR In Dublin in 1922, G\n",
247+
"GOT EMBEDDING FROM OPENAI FOR Lawrence (Pat O'Brie\n",
248+
"GOT EMBEDDING FROM OPENAI FOR Jim Buchanan (Marsha\n",
249+
"GOT EMBEDDING FROM OPENAI FOR Kay Bentley (Joan Cr\n",
250+
"GOT EMBEDDING FROM OPENAI FOR In London, Stella Pa\n",
251+
"GOT EMBEDDING FROM OPENAI FOR Annette Monard Stree\n",
252+
"GOT EMBEDDING FROM OPENAI FOR Belle McGill is unaw\n",
253+
"GOT EMBEDDING FROM OPENAI FOR A ranch foreman trie\n",
254+
"GOT EMBEDDING FROM OPENAI FOR A publisher bets an \n",
255+
"GOT EMBEDDING FROM OPENAI FOR A racketeer known as\n",
256+
"GOT EMBEDDING FROM OPENAI FOR Dr. Holden (Ralph Be\n",
257+
"GOT EMBEDDING FROM OPENAI FOR The life and loves o\n",
258+
"GOT EMBEDDING FROM OPENAI FOR Brought up in povert\n",
259+
"GOT EMBEDDING FROM OPENAI FOR Before the First Wor\n",
260+
"GOT EMBEDDING FROM OPENAI FOR Laura Bayles has bee\n"
261+
]
262+
}
263+
],
264+
"source": [
265+
"# This line actually generates the embeddings\n",
266+
"plot_embeddings = [embedding_from_string(plot, model=\"text-embedding-ada-002\") for plot in movie_plots]"
267+
]
268+
},
269+
{
270+
"attachments": {},
271+
"cell_type": "markdown",
272+
"metadata": {},
273+
"source": [
274+
"## Visualizing our embeddings with Atlas"
275+
]
276+
},
277+
{
278+
"cell_type": "code",
279+
"execution_count": 18,
280+
"metadata": {},
281+
"outputs": [],
282+
"source": [
283+
"from nomic import atlas"
284+
]
285+
},
286+
{
287+
"cell_type": "code",
288+
"execution_count": 21,
289+
"metadata": {},
290+
"outputs": [],
291+
"source": [
292+
"data = movies[[\"Title\", \"Genre\"]].to_dict(\"records\")"
293+
]
294+
},
295+
{
296+
"cell_type": "code",
297+
"execution_count": null,
298+
"metadata": {},
299+
"outputs": [],
300+
"source": [
301+
"atlas.map_embeddings(embeddings=np.array(plot_embeddings), data=data)"
302+
]
303+
}
304+
],
305+
"metadata": {
306+
"kernelspec": {
307+
"display_name": ".venv",
308+
"language": "python",
309+
"name": "python3"
310+
},
311+
"language_info": {
312+
"codemirror_mode": {
313+
"name": "ipython",
314+
"version": 3
315+
},
316+
"file_extension": ".py",
317+
"mimetype": "text/x-python",
318+
"name": "python",
319+
"nbconvert_exporter": "python",
320+
"pygments_lexer": "ipython3",
321+
"version": "3.10.5"
322+
},
323+
"orig_nbformat": 4
324+
},
325+
"nbformat": 4,
326+
"nbformat_minor": 2
327+
}

README.md

+21-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# OpenAI Projects
22

3-
4 projects using OpenAI APIs with Python.
3+
5 projects using OpenAI APIs with Python.
44

55
## Setup
66

@@ -92,6 +92,26 @@ A playlist generator for Spotify with OpenAI's GPT-4.
9292
- adding in OpenAI.
9393
- accepting command line arguments.
9494

95+
## Movie Recommendation System
96+
97+
An embedding-powered movie recommendation algorithm using Nomic Atlas.
98+
99+
[Check the 05-movie-recommendation-system folder](05-movie-recommendation-system)
100+
101+
<p align="center">
102+
<a href="05-movie-recommendation-system">
103+
<img src="05-movie-recommendation-system/screenshot.png">
104+
</a>
105+
</p>
106+
107+
### Features
108+
109+
- getting our movie data.
110+
- getting our movie data ready.
111+
- generating embeddings for 50 movies.
112+
- visualizing our embeddings with Atlas.
113+
- recommending movies using our embeddings.
114+
95115
## Playground
96116

97117
[Check the playground](playground/) to understand the basics.

0 commit comments

Comments
 (0)