|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": 6, |
6 | 6 | "metadata": {},
|
7 | 7 | "outputs": [],
|
8 | 8 | "source": [
|
|
11 | 11 | },
|
12 | 12 | {
|
13 | 13 | "cell_type": "code",
|
14 |
| - "execution_count": 2, |
| 14 | + "execution_count": 7, |
15 | 15 | "metadata": {},
|
16 | 16 | "outputs": [],
|
17 | 17 | "source": [
|
|
21 | 21 | },
|
22 | 22 | {
|
23 | 23 | "cell_type": "code",
|
24 |
| - "execution_count": 3, |
| 24 | + "execution_count": 8, |
25 | 25 | "metadata": {},
|
26 | 26 | "outputs": [],
|
27 | 27 | "source": [
|
|
38 | 38 | },
|
39 | 39 | {
|
40 | 40 | "cell_type": "code",
|
41 |
| - "execution_count": 4, |
| 41 | + "execution_count": 9, |
42 | 42 | "metadata": {},
|
43 | 43 | "outputs": [],
|
44 | 44 | "source": [
|
|
48 | 48 | },
|
49 | 49 | {
|
50 | 50 | "cell_type": "code",
|
51 |
| - "execution_count": 5, |
| 51 | + "execution_count": 10, |
52 | 52 | "metadata": {},
|
53 | 53 | "outputs": [],
|
54 | 54 | "source": [
|
|
58 | 58 | },
|
59 | 59 | {
|
60 | 60 | "cell_type": "code",
|
61 |
| - "execution_count": 6, |
| 61 | + "execution_count": 11, |
62 | 62 | "metadata": {},
|
63 | 63 | "outputs": [],
|
64 | 64 | "source": [
|
|
67 | 67 | },
|
68 | 68 | {
|
69 | 69 | "cell_type": "code",
|
70 |
| - "execution_count": 7, |
| 70 | + "execution_count": 12, |
71 | 71 | "metadata": {},
|
72 | 72 | "outputs": [],
|
73 | 73 | "source": [
|
|
84 | 84 | },
|
85 | 85 | {
|
86 | 86 | "cell_type": "code",
|
87 |
| - "execution_count": 8, |
| 87 | + "execution_count": 14, |
88 | 88 | "metadata": {},
|
89 | 89 | "outputs": [],
|
90 | 90 | "source": [
|
|
95 | 95 | },
|
96 | 96 | {
|
97 | 97 | "cell_type": "code",
|
98 |
| - "execution_count": 9, |
| 98 | + "execution_count": 16, |
99 | 99 | "metadata": {},
|
100 | 100 | "outputs": [],
|
101 | 101 | "source": [
|
|
110 | 110 | },
|
111 | 111 | {
|
112 | 112 | "cell_type": "code",
|
113 |
| - "execution_count": 10, |
| 113 | + "execution_count": 17, |
114 | 114 | "metadata": {},
|
115 | 115 | "outputs": [],
|
116 | 116 | "source": [
|
|
119 | 119 | },
|
120 | 120 | {
|
121 | 121 | "cell_type": "code",
|
122 |
| - "execution_count": 11, |
| 122 | + "execution_count": 18, |
123 | 123 | "metadata": {},
|
124 | 124 | "outputs": [],
|
125 | 125 | "source": [
|
|
128 | 128 | },
|
129 | 129 | {
|
130 | 130 | "cell_type": "code",
|
131 |
| - "execution_count": 12, |
| 131 | + "execution_count": 19, |
132 | 132 | "metadata": {},
|
133 | 133 | "outputs": [
|
134 | 134 | {
|
|
137 | 137 | "16751"
|
138 | 138 | ]
|
139 | 139 | },
|
140 |
| - "execution_count": 12, |
| 140 | + "execution_count": 19, |
141 | 141 | "metadata": {},
|
142 | 142 | "output_type": "execute_result"
|
143 | 143 | }
|
|
148 | 148 | },
|
149 | 149 | {
|
150 | 150 | "cell_type": "code",
|
151 |
| - "execution_count": 13, |
| 151 | + "execution_count": 20, |
152 | 152 | "metadata": {},
|
153 | 153 | "outputs": [
|
154 | 154 | {
|
|
166 | 166 | },
|
167 | 167 | {
|
168 | 168 | "cell_type": "code",
|
169 |
| - "execution_count": 16, |
| 169 | + "execution_count": 27, |
170 | 170 | "metadata": {},
|
171 | 171 | "outputs": [],
|
172 | 172 | "source": [
|
173 | 173 | "# establish a cache of embeddings to avoid recomputing\n",
|
174 | 174 | "# cache is a dict of tuples (text, model) -> embedding, saved as a pickle file\n",
|
175 | 175 | "\n",
|
176 | 176 | "# set path to embedding cache\n",
|
177 |
| - "embedding_cache_path = \"./embeddings/movie_embeddings_cache.pkl\"\n", |
| 177 | + "embedding_cache_path = \"movie_embeddings_cache.pkl\"\n", |
178 | 178 | "\n",
|
179 | 179 | "# load the cache if it exists, and save a copy to disk\n",
|
180 | 180 | "try:\n",
|
|
201 | 201 | },
|
202 | 202 | {
|
203 | 203 | "cell_type": "code",
|
204 |
| - "execution_count": 15, |
| 204 | + "execution_count": 28, |
205 | 205 | "metadata": {},
|
206 | 206 | "outputs": [
|
207 | 207 | {
|
|
300 | 300 | "source": [
|
301 | 301 | "atlas.map_embeddings(embeddings=np.array(plot_embeddings), data=data)"
|
302 | 302 | ]
|
| 303 | + }, |
| 304 | + { |
| 305 | + "attachments": {}, |
| 306 | + "cell_type": "markdown", |
| 307 | + "metadata": {}, |
| 308 | + "source": [ |
| 309 | + "## Recommending movies by plot" |
| 310 | + ] |
| 311 | + }, |
| 312 | + { |
| 313 | + "cell_type": "code", |
| 314 | + "execution_count": 30, |
| 315 | + "metadata": {}, |
| 316 | + "outputs": [], |
| 317 | + "source": [ |
| 318 | + "from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances" |
| 319 | + ] |
| 320 | + }, |
| 321 | + { |
| 322 | + "cell_type": "code", |
| 323 | + "execution_count": 35, |
| 324 | + "metadata": {}, |
| 325 | + "outputs": [], |
| 326 | + "source": [ |
| 327 | + "def print_recommendations_from_strings(\n", |
| 328 | + " strings,\n", |
| 329 | + " index_of_source_string,\n", |
| 330 | + " k_nearest_neighbors=3,\n", |
| 331 | + " model=\"text-embedding-ada-002\"\n", |
| 332 | + "):\n", |
| 333 | + " # get all the embeddings\n", |
| 334 | + " embeddings = [embedding_from_string(string) for string in strings]\n", |
| 335 | + " # get embedding for our specific query string\n", |
| 336 | + " query_embedding = embeddings[index_of_source_string]\n", |
| 337 | + " # get distances between our embedding and all other embeddings\n", |
| 338 | + " distances = distances_from_embeddings(query_embedding, embeddings)\n", |
| 339 | + " # get indices of the nearest neighbors\n", |
| 340 | + " indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)\n", |
| 341 | + " query_string = strings[index_of_source_string]\n", |
| 342 | + " match_count = 0\n", |
| 343 | + " for i in indices_of_nearest_neighbors:\n", |
| 344 | + " # skip the searched movie\n", |
| 345 | + " if query_string == strings[i]:\n", |
| 346 | + " continue\n", |
| 347 | + " if match_count >= k_nearest_neighbors:\n", |
| 348 | + " break\n", |
| 349 | + " match_count += 1\n", |
| 350 | + " print(f\"Found {match_count} closest match: \")\n", |
| 351 | + " print(f\"Distance of: {distances[i]}\")\n", |
| 352 | + " print(strings[i])" |
| 353 | + ] |
| 354 | + }, |
| 355 | + { |
| 356 | + "cell_type": "code", |
| 357 | + "execution_count": 36, |
| 358 | + "metadata": {}, |
| 359 | + "outputs": [ |
| 360 | + { |
| 361 | + "name": "stdout", |
| 362 | + "output_type": "stream", |
| 363 | + "text": [ |
| 364 | + "Found 1 closest match: \n", |
| 365 | + "Distance of: 0.15240804182456968\n", |
| 366 | + "Belle McGill is unaware of husband Jimmy's gambling problem. First he loses $100 at the racetrack and vows never to place another wager. Then he persuades future son-in-law Ben to bet on a sure thing, Leadpipe, but gets a tip on another horse just before the race, bets Ben's money on that instead, then watches Leadpipe win.\n", |
| 367 | + "In danger of losing his business, if not his family, Jimmy delays paying off Ben, who excitedly believes his horse was the winner. Unbenknowst to all, Belle has been making bets of her own. When a horse called Honey Girl comes along, Belle and Jimmy risk everything they have, and they come out winners.\n", |
| 368 | + "Found 2 closest match: \n", |
| 369 | + "Distance of: 0.16723019461008426\n", |
| 370 | + "A racketeer known as \"Sunshine Joe\" specializes in ticket scalping. His gang of colorfully nicknamed thugs includes Liverlips, Sam the Gonoph and Bennie South Street, as well as \"Georgie the Chaser,\" who was dubbed that way because of his penchant for chasing after women.\n", |
| 371 | + "On a train, Georgie happens upon Clarice Van Cleve, an heiress who loves to fall in love, particularly with men in uniform. This has created many a headache for her father, who already has seen Clarice elope three times with military types, each tryst ending badly.\n", |
| 372 | + "Mr. Van Cleve diverts his daughter to a New Jersey health resort, where he introduces her to his friend Mr. Wilmot and handsome son Hector, in the hope that Clarice and Hector will hit it off. Georgie the gigolo still has Clarice's eye, however, pretending to be a combat pilot. But when Clarice turns up and begins acting like a homemaker, driving him crazy, Georgie, learning she's been disinherited by her dad, leaves by claiming he's needed by \"the King\" to fly a mission.\n", |
| 373 | + "Sunshine Joe runs off with money earned from scalped tickets to the Harvard-Yale college football game. It so happens Hector is a member of the Yale team, so all of Joe's goons go to New Haven, Connecticut for the game and place bets. Shocked to find Hector is a benchwarmer, they pull a gun on the coach, demanding Hector be permitted to play. He kicks a field goal to win the game, then ends up, a man in another kind of uniform, in the arms of Clarice.\n", |
| 374 | + "Found 3 closest match: \n", |
| 375 | + "Distance of: 0.17311384937127183\n", |
| 376 | + "A card sharp steps in when a Mexican family's ranch is threatened by swindlers and cheats.\n" |
| 377 | + ] |
| 378 | + } |
| 379 | + ], |
| 380 | + "source": [ |
| 381 | + "print_recommendations_from_strings(movie_plots, 0)" |
| 382 | + ] |
303 | 383 | }
|
304 | 384 | ],
|
305 | 385 | "metadata": {
|
|
0 commit comments