diff --git a/examples/product-recommender/lancedb_cloud/README.md b/examples/product-recommender/lancedb_cloud/README.md deleted file mode 100644 index 368725cc..00000000 --- a/examples/product-recommender/lancedb_cloud/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Product Recommender using Collaborative Filtering and LanceDB - -Use LanceDB and collaborative filtering to recommend products based on a user's past buying history. We used the Instacart dataset as our data for this example. -Colab walkthrough - Open In Colab - -### Get dataset -To run this example, please download the dataset from our s3 bucket: http://vectordb-recipes.s3.us-west-2.amazonaws.com/product-recommender.zip -!!!This example needs to be run on GPU otherwise it will be very slow. -It covers how to create a LanceDB table remotely, how to create an index on the vector column to accelerate search, followed by search on the remote table where results are saved as a pandas Dataframe. - -``` -wget http://vectordb-recipes.s3.us-west-2.amazonaws.com/product-recommender.zip -unzip product-recommender.zip -cp product-recommender/*.zip . -rm -fr product-recommender -``` - -### Set credentials -if you would like to set api key through an environment variable: -``` -export LANCEDB_API_KEY="sk_..." -``` - -replace the following lines in main.py with your project slug and api key" -``` -db_url = "db://your-project-name" - api_key="sk_..." -``` - -Run the script -```python -python main.py -``` - -| Argument | Default Value | Description | -|---|---|---| -| factors | 100 | dimension of latent factor vectors | -| regularization | 0.05 | strength of penalty term | -| iterations | 50 | number of iterations to update | -| num-threads | 1 | amount of parallelization | -| num-partitions | 256 | number of partitions of the index | -| num-sub-vectors | 16 | number of sub-vectors (M) that will be created during Product Quantization (PQ) | diff --git a/examples/product-recommender/lancedb_cloud/main.ipynb b/examples/product-recommender/lancedb_cloud/main.ipynb deleted file mode 100644 index 822ef0da..00000000 --- a/examples/product-recommender/lancedb_cloud/main.ipynb +++ /dev/null @@ -1,3333 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "YmdWGrw4t5G2" - }, - "source": [ - "# Product Recommender using Collaborative Filtering and LanceDB\n", - "\n", - "We are going to use **LanceDB** and **Collaborative Filtering** to recommend products based on a user's past buying history. We used the **Instacart dataset** as our data for this example.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Credentials\n", - "\n", - "Copy and paste the project name and the api key from your project page.\n", - "These will be used later to [connect to LanceDB Cloud](#scroll-to=5q8m6GMD7sGu)" - ], - "metadata": { - "id": "sCtHNvkbzSot" - } - }, - { - "cell_type": "code", - "source": [ - "project_slug = \"your-project-slug\" # @param {type:\"string\"}" - ], - "metadata": { - "id": "zpPM2T8zzZkw" - }, - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "api_key = \"sk_...\" # @param {type:\"string\"}" - ], - "metadata": { - "id": "xgCqtc99zwUQ" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "You can also set the LANCEDB_API_KEY as an environment variable with one of the options below" - ], - "metadata": { - "id": "eEITDnEczz7G" - } - }, - { - "cell_type": "code", - "source": [ - "!export LANCEDB_API_KEY=\"sk_...\"" - ], - "metadata": { - "id": "Md5kS8s7z0-j" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import os\n", - "import getpass\n", - "os.environ[\"LANCEDB_API_KEY\"] = getpass.getpass(\"Enter Your LANCEDB API Key:\")" - ], - "metadata": { - "id": "d7gq19Wez3JZ" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9-fnXVuO8XQ0" - }, - "source": [ - "## Get dataset\n", - "Download and unzip the dataset from LanceDB s3 bucket." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "3jXSVspr7sGe", - "vscode": { - "languageId": "shellscript" - }, - "outputId": "4c09916d-85de-46d6-9c16-ed6746ac4e19", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "--2024-01-23 03:30:37-- http://vectordb-recipes.s3.us-west-2.amazonaws.com/product-recommender.zip\n", - "Resolving vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)... 3.5.84.12, 3.5.84.155, 3.5.84.131, ...\n", - "Connecting to vectordb-recipes.s3.us-west-2.amazonaws.com (vectordb-recipes.s3.us-west-2.amazonaws.com)|3.5.84.12|:80... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: 411510857 (392M) [application/zip]\n", - "Saving to: ‘product-recommender.zip’\n", - "\n", - "product-recommender 100%[===================>] 392.45M 22.5MB/s in 19s \n", - "\n", - "2024-01-23 03:30:56 (20.8 MB/s) - ‘product-recommender.zip’ saved [411510857/411510857]\n", - "\n", - "Archive: product-recommender.zip\n", - " creating: product-recommender/\n", - " inflating: __MACOSX/._product-recommender \n", - " inflating: product-recommender/order_products__prior.csv.zip \n", - " inflating: __MACOSX/product-recommender/._order_products__prior.csv.zip \n", - " inflating: product-recommender/order_products__train.csv.zip \n", - " inflating: __MACOSX/product-recommender/._order_products__train.csv.zip \n", - " inflating: product-recommender/orders.csv.zip \n", - " inflating: __MACOSX/product-recommender/._orders.csv.zip \n", - " inflating: product-recommender/products.csv.zip \n", - " inflating: __MACOSX/product-recommender/._products.csv.zip \n", - " inflating: product-recommender/instacart-market-basket-analysis.zip \n", - " inflating: __MACOSX/product-recommender/._instacart-market-basket-analysis.zip \n" - ] - } - ], - "source": [ - "!wget http://vectordb-recipes.s3.us-west-2.amazonaws.com/product-recommender.zip\n", - "!unzip product-recommender.zip\n", - "!cp product-recommender/*.zip .\n", - "!rm -fr product-recommender" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xVLHZB8BzJQG" - }, - "source": [ - "Install dependencies:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "R3_Hq2VC4_zT", - "outputId": "fc920fc5-ac48-48e6-a2b2-0f84d4436ef7" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (1.23.5)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (1.11.4)\n", - "Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.16)\n", - "Collecting implicit\n", - " Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.9/8.9 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.1.0+cu121)\n", - "Collecting lancedb\n", - " Downloading lancedb-0.5.0-py3-none-any.whl (87 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.4/87.4 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2023.3.post1)\n", - "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2023.11.17)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.31.0)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.66.1)\n", - "Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.1)\n", - "Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.0.7)\n", - "Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from kaggle) (6.1.0)\n", - "Requirement already satisfied: threadpoolctl in /usr/local/lib/python3.10/dist-packages (from implicit) (3.2.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch) (3.13.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch) (4.5.0)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.12)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.2.1)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.3)\n", - "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n", - "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.1.0)\n", - "Collecting deprecation (from lancedb)\n", - " Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)\n", - "Collecting pylance==0.9.6 (from lancedb)\n", - " Downloading pylance-0.9.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.6/18.6 MB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting ratelimiter~=1.0 (from lancedb)\n", - " Downloading ratelimiter-1.2.0.post0-py3-none-any.whl (6.6 kB)\n", - "Collecting retry>=0.9.2 (from lancedb)\n", - " Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)\n", - "Requirement already satisfied: pydantic>=1.10 in /usr/local/lib/python3.10/dist-packages (from lancedb) (1.10.13)\n", - "Requirement already satisfied: attrs>=21.3.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (23.2.0)\n", - "Collecting semver>=3.0 (from lancedb)\n", - " Downloading semver-3.0.2-py3-none-any.whl (17 kB)\n", - "Requirement already satisfied: cachetools in /usr/local/lib/python3.10/dist-packages (from lancedb) (5.3.2)\n", - "Requirement already satisfied: pyyaml>=6.0 in /usr/local/lib/python3.10/dist-packages (from lancedb) (6.0.1)\n", - "Requirement already satisfied: click>=8.1.7 in /usr/local/lib/python3.10/dist-packages (from lancedb) (8.1.7)\n", - "Collecting overrides>=0.7 (from lancedb)\n", - " Downloading overrides-7.6.0-py3-none-any.whl (17 kB)\n", - "Collecting pyarrow>=12 (from pylance==0.9.6->lancedb)\n", - " Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.3/38.3 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.6)\n", - "Requirement already satisfied: decorator>=3.4.2 in /usr/local/lib/python3.10/dist-packages (from retry>=0.9.2->lancedb) (4.4.2)\n", - "Collecting py<2.0.0,>=1.4.26 (from retry>=0.9.2->lancedb)\n", - " Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 kB\u001b[0m \u001b[31m13.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: webencodings in /usr/local/lib/python3.10/dist-packages (from bleach->kaggle) (0.5.1)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from deprecation->lancedb) (23.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.3)\n", - "Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n", - "Installing collected packages: ratelimiter, semver, pyarrow, py, overrides, deprecation, retry, pylance, implicit, lancedb\n", - " Attempting uninstall: pyarrow\n", - " Found existing installation: pyarrow 10.0.1\n", - " Uninstalling pyarrow-10.0.1:\n", - " Successfully uninstalled pyarrow-10.0.1\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "ibis-framework 7.1.0 requires pyarrow<15,>=2, but you have pyarrow 15.0.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed deprecation-2.1.0 implicit-0.7.2 lancedb-0.5.0 overrides-7.6.0 py-1.11.0 pyarrow-15.0.0 pylance-0.9.6 ratelimiter-1.2.0.post0 retry-0.9.2 semver-3.0.2\n" - ] - } - ], - "source": [ - "!pip install numpy pandas scipy kaggle implicit torch lancedb" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i_eatRhaIGIz" - }, - "source": [ - "First, let's import all the required modules for this example." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "emp_MSXZt5G8" - }, - "outputs": [], - "source": [ - "import zipfile\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.sparse\n", - "import torch\n", - "import implicit\n", - "from implicit import evaluation\n", - "import pydantic\n", - "import lancedb\n", - "from lancedb.pydantic import pydantic_to_schema, vector" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "K4Q4cOX-4_zY" - }, - "source": [ - "We must now extract the zip files." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "f3g296nL4_zZ" - }, - "outputs": [], - "source": [ - "files = [\n", - " 'instacart-market-basket-analysis.zip',\n", - " 'order_products__train.csv.zip',\n", - " 'order_products__prior.csv.zip',\n", - " 'products.csv.zip',\n", - " 'orders.csv.zip'\n", - "]\n", - "\n", - "for filename in files:\n", - " with zipfile.ZipFile(filename, 'r') as zip_ref:\n", - " zip_ref.extractall('./')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oLgkRIfq4_zZ" - }, - "source": [ - "Now we can move on to loading the dataset. We'll first read the csv files and create dataframes." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "cBbbR7Rut5G_" - }, - "outputs": [], - "source": [ - "products = pd.read_csv('products.csv')\n", - "orders = pd.read_csv('orders.csv')\n", - "order_products = pd.concat([pd.read_csv('order_products__train.csv'), pd.read_csv('order_products__prior.csv')])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5FV_GGjst5HA" - }, - "source": [ - "Since there isn't a user rating attribute, we'll gather \"confidence\" data by looking at the frequency of each item purchased by a user, and store this in the `data` dataframe." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "id": "ZjRh7RYpt5HB" - }, - "outputs": [], - "source": [ - "customer_order_products = pd.merge(orders, order_products, how='inner',on='order_id')\n", - "\n", - "# create confidence table\n", - "data = customer_order_products.groupby(['user_id', 'product_id'])[['order_id']].count().reset_index()\n", - "data.columns=[\"user_id\", \"product_id\", \"total_orders\"]\n", - "data.product_id = data.product_id.astype('int64')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "77lvwm0St5HC" - }, - "source": [ - "Let's create a couple of test users to examine the recommendations later:\n", - "- 1st test user: buys 50 sodas: **Zero Calorie Cola**\n", - "- 2nd test user: buys organic produce: **Organic Whole Milk** and **Organic Blackberries**" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 224 - }, - "id": "A06EfAf-t5HC", - "outputId": "af9c06f5-1cbd-4ee1-9876-c62591fe95bd" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "13863749\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " user_id product_id total_orders\n", - "13863744 206209 48697 1\n", - "13863745 206209 48742 2\n", - "13863746 206210 46149 50\n", - "13863747 206211 27845 49\n", - "13863748 206211 26604 32" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idproduct_idtotal_orders
13863744206209486971
13863745206209487422
138637462062104614950
138637472062112784549
138637482062112660432
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "data_new = pd.DataFrame([[data.user_id.max() + 1, 46149, 50],\n", - " [data.user_id.max() + 2, 27845, 49],\n", - " [data.user_id.max() + 2, 26604, 32]\n", - " ], columns=['user_id', 'product_id', 'total_orders'])\n", - "data = pd.concat([data, data_new]).reset_index(drop = True)\n", - "data.tail()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xBC-8PFTt5HD" - }, - "source": [ - "In the next step, we will extract user and product unique ids, in order to create a CSR (Compressed Sparse Row) matrix. This will allow us to perform collaborative filtering.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "id": "v2_2R7zmt5HE" - }, - "outputs": [], - "source": [ - "# extract unique user and product ids\n", - "unique_users = list(np.sort(data.user_id.unique()))\n", - "unique_products = list(np.sort(products.product_id.unique()))\n", - "purchases = list(data.total_orders)\n", - "\n", - "# create zero-based index position <-> user/item ID mappings\n", - "index_to_user = pd.Series(unique_users)\n", - "\n", - "# create reverse mappings from user/item ID to index positions\n", - "user_to_index = pd.Series(data=index_to_user.index + 1, index=index_to_user.values)\n", - "\n", - "# create row and column for user and product ids\n", - "users_rows = data.user_id.astype(int)\n", - "products_cols = data.product_id.astype(int)\n", - "\n", - "# create CSR matrix\n", - "matrix = scipy.sparse.csr_matrix((purchases, (users_rows, products_cols)), shape=(len(unique_users) + 1, len(unique_products) + 1))\n", - "matrix.data = np.nan_to_num(matrix.data, copy=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "II6wOH96t5HF" - }, - "source": [ - "Let's now create a recommender model using the **implicit** library. The recommendation model is based off the algorithms described in the paper [Collaborative Filtering for Implicit Feedback Datasets](https://www.researchgate.net/publication/220765111_Collaborative_Filtering_for_Implicit_Feedback_Datasets) with performance optimizations described in [Applications of the Conjugate Gradient Method for Implicit Feedback Collaborative Filtering](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.379.6473&rep=rep1&type=pdf).\n", - "\n", - "Note: this step will take about 17 minutes with the current parameter setup." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 105, - "referenced_widgets": [ - "2c0101b0a3574a14b2a37fc431eb2908", - "31c3c90fa42f489796fba11d57799089", - "e13993dda2da40ff806d6e31a6e987d3", - "0bff70b647f3404fa15690ec9f3d0c78", - "674cf2d29d044cada59480813e0e8e58", - "bfd4ff099ed14ab1bd79233beea7f402", - "000f9e8fd1db4bc0a7aceeb822ca2b2e", - "75b270d981de425ba1fd9a790b2a68ff", - "baafe1d810594384af1a5ffa4f2f5cb4", - "bf95fd811f79425bb2248525aeab7da0", - "46fb5083adf24ce4ae3fd4ea9aa4772e" - ] - }, - "id": "k0GW99kxt5HF", - "outputId": "d3e22ae9-ff96-4d89-f0aa-c3b5cd47d354" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "/usr/local/lib/python3.10/dist-packages/implicit/cpu/als.py:95: RuntimeWarning: OpenBLAS is configured to use 2 threads. It is highly recommended to disable its internal threadpool by setting the environment variable 'OPENBLAS_NUM_THREADS=1' or by calling 'threadpoolctl.threadpool_limits(1, \"blas\")'. Having OpenBLAS use a threadpool can lead to severe performance issues here.\n", - " check_blas_config()\n" - ] - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - " 0%| | 0/50 [00:00\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idproduct_idtotal_orders
0119611
111025810
21103261
311242710
41130324
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - " \n" - ] - }, - "metadata": {}, - "execution_count": 21 - } - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "ufHsF0o4t5HI" - }, - "outputs": [], - "source": [ - "class ProductModel(pydantic.BaseModel):\n", - " product_id: int\n", - " product_name: str\n", - " vector: vector(128)\n", - "schema = pydantic_to_schema(ProductModel)\n", - "table_name = 'product_recommender'\n", - "db.drop_table(table_name)\n", - "try:\n", - " tbl = db.create_table(table_name, schema=schema)\n", - "except:\n", - " tbl = db.open_table(table_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0-2K-g4-t5HJ" - }, - "source": [ - "Let's now store our item factors into the table via the vector column of `product_entries`." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "NOOPF9zOt5HJ" - }, - "outputs": [], - "source": [ - "# Transform items into factors\n", - "items_factors = model.item_factors\n", - "product_entries = products[['product_id', 'product_name']].drop_duplicates()\n", - "product_entries['product_id'] = product_entries.product_id.astype('int64')\n", - "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "item_embeddings = items_factors[1:].tolist()\n", - "product_entries['vector'] = item_embeddings\n", - "\n", - "tbl.add(product_entries)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j3aU4z-tSbWE" - }, - "source": [ - "## Let's create an ANN index in order to speed up retrieval. This might take a while." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "id": "H8HyvjCFSeaz", - "outputId": "27519f2a-e95a-4442-97b1-291931180ca8", - "colab": { - "base_uri": "https://localhost:8080/" - } - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{}" - ] - }, - "metadata": {}, - "execution_count": 24 - } - ], - "source": [ - "tbl.create_index(vector_column_name=\"vector\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ibNMrxyRt5HK" - }, - "source": [ - "This is a helper method for analysing recommendations later.\n", - "This method returns top N products that someone bought in the past (based on product quantity)." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "id": "Uzgk5Od0t5HK" - }, - "outputs": [], - "source": [ - "def products_bought_by_user_in_the_past(user_id: int, top: int = 10):\n", - "\n", - " selected = data[data.user_id == user_id].sort_values(by=['total_orders'], ascending=False)\n", - "\n", - " selected['product_name'] = selected['product_id'].map(product_entries.set_index('product_id')['product_name'])\n", - " selected = selected[['product_id', 'product_name', 'total_orders']].reset_index(drop=True)\n", - " if selected.shape[0] < top:\n", - " return selected\n", - "\n", - " return selected[:top]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ULyVnHEXt5HK" - }, - "source": [ - "Let's retrieve our test users so we can query for recommendations." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "id": "Wwl7yFKTt5HK" - }, - "outputs": [], - "source": [ - "test_user_ids = [206210, 206211]\n", - "test_user_factors = model.user_factors[user_to_index[test_user_ids]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wTh61ou3t5HL" - }, - "source": [ - "## Let's now query LanceDB to retrieve recommendations." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 868 - }, - "id": "UiZg4Iset5HL", - "outputId": "edc08e77-c03f-4ded-fd1d-3fd9d8a91376" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - " product_id product_name \\\n", - "0 196 Soda \n", - "1 46149 Zero Calorie Cola \n", - "2 40939 Drinking Water \n", - "3 37710 Trail Mix \n", - "4 22802 Mineral Water \n", - "5 41400 Crunchy Oats 'n Honey Granola Bars \n", - "6 46061 Popcorn \n", - "7 31651 Extra Fancy Unsalted Mixed Nuts \n", - "8 5258 Sparkling Water \n", - "9 38928 0% Greek Strained Yogurt \n", - "\n", - " vector _distance \n", - "0 [-0.0030924827, -0.0042996905, -0.01350651, -0... 35.096085 \n", - "1 [0.0015008126, -0.014029495, -0.015295635, 0.0... 35.392975 \n", - "2 [0.0018837166, -0.018152414, -0.015649604, 0.0... 35.864483 \n", - "3 [-0.0011668581, -0.0025222106, -0.016717039, -... 35.896873 \n", - "4 [-0.010115783, -0.017115017, -0.011403508, 0.0... 36.035912 \n", - "5 [0.0040870784, -0.0009994006, -0.018302424, -0... 36.042686 \n", - "6 [0.0036969625, -0.013887798, -0.002804261, -0.... 36.043732 \n", - "7 [0.014438897, -0.005578243, -0.0055169673, -0.... 36.117802 \n", - "8 [-0.022658644, -0.026015628, -0.0083606485, -0... 36.131721 \n", - "9 [0.0018425643, -0.011489441, -0.0052835834, 0.... 36.139870 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idproduct_namevector_distance
0196Soda[-0.0030924827, -0.0042996905, -0.01350651, -0...35.096085
146149Zero Calorie Cola[0.0015008126, -0.014029495, -0.015295635, 0.0...35.392975
240939Drinking Water[0.0018837166, -0.018152414, -0.015649604, 0.0...35.864483
337710Trail Mix[-0.0011668581, -0.0025222106, -0.016717039, -...35.896873
422802Mineral Water[-0.010115783, -0.017115017, -0.011403508, 0.0...36.035912
541400Crunchy Oats 'n Honey Granola Bars[0.0040870784, -0.0009994006, -0.018302424, -0...36.042686
646061Popcorn[0.0036969625, -0.013887798, -0.002804261, -0....36.043732
731651Extra Fancy Unsalted Mixed Nuts[0.014438897, -0.005578243, -0.0055169673, -0....36.117802
85258Sparkling Water[-0.022658644, -0.026015628, -0.0083606485, -0...36.131721
9389280% Greek Strained Yogurt[0.0018425643, -0.011489441, -0.0052835834, 0....36.139870
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - " product_id product_name total_orders\n", - "0 46149 Zero Calorie Cola 50" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idproduct_nametotal_orders
046149Zero Calorie Cola50
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "
\n", - "
\n" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - " product_id product_name \\\n", - "0 26604 Organic Blackberries \n", - "1 27845 Organic Whole Milk \n", - "2 27966 Organic Raspberries \n", - "3 43352 Raspberries \n", - "4 9076 Blueberries \n", - "5 21288 Blackberries \n", - "6 39275 Organic Blueberries \n", - "7 39928 Organic Kiwi \n", - "8 11777 Red Raspberries \n", - "9 21137 Organic Strawberries \n", - "\n", - " vector _distance \n", - "0 [-0.017585486, 0.019628799, 0.0399348, 0.01422... 17.404045 \n", - "1 [-0.050286394, 0.026924692, 0.030701049, -0.02... 17.404305 \n", - "2 [-0.006732653, 0.015266006, 0.018316658, -0.00... 17.867121 \n", - "3 [0.0037516877, 0.013682851, 0.057814274, 0.031... 18.030893 \n", - "4 [0.0029817792, 0.030459687, 0.04528497, 0.0113... 18.135754 \n", - "5 [-0.011553102, -0.010046569, 0.037375, 0.02368... 18.141661 \n", - "6 [0.010543987, 0.006028164, 0.011502461, 0.0004... 18.241520 \n", - "7 [-0.044292357, -0.031322725, -0.00174381, -0.0... 18.414057 \n", - "8 [-0.0067819585, -0.023531102, 0.010277328, -0.... 18.468819 \n", - "9 [0.007023127, 0.0037457773, -0.0061378656, -0.... 18.476973 " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idproduct_namevector_distance
026604Organic Blackberries[-0.017585486, 0.019628799, 0.0399348, 0.01422...17.404045
127845Organic Whole Milk[-0.050286394, 0.026924692, 0.030701049, -0.02...17.404305
227966Organic Raspberries[-0.006732653, 0.015266006, 0.018316658, -0.00...17.867121
343352Raspberries[0.0037516877, 0.013682851, 0.057814274, 0.031...18.030893
49076Blueberries[0.0029817792, 0.030459687, 0.04528497, 0.0113...18.135754
521288Blackberries[-0.011553102, -0.010046569, 0.037375, 0.02368...18.141661
639275Organic Blueberries[0.010543987, 0.006028164, 0.011502461, 0.0004...18.241520
739928Organic Kiwi[-0.044292357, -0.031322725, -0.00174381, -0.0...18.414057
811777Red Raspberries[-0.0067819585, -0.023531102, 0.010277328, -0....18.468819
921137Organic Strawberries[0.007023127, 0.0037457773, -0.0061378656, -0....18.476973
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {} - }, - { - "output_type": "display_data", - "data": { - "text/plain": [ - " product_id product_name total_orders\n", - "0 27845 Organic Whole Milk 49\n", - "1 26604 Organic Blackberries 32" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
product_idproduct_nametotal_orders
027845Organic Whole Milk49
126604Organic Blackberries32
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {} - } - ], - "source": [ - "# Query by user factors\n", - "test_user_embeddings = test_user_factors.tolist()\n", - "for embedding, id in zip(test_user_embeddings, test_user_ids):\n", - " results = tbl.search(embedding).limit(10).to_pandas()\n", - " display(results)\n", - " display(products_bought_by_user_in_the_past(id, top=15))" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - }, - "vscode": { - "interpreter": { - "hash": "5fe10bf018ef3e697f9035d60bf60847932a12bface18908407fd371fe880db9" - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "2c0101b0a3574a14b2a37fc431eb2908": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_31c3c90fa42f489796fba11d57799089", - "IPY_MODEL_e13993dda2da40ff806d6e31a6e987d3", - "IPY_MODEL_0bff70b647f3404fa15690ec9f3d0c78" - ], - "layout": "IPY_MODEL_674cf2d29d044cada59480813e0e8e58" - } - }, - "31c3c90fa42f489796fba11d57799089": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bfd4ff099ed14ab1bd79233beea7f402", - "placeholder": "​", - "style": "IPY_MODEL_000f9e8fd1db4bc0a7aceeb822ca2b2e", - "value": "100%" - } - }, - "e13993dda2da40ff806d6e31a6e987d3": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_75b270d981de425ba1fd9a790b2a68ff", - "max": 50, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_baafe1d810594384af1a5ffa4f2f5cb4", - "value": 50 - } - }, - "0bff70b647f3404fa15690ec9f3d0c78": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bf95fd811f79425bb2248525aeab7da0", - "placeholder": "​", - "style": "IPY_MODEL_46fb5083adf24ce4ae3fd4ea9aa4772e", - "value": " 50/50 [17:28<00:00, 20.73s/it]" - } - }, - "674cf2d29d044cada59480813e0e8e58": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bfd4ff099ed14ab1bd79233beea7f402": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "000f9e8fd1db4bc0a7aceeb822ca2b2e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "75b270d981de425ba1fd9a790b2a68ff": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "baafe1d810594384af1a5ffa4f2f5cb4": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "bf95fd811f79425bb2248525aeab7da0": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "46fb5083adf24ce4ae3fd4ea9aa4772e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5b98b7b242994c999064688c9210c61b": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d5b1eb34ddc949aebd25b3744b93b726", - "IPY_MODEL_752d37b9a68b42d284493645962f3782", - "IPY_MODEL_f0def002c7ca41f6a70e9dba1bc605c7" - ], - "layout": "IPY_MODEL_4b0298a9ecf84b509fbf379d43339b9c" - } - }, - "d5b1eb34ddc949aebd25b3744b93b726": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a37be209d5bb44e18f32c0259073d2c8", - "placeholder": "​", - "style": "IPY_MODEL_b35984b48d8847eea119ee5eda049b9d", - "value": "100%" - } - }, - "752d37b9a68b42d284493645962f3782": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4b20ad4b356645bbbfb94929160943f2", - "max": 192802, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_63b8646c732246988f566d0442a070e8", - "value": 192802 - } - }, - "f0def002c7ca41f6a70e9dba1bc605c7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ae8581ec76314304b2078759e1dbdd7e", - "placeholder": "​", - "style": "IPY_MODEL_d0e90066f1ec42afa5f1c02551d3889e", - "value": " 192802/192802 [02:11<00:00, 1657.77it/s]" - } - }, - "4b0298a9ecf84b509fbf379d43339b9c": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a37be209d5bb44e18f32c0259073d2c8": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b35984b48d8847eea119ee5eda049b9d": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "4b20ad4b356645bbbfb94929160943f2": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "63b8646c732246988f566d0442a070e8": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ae8581ec76314304b2078759e1dbdd7e": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d0e90066f1ec42afa5f1c02551d3889e": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/examples/product-recommender/lancedb_cloud/main.py b/examples/product-recommender/lancedb_cloud/main.py deleted file mode 100644 index af87c578..00000000 --- a/examples/product-recommender/lancedb_cloud/main.py +++ /dev/null @@ -1,137 +0,0 @@ -import zipfile -import numpy as np -import pandas as pd -import scipy.sparse -import torch -import implicit -from implicit import evaluation -import lancedb -import pydantic -from lancedb.pydantic import pydantic_to_schema, vector -import argparse - -def products_bought_by_user_in_the_past(user_id: int, top: int = 10): - - selected = data[data.user_id == user_id].sort_values(by=['total_orders'], ascending=False) - - selected['product_name'] = selected['product_id'].map(product_entries.set_index('product_id')['product_name']) - selected = selected[['product_id', 'product_name', 'total_orders']].reset_index(drop=True) - if selected.shape[0] < top: - return selected - - return selected[:top] - -def args_parse(): - parser = argparse.ArgumentParser(description='Product Recommender') - parser.add_argument('--factors', type=int, default=128, help='dimension of latent factor vectors') - parser.add_argument('--regularization', type=float, default=0.05, help='strength of penalty term') - parser.add_argument('--iterations', type=int, default=50, help='number of iterations to update') - parser.add_argument('--num-threads', type=int, default=1, help='amount of parallelization') - parser.add_argument('--num-partitions', type=int, default=256, help='number of partitions of the index') - parser.add_argument('--num-sub-vectors', type=int, default=16, help='number of sub-vectors (M) that will be created during Product Quantization (PQ).') - args = parser.parse_args() - - return args - -files = [ - 'instacart-market-basket-analysis.zip', - 'order_products__train.csv.zip', - 'order_products__prior.csv.zip', - 'products.csv.zip', - 'orders.csv.zip' -] - -if __name__ == "__main__": - args = args_parse() - for filename in files: - with zipfile.ZipFile(filename, 'r') as zip_ref: - zip_ref.extractall('./') - - products = pd.read_csv('products.csv') - orders = pd.read_csv('orders.csv') - order_products = pd.concat([pd.read_csv('order_products__train.csv'), pd.read_csv('order_products__prior.csv')]) - - customer_order_products = pd.merge(orders, order_products, how='inner',on='order_id') - - # create confidence table - data = customer_order_products.groupby(['user_id', 'product_id'])[['order_id']].count().reset_index() - data.columns=["user_id", "product_id", "total_orders"] - data.product_id = data.product_id.astype('int64') - - data_new = pd.DataFrame([[data.user_id.max() + 1, 46149, 50], # user 1 orders 50 Zero Calorie Cola - [data.user_id.max() + 2, 27845, 49], # user 2 orders 49 Organic Whole Milk - [data.user_id.max() + 2, 26604, 32] # user 2 orders 32 Organic Blackberries - ], columns=['user_id', 'product_id', 'total_orders']) - data = pd.concat([data, data_new]).reset_index(drop = True) - - # extract unique user and product ids - unique_users = list(np.sort(data.user_id.unique())) - unique_products = list(np.sort(products.product_id.unique())) - purchases = list(data.total_orders) - - # create zero-based index position <-> user/item ID mappings - index_to_user = pd.Series(unique_users) - - # create reverse mappings from user/item ID to index positions - user_to_index = pd.Series(data=index_to_user.index + 1, index=index_to_user.values) - - # create row and column for user and product ids - users_rows = data.user_id.astype(int) - products_cols = data.product_id.astype(int) - - # create CSR matrix - matrix = scipy.sparse.csr_matrix((purchases, (users_rows, products_cols)), shape=(len(unique_users) + 1, len(unique_products) + 1)) - matrix.data = np.nan_to_num(matrix.data, copy=False) - - #split data into train and test splits - train, test = evaluation.train_test_split(matrix, train_percentage=0.9) - - # initialize the recommender model - model = implicit.als.AlternatingLeastSquares(factors=args.factors, - regularization=args.regularization, - iterations=args.iterations, - num_threads=args.num_threads) - - alpha = 15 - train = (train * alpha).astype('double') - - # train the model on CSR matrix - model.fit(train, show_progress = True) - - test = (test * alpha).astype('double') - evaluation.ranking_metrics_at_k(model, train, test, K=100, - show_progress=True, num_threads=1) - - - db_url = "db://your-project-name" - api_key="sk_..." - region = "us-east-1" - db = lancedb.connect(db_url, api_key=api_key, region=region) - class ProductModel(pydantic.BaseModel): - product_id: int - product_name: str - vector: vector(args.factors) - schema = pydantic_to_schema(ProductModel) - table_name = 'product_recommender' - tbl = db.create_table(table_name, schema=schema) - - # Transform items into factors - items_factors = model.item_factors - product_entries = products[['product_id', 'product_name']].drop_duplicates() - product_entries['product_id'] = product_entries.product_id.astype('int64') - device = "cuda" if torch.cuda.is_available() else "cpu" - item_embeddings = items_factors[1:].tolist() - product_entries['vector'] = item_embeddings - - tbl.add(product_entries) - tbl.create_index(vector_column_name="vector") - - test_user_ids = [206210, 206211] - test_user_factors = model.user_factors[user_to_index[test_user_ids]] - - # Query by user factors - test_user_embeddings = test_user_factors.tolist() - for embedding, id in zip(test_user_embeddings, test_user_ids): - results = tbl.search(embedding).limit(10).to_pandas() - print(results.drop(columns=['vector']).to_string(max_cols=None)) - print(products_bought_by_user_in_the_past(id, top=15).to_string(max_cols=None)) diff --git a/examples/product-recommender/lancedb_cloud/requirements.txt b/examples/product-recommender/lancedb_cloud/requirements.txt deleted file mode 100644 index 662caa62..00000000 --- a/examples/product-recommender/lancedb_cloud/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -numpy -pandas -scipy -kaggle -implicit -torch -lancedb \ No newline at end of file