diff --git a/examples/transforms/example_webpage_transform.ipynb b/examples/transforms/example_webpage_transform.ipynb index ef749f7f..36ba9b6c 100644 --- a/examples/transforms/example_webpage_transform.ipynb +++ b/examples/transforms/example_webpage_transform.ipynb @@ -1,6 +1,7 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "id": "7d3b19ce", "metadata": {}, @@ -8,28 +9,214 @@ "## Running Async Transformations in Jupyter" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f17f05a", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install refuel-autolabel" - ] - }, { "cell_type": "code", "execution_count": 1, - "id": "2fd4290a", + "id": "1f17f05a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: refuel-autolabel[all] in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (0.0.12)\n", + "Requirement already satisfied: loguru>=0.5.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.5.3)\n", + "Requirement already satisfied: numpy>=1.23.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.23.3)\n", + "Requirement already satisfied: requests>=2.27.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2.27.1)\n", + "Requirement already satisfied: datasets>=2.7.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2.7.1)\n", + "Requirement already satisfied: langchain==0.0.210 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.0.210)\n", + "Requirement already satisfied: nervaluate>=0.1.8 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.1.8)\n", + "Requirement already satisfied: pandas>=1.3.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.3.5)\n", + "Requirement already satisfied: scikit-learn>=1.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.0.2)\n", + "Requirement already satisfied: tenacity>=8.2.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (8.2.2)\n", + "Requirement already satisfied: SQLAlchemy>=2.0.19 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2.0.19)\n", + "Requirement already satisfied: regex>=2023.6.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2023.6.3)\n", + "Requirement already satisfied: rich>=13.3.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (13.3.5)\n", + "Requirement already satisfied: scipy>=1.10.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.10.1)\n", + "Requirement already satisfied: pydantic>=1.10.9 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.10.9)\n", + "Requirement already satisfied: torch>=1.10.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.10.1)\n", + "Requirement already satisfied: matplotlib>=3.5.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (3.5.1)\n", + "Requirement already satisfied: wget>=3.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (3.2)\n", + "Requirement already satisfied: ipywidgets==8.0.6 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (8.0.6)\n", + "Requirement already satisfied: jsonschema>=4.17.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (4.17.3)\n", + "Requirement already satisfied: tabulate>=0.9.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.9.0)\n", + "Requirement already satisfied: typer[all]>=0.9.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.9.0)\n", + "Requirement already satisfied: simple-term-menu>=1.6.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.6.1)\n", + "Requirement already satisfied: black in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (22.3.0)\n", + "Requirement already satisfied: bumpver in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2023.1121)\n", + "Requirement already satisfied: pip-tools in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (6.13.0)\n", + "Requirement already satisfied: pytest in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (7.2.0)\n", + "Collecting pytest-asyncio (from refuel-autolabel[all])\n", + " Downloading pytest_asyncio-0.21.1-py3-none-any.whl (13 kB)\n", + "Requirement already satisfied: pytest-mock in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (3.8.2)\n", + "Requirement already satisfied: pre-commit in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2.20.0)\n", + "Requirement already satisfied: openai>=0.27.4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.27.4)\n", + "Requirement already satisfied: tiktoken>=0.3.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.3.3)\n", + "Requirement already satisfied: anthropic==0.2.6 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.2.6)\n", + "Requirement already satisfied: transformers>=4.25.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (4.25.1)\n", + "Requirement already satisfied: google-cloud-aiplatform>=1.25.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.25.0)\n", + "Requirement already satisfied: cohere>=4.11.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (4.11.2)\n", + "Requirement already satisfied: sentence-transformers in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (2.2.2)\n", + "Collecting pdfplumber>=0.10.2 (from refuel-autolabel[all])\n", + " Downloading pdfplumber-0.10.2-py3-none-any.whl (47 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.5/47.5 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pdf2image>=1.16.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.16.3)\n", + "Requirement already satisfied: pytesseract>=0.3.10 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.3.10)\n", + "Requirement already satisfied: bs4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.0.1)\n", + "Requirement already satisfied: httpx in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (0.21.3)\n", + "Requirement already satisfied: fake-useragent in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from refuel-autolabel[all]) (1.2.1)\n", + "Requirement already satisfied: tokenizers in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from anthropic==0.2.6->refuel-autolabel[all]) (0.13.2)\n", + "Requirement already satisfied: aiohttp in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from anthropic==0.2.6->refuel-autolabel[all]) (3.8.4)\n", + "Requirement already satisfied: ipykernel>=4.5.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipywidgets==8.0.6->refuel-autolabel[all]) (6.6.1)\n", + "Requirement already satisfied: ipython>=6.1.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipywidgets==8.0.6->refuel-autolabel[all]) (7.31.0)\n", + "Requirement already satisfied: traitlets>=4.3.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipywidgets==8.0.6->refuel-autolabel[all]) (5.1.1)\n", + "Requirement already satisfied: widgetsnbextension~=4.0.7 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipywidgets==8.0.6->refuel-autolabel[all]) (4.0.7)\n", + "Requirement already satisfied: jupyterlab-widgets~=3.0.7 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipywidgets==8.0.6->refuel-autolabel[all]) (3.0.7)\n", + "Requirement already satisfied: PyYAML>=5.4.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (6.0)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (4.0.2)\n", + "Requirement already satisfied: dataclasses-json<0.6.0,>=0.5.7 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (0.5.7)\n", + "Requirement already satisfied: langchainplus-sdk>=0.0.17 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (0.0.17)\n", + "Requirement already satisfied: numexpr<3.0.0,>=2.8.4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (2.8.4)\n", + "Requirement already satisfied: openapi-schema-pydantic<2.0,>=1.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from langchain==0.0.210->refuel-autolabel[all]) (1.2.4)\n", + "Requirement already satisfied: backoff<3.0,>=2.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from cohere>=4.11.2->refuel-autolabel[all]) (2.2.1)\n", + "Requirement already satisfied: importlib_metadata<7.0,>=6.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from cohere>=4.11.2->refuel-autolabel[all]) (6.6.0)\n", + "Requirement already satisfied: pyarrow>=6.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (9.0.0)\n", + "Requirement already satisfied: dill<0.3.7 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (0.3.4)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (4.64.1)\n", + "Requirement already satisfied: xxhash in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (3.1.0)\n", + "Requirement already satisfied: multiprocess in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (0.70.12.2)\n", + "Requirement already satisfied: fsspec[http]>=2021.11.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (2022.8.2)\n", + "Requirement already satisfied: huggingface-hub<1.0.0,>=0.2.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (0.13.3)\n", + "Requirement already satisfied: packaging in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (21.3)\n", + "Requirement already satisfied: responses<0.19 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from datasets>=2.7.0->refuel-autolabel[all]) (0.18.0)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (2.10.1)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.22.2)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (4.23.0)\n", + "Requirement already satisfied: google-cloud-storage<3.0.0dev,>=1.32.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (2.7.0)\n", + "Requirement already satisfied: google-cloud-bigquery<4.0.0dev,>=1.15.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (3.10.0)\n", + "Requirement already satisfied: google-cloud-resource-manager<3.0.0dev,>=1.3.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.6.3)\n", + "Requirement already satisfied: shapely<2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.8.5.post1)\n", + "Requirement already satisfied: attrs>=17.4.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jsonschema>=4.17.3->refuel-autolabel[all]) (20.3.0)\n", + "Requirement already satisfied: importlib-resources>=1.4.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jsonschema>=4.17.3->refuel-autolabel[all]) (5.4.0)\n", + "Requirement already satisfied: pkgutil-resolve-name>=1.3.10 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jsonschema>=4.17.3->refuel-autolabel[all]) (1.3.10)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jsonschema>=4.17.3->refuel-autolabel[all]) (0.18.0)\n", + "Requirement already satisfied: cycler>=0.10 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (4.28.5)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (1.3.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (9.4.0)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (2.4.7)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from matplotlib>=3.5.0->refuel-autolabel[all]) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pandas>=1.3.0->refuel-autolabel[all]) (2021.3)\n", + "Requirement already satisfied: pdfminer.six==20221105 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pdfplumber>=0.10.2->refuel-autolabel[all]) (20221105)\n", + "Collecting pypdfium2>=4.18.0 (from pdfplumber>=0.10.2->refuel-autolabel[all])\n", + " Downloading pypdfium2-4.18.0-py3-none-macosx_11_0_arm64.whl (2.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pdfminer.six==20221105->pdfplumber>=0.10.2->refuel-autolabel[all]) (2.0.9)\n", + "Requirement already satisfied: cryptography>=36.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pdfminer.six==20221105->pdfplumber>=0.10.2->refuel-autolabel[all]) (36.0.1)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pydantic>=1.10.9->refuel-autolabel[all]) (4.4.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from requests>=2.27.0->refuel-autolabel[all]) (1.26.14)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from requests>=2.27.0->refuel-autolabel[all]) (2021.10.8)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from requests>=2.27.0->refuel-autolabel[all]) (2.10)\n", + "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from rich>=13.3.5->refuel-autolabel[all]) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from rich>=13.3.5->refuel-autolabel[all]) (2.15.1)\n", + "Requirement already satisfied: joblib>=0.11 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from scikit-learn>=1.0.0->refuel-autolabel[all]) (1.2.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from scikit-learn>=1.0.0->refuel-autolabel[all]) (3.0.0)\n", + "Requirement already satisfied: filelock in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from transformers>=4.25.0->refuel-autolabel[all]) (3.4.2)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from typer[all]>=0.9.0->refuel-autolabel[all]) (8.1.3)\n", + "Requirement already satisfied: colorama<0.5.0,>=0.4.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from typer[all]>=0.9.0->refuel-autolabel[all]) (0.4.5)\n", + "Requirement already satisfied: shellingham<2.0.0,>=1.3.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from typer[all]>=0.9.0->refuel-autolabel[all]) (1.5.0.post1)\n", + "Requirement already satisfied: platformdirs>=2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from black->refuel-autolabel[all]) (2.5.1)\n", + "Requirement already satisfied: pathspec>=0.9.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from black->refuel-autolabel[all]) (0.9.0)\n", + "Requirement already satisfied: mypy-extensions>=0.4.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from black->refuel-autolabel[all]) (0.4.3)\n", + "Requirement already satisfied: tomli>=1.1.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from black->refuel-autolabel[all]) (2.0.1)\n", + "Requirement already satisfied: beautifulsoup4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from bs4->refuel-autolabel[all]) (4.10.0)\n", + "Requirement already satisfied: pathlib2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from bumpver->refuel-autolabel[all]) (2.3.7.post1)\n", + "Requirement already satisfied: toml in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from bumpver->refuel-autolabel[all]) (0.10.2)\n", + "Requirement already satisfied: lexid in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from bumpver->refuel-autolabel[all]) (2021.1006)\n", + "Requirement already satisfied: looseversion in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from bumpver->refuel-autolabel[all]) (1.1.2)\n", + "Requirement already satisfied: sniffio in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from httpx->refuel-autolabel[all]) (1.2.0)\n", + "Requirement already satisfied: rfc3986[idna2008]<2,>=1.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from httpx->refuel-autolabel[all]) (1.5.0)\n", + "Requirement already satisfied: httpcore<0.15.0,>=0.14.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from httpx->refuel-autolabel[all]) (0.14.5)\n", + "Requirement already satisfied: build in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pip-tools->refuel-autolabel[all]) (0.10.0)\n", + "Requirement already satisfied: pip>=22.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pip-tools->refuel-autolabel[all]) (23.1.2)\n", + "Requirement already satisfied: setuptools in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pip-tools->refuel-autolabel[all]) (65.6.3)\n", + "Requirement already satisfied: wheel in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pip-tools->refuel-autolabel[all]) (0.40.0)\n", + "Requirement already satisfied: cfgv>=2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pre-commit->refuel-autolabel[all]) (3.3.1)\n", + "Requirement already satisfied: identify>=1.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pre-commit->refuel-autolabel[all]) (2.5.9)\n", + "Requirement already satisfied: nodeenv>=0.11.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pre-commit->refuel-autolabel[all]) (1.7.0)\n", + "Requirement already satisfied: virtualenv>=20.0.8 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pre-commit->refuel-autolabel[all]) (20.16.7)\n", + "Requirement already satisfied: iniconfig in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pytest->refuel-autolabel[all]) (1.1.1)\n", + "Requirement already satisfied: pluggy<2.0,>=0.12 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pytest->refuel-autolabel[all]) (1.0.0)\n", + "Requirement already satisfied: exceptiongroup>=1.0.0rc8 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pytest->refuel-autolabel[all]) (1.0.4)\n", + "Requirement already satisfied: torchvision in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from sentence-transformers->refuel-autolabel[all]) (0.11.2)\n", + "Requirement already satisfied: nltk in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from sentence-transformers->refuel-autolabel[all]) (3.6.6)\n", + "Requirement already satisfied: sentencepiece in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from sentence-transformers->refuel-autolabel[all]) (0.1.96)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from aiohttp->anthropic==0.2.6->refuel-autolabel[all]) (5.2.0)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from aiohttp->anthropic==0.2.6->refuel-autolabel[all]) (1.7.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from aiohttp->anthropic==0.2.6->refuel-autolabel[all]) (1.2.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from aiohttp->anthropic==0.2.6->refuel-autolabel[all]) (1.2.0)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.3.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.210->refuel-autolabel[all]) (3.17.1)\n", + "Requirement already satisfied: marshmallow-enum<2.0.0,>=1.5.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.210->refuel-autolabel[all]) (1.5.1)\n", + "Requirement already satisfied: typing-inspect>=0.4.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.210->refuel-autolabel[all]) (0.8.0)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.56.4)\n", + "Requirement already satisfied: google-auth<3.0dev,>=1.25.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (2.6.0)\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.33.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.54.0)\n", + "Requirement already satisfied: grpcio-status<2.0dev,>=1.33.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.54.0)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (2.3.2)\n", + "Requirement already satisfied: google-resumable-media<3.0dev,>=0.6.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (2.4.1)\n", + "Requirement already satisfied: grpc-google-iam-v1<1.0.0dev,>=0.12.4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-cloud-resource-manager<3.0.0dev,>=1.3.3->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (0.12.6)\n", + "Requirement already satisfied: h11<0.13,>=0.11 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from httpcore<0.15.0,>=0.14.0->httpx->refuel-autolabel[all]) (0.12.0)\n", + "Requirement already satisfied: anyio==3.* in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from httpcore<0.15.0,>=0.14.0->httpx->refuel-autolabel[all]) (3.4.0)\n", + "Requirement already satisfied: zipp>=0.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from importlib_metadata<7.0,>=6.0->cohere>=4.11.2->refuel-autolabel[all]) (3.6.0)\n", + "Requirement already satisfied: debugpy<2.0,>=1.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (1.5.1)\n", + "Requirement already satisfied: jupyter-client<8.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (7.1.0)\n", + "Requirement already satisfied: tornado<7.0,>=4.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (6.1)\n", + "Requirement already satisfied: matplotlib-inline<0.2.0,>=0.1.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (0.1.3)\n", + "Requirement already satisfied: nest-asyncio in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (1.5.4)\n", + "Requirement already satisfied: appnope in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (0.1.2)\n", + "Requirement already satisfied: jedi>=0.16 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.18.1)\n", + "Requirement already satisfied: decorator in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (5.1.1)\n", + "Requirement already satisfied: pickleshare in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.7.5)\n", + "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (3.0.39)\n", + "Requirement already satisfied: backcall in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.2.0)\n", + "Requirement already satisfied: pexpect>4.3 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (4.8.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from markdown-it-py<3.0.0,>=2.2.0->rich>=13.3.5->refuel-autolabel[all]) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib>=3.5.0->refuel-autolabel[all]) (1.16.0)\n", + "Requirement already satisfied: distlib<1,>=0.3.6 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from virtualenv>=20.0.8->pre-commit->refuel-autolabel[all]) (0.3.6)\n", + "Requirement already satisfied: soupsieve>1.2 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from beautifulsoup4->bs4->refuel-autolabel[all]) (2.3.1)\n", + "Requirement already satisfied: pyproject_hooks in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from build->pip-tools->refuel-autolabel[all]) (1.0.0)\n", + "Requirement already satisfied: cffi>=1.12 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from cryptography>=36.0.0->pdfminer.six==20221105->pdfplumber>=0.10.2->refuel-autolabel[all]) (1.15.0)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (4.8)\n", + "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from google-resumable-media<3.0dev,>=0.6.0->google-cloud-bigquery<4.0.0dev,>=1.15.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (1.5.0)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.8.3)\n", + "Requirement already satisfied: entrypoints in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (0.3)\n", + "Requirement already satisfied: jupyter-core>=4.6.0 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (4.9.1)\n", + "Requirement already satisfied: pyzmq>=13 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets==8.0.6->refuel-autolabel[all]) (22.3.0)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.7.0)\n", + "Requirement already satisfied: wcwidth in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=6.1.0->ipywidgets==8.0.6->refuel-autolabel[all]) (0.2.5)\n", + "Requirement already satisfied: pycparser in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20221105->pdfplumber>=0.10.2->refuel-autolabel[all]) (2.21)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=1.25.0->google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform>=1.25.0->refuel-autolabel[all]) (0.4.8)\n", + "Installing collected packages: pypdfium2, pytest-asyncio, pdfplumber\n", + " Attempting uninstall: pdfplumber\n", + " Found existing installation: pdfplumber 0.8.0\n", + " Uninstalling pdfplumber-0.8.0:\n", + " Successfully uninstalled pdfplumber-0.8.0\n", + "Successfully installed pdfplumber-0.10.2 pypdfium2-4.18.0 pytest-asyncio-0.21.1\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], "source": [ - "import nest_asyncio\n", - "nest_asyncio.apply()" + "!pip install refuel-autolabel[all]\n", + "!pip install beautifulsoup4 httpx fake_useragent" ] }, { + "attachments": {}, "cell_type": "markdown", "id": "aea97f23", "metadata": {}, @@ -40,6 +227,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "2eab19b6", "metadata": {}, @@ -49,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 1, "id": "50e7446e", "metadata": {}, "outputs": [], @@ -82,7 +270,20 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 2, + "id": "201b498e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# provide your own OpenAI API key here\n", + "os.environ['OPENAI_API_KEY'] = 'sk-XXXXXXXXXXXXXXXXXXXXXXXX'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "id": "fc6be6ac", "metadata": {}, "outputs": [], @@ -92,6 +293,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "060a41a1", "metadata": {}, @@ -101,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 4, "id": "5b79df29", "metadata": {}, "outputs": [], @@ -133,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 5, "id": "1d3476bd", "metadata": {}, "outputs": [], @@ -142,6 +344,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "15695b36", "metadata": {}, @@ -152,14 +355,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 6, "id": "55bda1eb", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9eb970ac8267464988b63df5275f2aaf", + "model_id": "998b3747b1c745fc98a16e10f57fa80a", "version_major": 2, "version_minor": 0 }, @@ -192,6 +395,14 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nihit/Documents/refuel-ai.nosync/virtualenv/lib/python3.8/site-packages/httpx/_client.py:2012: UserWarning: Unclosed . See https://www.python-httpx.org/async/#opening-and-closing-clients for details.\n", + " warnings.warn(\n" + ] } ], "source": [ @@ -199,6 +410,7 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "id": "1dce5e1d", "metadata": {}, @@ -209,14 +421,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 7, "id": "f513a335", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "e1e56a33bc314c5ab26c72b1d989914e", + "model_id": "911e85e00642419f970ebde6081166ac", "version_major": 2, "version_minor": 0 }, @@ -244,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 8, "id": "a9eafc65", "metadata": {}, "outputs": [ @@ -272,6 +484,8 @@ " url\n", " name\n", " content\n", + " content_in_bytes_column\n", + " soup_column\n", " metadata_column\n", " NationalPark_label\n", " NationalPark_error\n", @@ -285,105 +499,129 @@ " https://www.visitmt.com/places-to-go/glacier-n...\n", " Glacier National Park\n", " \\n\\n\\n\\n\\n\\n\\nGlacier National Park\\n\\n\\n\\n\\n\\...\n", + " b'\\n<!doctype html>\\n <html lang=\"en\">\\n<head...\n", + " [\\n, html, \\n, [\\n, [\\n, Google Tag Manager ,...\n", " {'url': 'https://www.visitmt.com/places-to-go/...\n", " Montana\n", " None\n", " True\n", - " {\"successfully_labeled\": true, \"label\": \"Monta...\n", + " b'\\x80\\x04\\x95q\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8...\n", " \n", " \n", " 1\n", " https://www.nps.gov/dena/index.htm\n", " Denali National Park\n", " \\n Denali National Park & Preserve (U.S. N...\n", + " b'<!doctype html> <html lang=\"en\" class=\"no-js...\n", + " [html, , [ , Content Copyright National Park...\n", " {'url': 'https://www.nps.gov/dena/index.htm', ...\n", " Alaska\n", " None\n", " True\n", - " {\"successfully_labeled\": true, \"label\": \"Alask...\n", + " b'\\x80\\x04\\x95p\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8...\n", " \n", " \n", " 2\n", " https://www.nps.gov/lavo/index.htm\n", " Lassen Volcanic National Park\n", " \\n Lassen Volcanic National Park (U.S. Nat...\n", + " b'<!doctype html> <html lang=\"en\" class=\"no-js...\n", + " [html, , [ , Content Copyright National Park...\n", " {'url': 'https://www.nps.gov/lavo/index.htm', ...\n", " California\n", " None\n", " True\n", - " {\"successfully_labeled\": true, \"label\": \"Calif...\n", + " b'\\x80\\x04\\x95t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8...\n", " \n", " \n", " 3\n", " https://www.nps.gov/olym/index.htm\n", " Olympic National Park\n", " \\n Olympic National Park (U.S. National Pa...\n", + " b'<!doctype html> <html lang=\"en\" class=\"no-js...\n", + " [html, , [ , Content Copyright National Park...\n", " {'url': 'https://www.nps.gov/olym/index.htm', ...\n", " Washington\n", " None\n", " True\n", - " {\"successfully_labeled\": true, \"label\": \"Washi...\n", + " b'\\x80\\x04\\x95t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8...\n", " \n", " \n", " 4\n", " https://www.nps.gov/pinn/index.htm\n", " Pinnacles National Park\n", " \\n Pinnacles National Park (U.S. National ...\n", + " b'<!doctype html> <html lang=\"en\" class=\"no-js...\n", + " [html, , [ , Content Copyright National Park...\n", " {'url': 'https://www.nps.gov/pinn/index.htm', ...\n", " California\n", " None\n", " True\n", - " {\"successfully_labeled\": true, \"label\": \"Calif...\n", + " b'\\x80\\x04\\x95t\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x8...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " url \n", - "0 https://www.visitmt.com/places-to-go/glacier-n... \\\n", + " url \\\n", + "0 https://www.visitmt.com/places-to-go/glacier-n... \n", "1 https://www.nps.gov/dena/index.htm \n", "2 https://www.nps.gov/lavo/index.htm \n", "3 https://www.nps.gov/olym/index.htm \n", "4 https://www.nps.gov/pinn/index.htm \n", "\n", - " name \n", - "0 Glacier National Park \\\n", + " name \\\n", + "0 Glacier National Park \n", "1 Denali National Park \n", "2 Lassen Volcanic National Park \n", "3 Olympic National Park \n", "4 Pinnacles National Park \n", "\n", - " content \n", - "0 \\n\\n\\n\\n\\n\\n\\nGlacier National Park\\n\\n\\n\\n\\n\\... \\\n", + " content \\\n", + "0 \\n\\n\\n\\n\\n\\n\\nGlacier National Park\\n\\n\\n\\n\\n\\... \n", "1 \\n Denali National Park & Preserve (U.S. N... \n", "2 \\n Lassen Volcanic National Park (U.S. Nat... \n", "3 \\n Olympic National Park (U.S. National Pa... \n", "4 \\n Pinnacles National Park (U.S. National ... \n", "\n", - " metadata_column NationalPark_label \n", - "0 {'url': 'https://www.visitmt.com/places-to-go/... Montana \\\n", + " content_in_bytes_column \\\n", + "0 b'\\n\\n \\n = 0.10.2", "pdf2image >= 1.16.3", "pytesseract >= 0.3.10", - "bs4", + "beautifulsoup4 >= 4.12.2", "httpx", "fake_useragent" ] diff --git a/src/autolabel/transforms/webpage_transform.py b/src/autolabel/transforms/webpage_transform.py index 4901e6d7..ae86d1a1 100644 --- a/src/autolabel/transforms/webpage_transform.py +++ b/src/autolabel/transforms/webpage_transform.py @@ -57,7 +57,7 @@ def __init__( self.beautiful_soup = BeautifulSoup except ImportError: raise ImportError( - "BeautifulSoup, httpx and fake_useragent are required to use the webpage transform. Please install them with the following command: pip install bs4 httpx fake_useragent" + "BeautifulSoup, httpx and fake_useragent are required to use the webpage transform. Please install them with the following command: pip install beautifulsoup4 httpx fake_useragent" ) def name(self) -> str: