diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..09d7c2f --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,30 @@ +# Running the record api tooling on notebooks + +First, clone the record api repo: + +``` +git clone https://github.com/data-apis/python-record-api.git +``` + +Then create a new conda env to install the notebook's dependencies and the record api tooling. + +``` +conda create -n record-api python=3.8 +conda activate record-api +cd python-record-api +pip install record_api +``` + +An example notebook lives under `/notebooks/data`; it has to be converted +to a Python file before the record api tooling can process it. + +``` +cd notebooks +python notebook_to_python.py data +``` + +A new folder named `scripts` will be created under the `notebooks` directory with +all the notebooks converted into Python files. + +Then run `run_record_api.sh` to infer the API calls made in each notebook. +The output is written to the `infer_apis` directory as one JSON file per notebook. 
diff --git a/notebooks/data/example.ipynb b/notebooks/data/example.ipynb new file mode 100644 index 0000000..c29d8df --- /dev/null +++ b/notebooks/data/example.ipynb @@ -0,0 +1,538 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "s = pd.Series([1, 3, 5, np.nan, 6, 8])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dates = pd.date_range(\"20130101\", periods=6)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list(\"ABCD\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.DataFrame(\n", + " {\n", + " \"A\": 1.0,\n", + " \"B\": pd.Timestamp(\"20130102\"),\n", + " \"C\": pd.Series(1, index=list(range(4)), dtype=\"float32\"),\n", + " \"D\": np.array([3] * 4, dtype=\"int32\"),\n", + " \"E\": pd.Categorical([\"test\", \"train\", \"test\", \"train\"]),\n", + " \"F\": \"foo\",\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
2013-01-01-0.328603-0.366380-0.619592-2.822423
2013-01-02-0.4270670.317282-0.938094-0.148115
2013-01-03-3.012013-0.5350660.7909450.209057
2013-01-040.7626960.2431091.8795141.108437
2013-01-05-0.402783-0.876177-0.0939850.367029
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "2013-01-01 -0.328603 -0.366380 -0.619592 -2.822423\n", + "2013-01-02 -0.427067 0.317282 -0.938094 -0.148115\n", + "2013-01-03 -3.012013 -0.535066 0.790945 0.209057\n", + "2013-01-04 0.762696 0.243109 1.879514 1.108437\n", + "2013-01-05 -0.402783 -0.876177 -0.093985 0.367029" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
2013-01-040.7626960.2431091.8795141.108437
2013-01-05-0.402783-0.876177-0.0939850.367029
2013-01-06-1.0492930.8370731.210975-1.206224
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "2013-01-04 0.762696 0.243109 1.879514 1.108437\n", + "2013-01-05 -0.402783 -0.876177 -0.093985 0.367029\n", + "2013-01-06 -1.049293 0.837073 1.210975 -1.206224" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',\n", + " '2013-01-05', '2013-01-06'],\n", + " dtype='datetime64[ns]', freq='D')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.index" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['A', 'B', 'C', 'D'], dtype='object')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.32860292, -0.36637983, -0.61959229, -2.82242287],\n", + " [-0.42706726, 0.3172823 , -0.93809362, -0.14811456],\n", + " [-3.01201345, -0.53506589, 0.79094519, 0.20905686],\n", + " [ 0.76269556, 0.24310942, 1.87951383, 1.10843652],\n", + " [-0.40278323, -0.87617667, -0.09398487, 0.3670287 ],\n", + " [-1.04929264, 0.8370728 , 1.21097545, -1.20622421]])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.to_numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
count6.0000006.0000006.0000006.000000
mean-0.742844-0.0633600.371627-0.415373
std1.2567130.6363321.1017021.401058
min-3.012013-0.876177-0.938094-2.822423
25%-0.893736-0.492894-0.488190-0.941697
50%-0.414925-0.0616350.3484800.030471
75%-0.3471480.2987391.1059680.327536
max0.7626960.8370731.8795141.108437
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "count 6.000000 6.000000 6.000000 6.000000\n", + "mean -0.742844 -0.063360 0.371627 -0.415373\n", + "std 1.256713 0.636332 1.101702 1.401058\n", + "min -3.012013 -0.876177 -0.938094 -2.822423\n", + "25% -0.893736 -0.492894 -0.488190 -0.941697\n", + "50% -0.414925 -0.061635 0.348480 0.030471\n", + "75% -0.347148 0.298739 1.105968 0.327536\n", + "max 0.762696 0.837073 1.879514 1.108437" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DCBA
2013-01-01-2.822423-0.619592-0.366380-0.328603
2013-01-02-0.148115-0.9380940.317282-0.427067
2013-01-030.2090570.790945-0.535066-3.012013
2013-01-041.1084371.8795140.2431090.762696
2013-01-050.367029-0.093985-0.876177-0.402783
2013-01-06-1.2062241.2109750.837073-1.049293
\n", + "
" + ], + "text/plain": [ + " D C B A\n", + "2013-01-01 -2.822423 -0.619592 -0.366380 -0.328603\n", + "2013-01-02 -0.148115 -0.938094 0.317282 -0.427067\n", + "2013-01-03 0.209057 0.790945 -0.535066 -3.012013\n", + "2013-01-04 1.108437 1.879514 0.243109 0.762696\n", + "2013-01-05 0.367029 -0.093985 -0.876177 -0.402783\n", + "2013-01-06 -1.206224 1.210975 0.837073 -1.049293" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_index(axis=1, ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/grouped/example.json b/notebooks/grouped/example.json new file mode 100644 index 0000000..073c39b --- /dev/null +++ b/notebooks/grouped/example.json @@ -0,0 +1,18 @@ +{"bound_params":{"pos_or_kw":[["data",[{"t":"int"},{"t":"int"},{"t":"int"},{"t":"float"},{"t":"int"},{"t":"int"}]]]},"function":{"t":"type","v":{"module":"pandas.core.series","name":"Series"}},"n":1} +{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":"module","v":"pandas"},"date_range"]},"n":1} +{"bound_params":{"pos_or_kw":[["start","20130101"],["periods",{"t":"int"}]]},"function":{"t":"function","v":{"module":"pandas.core.indexes.datetimes","name":"date_range"}},"n":1} +{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":"module","v":"pandas"},"DataFrame"]},"n":1} 
+{"bound_params":{"pos_or_kw":[["data",{"t":{"module":"numpy","name":"ndarray"},"v":{"dtype":"float64"}}],["index",{"t":{"module":"pandas.core.indexes.datetimes","name":"DatetimeIndex"}}],["columns",["A","B","C","D"]]]},"function":{"t":"type","v":{"module":"pandas.core.frame","name":"DataFrame"}},"n":1} +{"bound_params":{"pos_or_kw":[["ts_input","20130102"]]},"function":{"t":"type","v":{"module":"pandas._libs.tslibs.timestamps","name":"Timestamp"}},"n":1} +{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":"module","v":"pandas"},"Series"]},"n":1} +{"bound_params":{"pos_or_kw":[["data",{"t":"int"}],["index",[{"t":"int"},{"t":"int"},{"t":"int"},{"t":"int"}]],["dtype","float32"]]},"function":{"t":"type","v":{"module":"pandas.core.series","name":"Series"}},"n":1} +{"bound_params":{"pos_or_kw":[["values",["test","train","test","train"]]]},"function":{"t":"type","v":{"module":"pandas.core.arrays.categorical","name":"Categorical"}},"n":1} +{"bound_params":{"pos_or_kw":[["data",{"t":"dict","v":[["A",{"t":"float"}],["B",{"t":{"module":"pandas._libs.tslibs.timestamps","name":"Timestamp"}}],["C",{"t":{"module":"pandas.core.series","name":"Series"}}],["D",{"t":{"module":"numpy","name":"ndarray"},"v":{"dtype":"int32"}}],["E",{"t":{"module":"pandas.core.arrays.categorical","name":"Categorical"}}],["F","foo"]]}]]},"function":{"t":"type","v":{"module":"pandas.core.frame","name":"DataFrame"}},"n":1} +{"bound_params":{},"function":{"t":"method","v":{"name":"head","self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}}}},"n":1} +{"bound_params":{"pos_or_kw":[["n",{"t":"int"}]]},"function":{"t":"method","v":{"name":"tail","self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}}}},"n":1} +{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":{"module":"pandas.core.frame","name":"DataFrame"}},"index"]},"n":1} 
+{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":{"module":"pandas.core.frame","name":"DataFrame"}},"columns"]},"n":1} +{"bound_params":{},"function":{"t":"method","v":{"name":"to_numpy","self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}}}},"n":1} +{"bound_params":{},"function":{"t":"method","v":{"name":"describe","self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}}}},"n":1} +{"function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":{"module":"pandas.core.frame","name":"DataFrame"}},"sort_index"]},"n":1} +{"bound_params":{"pos_or_kw":[["axis",{"t":"int"}],["ascending",{"t":"bool"}]]},"function":{"t":"method","v":{"name":"sort_index","self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}}}},"n":1} diff --git a/notebooks/infer_apis/example.json b/notebooks/infer_apis/example.json new file mode 100644 index 0000000..9eef114 --- /dev/null +++ b/notebooks/infer_apis/example.json @@ -0,0 +1,443 @@ +{ + "modules": { + "pandas.core.series": { + "classes": { + "Series": { + "constructor_overloads": [ + { + "pos_or_kw_required": { + "data": { + "type": "list", + "item": { + "type": "union", + "options": [ + { + "type": { + "name": "int" + } + }, + { + "type": { + "name": "float" + } + } + ] + } + } + }, + "metadata": { + "usage.hola": 1 + } + }, + { + "pos_or_kw_required": { + "data": { + "type": { + "name": "int" + } + }, + "index": { + "type": "list", + "item": { + "type": { + "name": "int" + } + } + }, + "dtype": { + "type": "str", + "options": [ + "float32" + ] + } + }, + "metadata": { + "usage.hola": 1 + } + } + ], + "constructor": { + "pos_or_kw_required": { + "data": { + "type": "union", + "options": [ + { + "type": { + "name": "int" + } + }, + { + "type": "list", + "item": { + "type": "union", + "options": [ + { + "type": { + "name": "float" + } + }, + { + "type": { + "name": "int" + } + } + ] + } + } + ] + } + }, + "pos_or_kw_optional": { + "index": { + "type": "list", + 
"item": { + "type": { + "name": "int" + } + } + }, + "dtype": { + "type": "str", + "options": [ + "float32" + ] + } + }, + "pos_or_kw_optional_ordering": [ + [ + "index", + "dtype" + ] + ], + "metadata": { + "usage.hola": 2 + } + } + } + } + }, + "pandas": { + "properties": { + "date_range": [ + { + "usage.hola": 1 + }, + { + "type": "bottom" + } + ], + "DataFrame": [ + { + "usage.hola": 1 + }, + { + "type": "bottom" + } + ], + "Series": [ + { + "usage.hola": 1 + }, + { + "type": "bottom" + } + ] + } + }, + "pandas.core.indexes.datetimes": { + "function_overloads": { + "date_range": [ + { + "pos_or_kw_required": { + "start": { + "type": "str", + "options": [ + "20130101" + ] + }, + "periods": { + "type": { + "name": "int" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + ] + }, + "functions": { + "date_range": { + "pos_or_kw_required": { + "start": { + "type": "str", + "options": [ + "20130101" + ] + }, + "periods": { + "type": { + "name": "int" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + } + }, + "pandas.core.frame": { + "classes": { + "DataFrame": { + "constructor_overloads": [ + { + "pos_or_kw_required": { + "data": { + "type": { + "module": "numpy", + "name": "ndarray" + } + }, + "index": { + "type": { + "module": "pandas.core.indexes.datetimes", + "name": "DatetimeIndex" + } + }, + "columns": { + "type": "list", + "item": { + "type": "str", + "options": [ + "D", + "C", + "B", + "A" + ] + } + } + }, + "metadata": { + "usage.hola": 1 + } + }, + { + "pos_or_kw_required": { + "data": { + "type": "dict", + "key": { + "type": "str" + }, + "value": { + "type": "object" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + ], + "method_overloads": { + "head": [ + { + "metadata": { + "usage.hola": 1 + } + } + ], + "tail": [ + { + "pos_or_kw_required": { + "n": { + "type": { + "name": "int" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + ], + "to_numpy": [ + { + "metadata": { + "usage.hola": 1 + } + } + ], + "describe": [ + { + 
"metadata": { + "usage.hola": 1 + } + } + ], + "sort_index": [ + { + "pos_or_kw_required": { + "axis": { + "type": { + "name": "int" + } + }, + "ascending": { + "type": { + "name": "bool" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + ] + }, + "methods": { + "head": { + "metadata": { + "usage.hola": 1 + } + }, + "tail": { + "pos_or_kw_required": { + "n": { + "type": { + "name": "int" + } + } + }, + "metadata": { + "usage.hola": 1 + } + }, + "to_numpy": { + "metadata": { + "usage.hola": 1 + } + }, + "describe": { + "metadata": { + "usage.hola": 1 + } + }, + "sort_index": { + "pos_or_kw_required": { + "axis": { + "type": { + "name": "int" + } + }, + "ascending": { + "type": { + "name": "bool" + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + }, + "properties": { + "index": [ + { + "usage.hola": 1 + }, + { + "type": "bottom" + } + ], + "columns": [ + { + "usage.hola": 1 + }, + { + "type": "bottom" + } + ] + } + } + } + }, + "pandas._libs.tslibs.timestamps": { + "classes": { + "Timestamp": { + "constructor_overloads": [ + { + "pos_or_kw_required": { + "ts_input": { + "type": "str", + "options": [ + "20130102" + ] + } + }, + "metadata": { + "usage.hola": 1 + } + } + ], + "constructor": { + "pos_or_kw_required": { + "ts_input": { + "type": "str", + "options": [ + "20130102" + ] + } + }, + "metadata": { + "usage.hola": 1 + } + } + } + } + }, + "pandas.core.arrays.categorical": { + "classes": { + "Categorical": { + "constructor_overloads": [ + { + "pos_or_kw_required": { + "values": { + "type": "list", + "item": { + "type": "str", + "options": [ + "train", + "test" + ] + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + ], + "constructor": { + "pos_or_kw_required": { + "values": { + "type": "list", + "item": { + "type": "str", + "options": [ + "train", + "test" + ] + } + } + }, + "metadata": { + "usage.hola": 1 + } + } + } + } + } + } +} \ No newline at end of file diff --git a/notebooks/notebook_to_python.py b/notebooks/notebook_to_python.py new file 
#!/usr/bin/env python
# This file was taken and modified from
# https://github.com/data-apis/dataframe-tools/
"""Convert Jupyter notebooks into plain Python scripts for the record api tooling."""
import argparse
import json
import os
import pathlib


def notebook_to_python(content):
    """
    Convert a notebook to a Python script, removing all markdown.

    `content` is the parsed notebook JSON (a dict with a ``cells`` list).
    It is not modified.

    Only cells whose ``cell_type`` is ``code`` are kept. Lines starting
    with ``!`` or ``%`` are dropped because they are not valid Python.

    Returns the script as one string, lines joined with ``\\n``.
    """
    script_lines = []
    for cell in content['cells']:
        if cell['cell_type'] != 'code':
            continue
        source = cell['source']
        # `source` is either one string or a list of strings that already
        # carry their own trailing newline. Join before splitting so those
        # newlines are not turned into an extra blank line after every
        # statement (which corrupts multi-line string literals and
        # backslash continuations).
        if not isinstance(source, str):
            source = ''.join(source)
        if not source:
            continue
        script_lines.extend(source.split('\n'))

    return '\n'.join(
        line for line in script_lines if not line.startswith(('!', '%'))
    )


def process_notebook(path, output_dir='scripts'):
    """
    Convert the notebook at `path` into ``<output_dir>/<name>.py``.

    The file is parsed as JSON; if that fails with ``json.JSONDecodeError``
    it is assumed to already be a Python file and is copied through as
    UTF-8 text. `output_dir` is created when missing. An existing script
    with the same name is overwritten.
    """
    script_id = os.path.splitext(os.path.basename(path))[0]
    with open(path, 'rb') as f:
        raw_content = f.read()
    try:
        nb_content = json.loads(raw_content)
    except json.JSONDecodeError:
        # not a notebook, we assume it's already a Python file
        script_as_python = raw_content.decode('utf-8')
    else:
        script_as_python = notebook_to_python(nb_content)

    script_dir = pathlib.Path(output_dir)
    script_dir.mkdir(parents=True, exist_ok=True)

    # Explicit encoding: the input is treated as UTF-8, so the output must
    # not depend on the platform default.
    with open(script_dir / f'{script_id}.py', 'w', encoding='utf-8') as f:
        f.write(script_as_python)


def main(directory, output_dir='scripts'):
    """
    Take notebooks under the given directory, convert them to python files, and
    create a directory structure ready to be executed.

    `directory` is walked recursively and every ``*.ipynb`` file is written
    to `output_dir` as a ``.py`` file named after the notebook. Notebooks
    that share a file name in different sub-directories overwrite each other.
    """
    for root, dirs, files in os.walk(directory):
        # Prune Jupyter's autosave copies in place so os.walk does not
        # descend into them and emit duplicate "*-checkpoint" scripts.
        dirs[:] = [d for d in dirs if d != '.ipynb_checkpoints']
        for name in sorted(files):
            if not name.endswith('.ipynb'):
                continue
            notebook_path = os.path.join(root, name)
            print(f'Converting {notebook_path}')
            process_notebook(notebook_path, output_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='notebook loader')
    parser.add_argument('dir', type=str,
                        help='Directory of the notebooks')
    args = parser.parse_args()
    main(args.dir)
+{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:12","function":{"t":"type","v":{"module":"pandas._libs.tslibs.timestamps","name":"Timestamp"}},"bound_params":{"pos_or_kw":[["ts_input","20130102"]]}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:14","function":{"t":"builtin_function_or_method","v":"getattr"},"params":{"args":[{"t":"module","v":"pandas"},"Series"]}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:14","function":{"t":"type","v":{"module":"pandas.core.series","name":"Series"}},"bound_params":{"pos_or_kw":[["data",{"t":"int"}],["index",[{"t":"int"},{"t":"int"},{"t":"int"},{"t":"int"}]],["dtype","float32"]]}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:18","function":{"t":"type","v":{"module":"pandas.core.arrays.categorical","name":"Categorical"}},"bound_params":{"pos_or_kw":[["values",["test","train","test","train"]]]}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:6","function":{"t":"type","v":{"module":"pandas.core.frame","name":"DataFrame"}},"bound_params":{"pos_or_kw":[["data",{"t":"dict","v":[["A",{"t":"float"}],["B",{"t":{"module":"pandas._libs.tslibs.timestamps","name":"Timestamp"}}],["C",{"t":{"module":"pandas.core.series","name":"Series"}}],["D",{"t":{"module":"numpy","name":"ndarray"},"v":{"dtype":"int32"}}],["E",{"t":{"module":"pandas.core.arrays.categorical","name":"Categorical"}}],["F","foo"]]}]]}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:25","function":{"t":"method","v":{"self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}},"name":"head"}},"bound_params":{}} +{"location":"/Users/tefa/Documents/python-record-api/notebooks/scripts/example.py:26","function":{"t":"method","v":{"self":{"t":{"module":"pandas.core.frame","name":"DataFrame"}},"name":"tail"}},"bound_params":{"pos_or_kw":[["n",{"t":"int"}]]}} 
#!/bin/bash -e
#
# Record and infer the pandas API usage of every converted notebook.
#
# Run from the `notebooks` directory after `notebook_to_python.py` has
# populated `scripts/`. Three stages, each writing one file per notebook:
#   1. scripts/<id>.py    -> output/<id>.jsonl     (python -m record_api)
#   2. output/<id>.jsonl  -> grouped/<id>.jsonl    (python -m record_api.line_counts)
#   3. grouped/<id>.jsonl -> infer_apis/<id>.json  (python -m record_api.infer_apis)
#
# The script is safe to re-run: notebooks that already have a file in
# `output/` are counted as "previously run" and skipped in stage 1.

TOTAL=$(grep -l pandas scripts/*.py | wc -l)
TOTAL_RUN=0
TOTAL_FAILED=0
TOTAL_IGNORED=0
TOTAL_PREVIOUS=0

# -p: a plain `mkdir` aborts the whole script under `-e` when the directory
# is left over from an earlier run.
mkdir -p output grouped infer_apis

echo "Trying to run $TOTAL scripts"

# Stage 1: trace each script and write the raw calls to output/<id>.jsonl
for FNAME in $(grep -l pandas scripts/*.py); do
    echo "Running: $FNAME..."

    SCRIPT_ID=$(basename "$FNAME" .py)
    OUTPUT_FNAME=output/$SCRIPT_ID.jsonl
    if [[ -e $OUTPUT_FNAME ]]; then
        TOTAL_PREVIOUS=$((TOTAL_PREVIOUS + 1))
        continue
    fi

    TOTAL_RUN=$((TOTAL_RUN + 1))

    export PYTHON_RECORD_API_TO_MODULES="pandas"
    export PYTHON_RECORD_API_FROM_MODULES=$SCRIPT_ID
    # Relative to scripts/, where the tracer is started below.
    export PYTHON_RECORD_API_OUTPUT_FILE=../$OUTPUT_FNAME
    # A failing notebook must not stop the batch, so `-e` is lifted here.
    set +e
    # Subshell: the `cd` must not leak into the next iteration, otherwise
    # the second `cd scripts` fails and the `-e $OUTPUT_FNAME` check above
    # is evaluated from the wrong directory.
    (cd scripts && python -m record_api)
    FAILED=$?
    FAILED=$((FAILED != 0 ? 1 : 0))
    TOTAL_FAILED=$((TOTAL_FAILED + FAILED))
    set -e

    echo "Number of scripts: $TOTAL"
    echo "Number of scripts previously run: $TOTAL_PREVIOUS"
    echo "Number of scripts run: $TOTAL_RUN"
    echo "Number of scripts failed: $TOTAL_FAILED"
    echo "Number of scripts ignored (missing data source): $TOTAL_IGNORED"
    echo "*********************************************************************"
done

# Stage 2: collapse the raw calls into unique calls with counts
for FNAME in $(grep -l pandas output/*.jsonl); do
    echo "Running: $FNAME..."

    SCRIPT_ID=$(basename "$FNAME" .jsonl)

    # NOTE(review): assumes record_api.line_counts opens
    # PYTHON_RECORD_API_INPUT as a file path; the previous version passed
    # the bare script id from inside output/ -- confirm against record_api.
    export PYTHON_RECORD_API_INPUT=$FNAME
    export PYTHON_RECORD_API_OUTPUT=grouped/$SCRIPT_ID.jsonl
    python -m record_api.line_counts
done

# Stage 3: infer the API from the grouped calls. The input is the output of
# stage 2 (grouped/), not the freshly created, still empty infer_apis/.
for FNAME in $(grep -l pandas grouped/*.jsonl); do
    echo "Running: $FNAME..."

    SCRIPT_ID=$(basename "$FNAME" .jsonl)

    # NOTE(review): same file-path assumption as in stage 2 -- confirm.
    export PYTHON_RECORD_API_INPUT=$FNAME
    export PYTHON_RECORD_API_OUTPUT=infer_apis/$SCRIPT_ID.json
    export PYTHON_RECORD_API_LABEL=$SCRIPT_ID
    export PYTHON_RECORD_API_MODULES=pandas
    python -m record_api.infer_apis
done