TuxML
diff --git a/‎option_columns.json
Lines changed: 1 addition & 0 deletions b/‎option_columns.json
Lines changed: 1 addition & 0 deletions
diff --git a/‎size_analysis.ipynb
Lines changed: 124 additions & 0 deletions b/‎size_analysis.ipynb
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import json\n",
+    "# option_columns.json supplies the column names whose dtype is forced below.\n",
+    "with open(\"option_columns.json\",\"r\") as columns_file:\n",
+    "    option_columns = json.load(columns_file)\n",
+    "\n",
+    "# The dataset can be downloaded from http://37.187.140.181/tuxml_dataset/\n",
+    "# Every column listed in option_columns is parsed as int8\n",
+    "# (presumably to keep the very wide frame small in memory -- TODO confirm).\n",
+    "option_dtypes = {column: \"int8\" for column in option_columns}\n",
+    "df = pd.read_csv(\"../tuxml-datasets/dataset_encoded_size.csv\", dtype=option_dtypes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(92473, 12638)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Filter configurations: keep only rows with cid >= 30000 and a\n",
+    "# non-negative kernel_size. Done in place so the frame is not rebound.\n",
+    "df.query(\"cid >= 30000 and kernel_size >= 0\", inplace=True)\n",
+    "\n",
+    "# Any value still missing after filtering is replaced by -1.\n",
+    "df.fillna(value=-1, inplace=True)\n",
+    "\n",
+    "# Show (rows, columns) of the filtered frame.\n",
+    "df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import tree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fixed seed so the split, the fitted tree and the error statistics below\n",
+    "# are reproducible under Restart Kernel -> Run All.\n",
+    "SEED = 42\n",
+    "\n",
+    "# Features are every column except cid and the target kernel_size; both are\n",
+    "# dropped in a single call to avoid an extra intermediate copy of the frame.\n",
+    "# test_size=0.9 means only 10% of the rows are used for training.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    df.drop(columns=[\"cid\", \"kernel_size\"]),\n",
+    "    df[\"kernel_size\"],\n",
+    "    test_size=0.9,\n",
+    "    random_state=SEED,\n",
+    ")\n",
+    "\n",
+    "# The tree is seeded as well: scikit-learn permutes features at each split,\n",
+    "# so ties between equally good splits are otherwise broken randomly.\n",
+    "reg = tree.DecisionTreeRegressor(random_state=SEED)\n",
+    "reg.fit(X_train, y_train)\n",
+    "\n",
+    "# Predictions for the held-out rows, compared against y_test in the next cell.\n",
+    "y_pred = reg.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count    83226.000000\n",
+       "mean        20.660668\n",
+       "std         26.782288\n",
+       "min          0.000392\n",
+       "25%          6.686377\n",
+       "50%         14.923435\n",
+       "75%         26.849332\n",
+       "max       1678.388727\n",
+       "Name: % error, dtype: float64"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Signed difference between each prediction and the actual value of the test rows.\n",
+    "residual = y_pred - y_test\n",
+    "\n",
+    "# One row per test sample: actual value, prediction, absolute error, and\n",
+    "# absolute error expressed as a percentage of the actual value.\n",
+    "dfErrors = pd.DataFrame({\n",
+    "    'Actual': y_test,\n",
+    "    'Predicted': y_pred,\n",
+    "    \"error\": residual.abs(),\n",
+    "    \"% error\": (residual / y_test).abs() * 100,\n",
+    "})\n",
+    "\n",
+    "# Summary statistics of the percentage error.\n",
+    "dfErrors[\"% error\"].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}