TuxML
diff --git a/‎.ipynb_checkpoints/feature_correlations-checkpoint.ipynb
Lines changed: 133 additions & 0 deletions b/‎.ipynb_checkpoints/feature_correlations-checkpoint.ipynb
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os\n",
+    "\n",
+    "IGRIDA=True \n",
+    "if (IGRIDA):\n",
+    "    dirname = os.path.dirname(__file__)\n",
+    "    dataset_size_location = \"../rf-analysis/all_size_withyes.pkl\"\n",
+    "    dataset_size_filename = os.path.join(dirname, dataset_size_location) \n",
+    "    df = pd.read_pickle(dataset_size_filename)\n",
+    "#else:\n",
+    "#    import tuxml\n",
+    "#    df = tuxml.load_dataset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "size_methods = [\"vmlinux\", \"GZIP-bzImage\", \"GZIP-vmlinux\", \"GZIP\", \"BZIP2-bzImage\", \n",
+    "              \"BZIP2-vmlinux\", \"BZIP2\", \"LZMA-bzImage\", \"LZMA-vmlinux\", \"LZMA\", \"XZ-bzImage\", \"XZ-vmlinux\", \"XZ\", \n",
+    "              \"LZO-bzImage\", \"LZO-vmlinux\", \"LZO\", \"LZ4-bzImage\", \"LZ4-vmlinux\", \"LZ4\"]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _filter(df):\n",
+    "    return df.drop(columns=[\"cid\"]).drop(columns=size_methods)\n",
+    "fdf = _filter(df)\n",
+    "\n",
+    "dict_corr = []\n",
+    "def _samecolumns_after_ithcolumn(fd, i):\n",
+    "    col_i = fd[fd.columns[i]] # the column we want to compare with\n",
+    "    for j in range(i+1, len(fd.columns) - 1):\n",
+    "        if (col_i.equals(fd[fd.columns[j]])):\n",
+    "            dict_corr.append({fd.columns[i] : fd.columns[j]})\n",
+    "\n",
+    "# TIME CONSUMING !!\n",
+    "starting_point = 0\n",
+    "ending_point = len(fdf)\n",
+    "for i in range(starting_point, ending_point):\n",
+    "    _samecolumns_after_ithcolumn(fdf, i)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(dict_corr).to_csv('ft-correlations.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def correlated_to(fd, option_name):\n",
+    "    i = fd.columns.get_loc(option_name)\n",
+    "    col_i = fd[fd.columns[i]] # the column we want to compare with\n",
+    "    for j in range(0, len(fd.columns) - 1):\n",
+    "        if (i == j):\n",
+    "            continue\n",
+    "        if (col_i.equals(fd[fd.columns[j]])):\n",
+    "            print(fd.columns[i], \"correlated to\", fd.columns[j])\n",
+    "\n",
+    "# correlated_to(fdf, \"CC_OPTIMIZE_FOR_SIZE\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: when 'y' value, correlation! \n",
+    "# interesting to see this effect (ie some options are correlated under some conditions)\n",
+    "# fdf.query('CC_OPTIMIZE_FOR_SIZE == 1')['CC_OPTIMIZE_FOR_SIZE'].value_counts(), fdf.query('CC_OPTIMIZE_FOR_SIZE == 1')['KMEMCHECK'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}