From 250ceca2a997afef480ce2222f9aa833fc4c8535 Mon Sep 17 00:00:00 2001 From: Lewis Fogden Date: Sun, 24 Mar 2024 21:43:52 +0000 Subject: [PATCH] added rectify() --- examples/notebook/table_documentation.ipynb | 625 +++++++++++++------- src/heavylight/heavytables.py | 19 +- 2 files changed, 434 insertions(+), 210 deletions(-) diff --git a/examples/notebook/table_documentation.ipynb b/examples/notebook/table_documentation.ipynb index 7d5923e..570bafb 100644 --- a/examples/notebook/table_documentation.ipynb +++ b/examples/notebook/table_documentation.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -101,7 +101,7 @@ "30 50 1.97" ] }, - "execution_count": 2, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -165,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -174,7 +174,7 @@ "array([1.17, 0.77, 0.77, 1.97])" ] }, - "execution_count": 5, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -248,7 +248,7 @@ "2 C 0.9" ] }, - "execution_count": 6, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -283,7 +283,7 @@ "0.3" ] }, - "execution_count": 8, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -294,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -303,7 +303,7 @@ "0.5" ] }, - "execution_count": 9, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -314,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -323,7 +323,7 @@ "0.5" ] }, - "execution_count": 10, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -334,7 +334,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -343,7 +343,7 @@ "array([0.3, 0.3, 0.9, 0.5])" ] }, - "execution_count": 11, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -389,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -468,7 +468,7 @@ "6 inf age_to_inf" ] }, - "execution_count": 13, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -490,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -506,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -515,7 +515,7 @@ "'age_to_020'" ] }, - "execution_count": 15, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -526,7 +526,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -535,7 +535,7 @@ "'age_to_020'" ] }, - "execution_count": 16, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -546,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -555,7 +555,7 @@ "'age_to_030'" ] }, - "execution_count": 17, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -566,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -575,7 +575,7 @@ "'age_to_020'" ] }, - "execution_count": 18, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -587,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -608,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -729,7 +729,7 @@ "13 45.000000 age_to_060" ] }, - "execution_count": 20, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -750,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -823,7 +823,7 @@ "5 10 0.10" ] }, - "execution_count": 21, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -838,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -847,7 +847,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -856,7 +856,7 @@ "" ] }, - "execution_count": 23, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" }, @@ -887,7 +887,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -1005,7 +1005,7 @@ "[105 rows x 3 columns]" ] }, - "execution_count": 24, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1021,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1037,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -1046,7 +1046,7 @@ "4.4074642432000015" ] }, - "execution_count": 26, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -1064,7 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -1074,7 +1074,7 @@ " 4.40746424])" ] }, - "execution_count": 27, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -1099,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 58, "metadata": {}, "outputs": [ { @@ -1179,7 +1179,7 @@ "5 2 B 22" ] }, - "execution_count": 28, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -1195,20 +1195,20 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 59, "metadata": {}, "outputs": [ { - "ename": "AssertionError", - "evalue": "", + "ename": "ValueError", + "evalue": "Input `df` is not rectangular, expected_rows=8 != len(self.values)=6", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[29], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# This will trigger an assertion error (to fix:should raise a proper Exception)\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tab6 \u001b[38;5;241m=\u001b[39m \u001b[43mTable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf6\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/Dev/heavylight/src/heavylight/heavytables.py:144\u001b[0m, in \u001b[0;36mTable.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcol_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not implemented on \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcol\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 142\u001b[0m \u001b[38;5;66;03m# create an intkeytable\u001b[39;00m\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_int_key_table \u001b[38;5;241m=\u001b[39m \u001b[43mIntKeyTable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_int_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;241m=\u001b[39m df\n", - "File \u001b[0;32m~/Dev/heavylight/src/heavylight/heavytables.py:77\u001b[0m, in \u001b[0;36mIntKeyTable.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 74\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalue_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 76\u001b[0m expected_rows \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mprod(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mranges)\n\u001b[0;32m---> 77\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m expected_rows \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues)\n", - "\u001b[0;31mAssertionError\u001b[0m: " + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[59], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# This will trigger an assertion error (to fix:should raise a proper Exception)\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tab6 \u001b[38;5;241m=\u001b[39m \u001b[43mTable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf6\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/Dev/heavylight/src/heavylight/heavytables.py:146\u001b[0m, in \u001b[0;36mTable.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcol_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not implemented on \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcol\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 144\u001b[0m \u001b[38;5;66;03m# create an intkeytable\u001b[39;00m\n\u001b[0;32m--> 146\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_int_key_table \u001b[38;5;241m=\u001b[39m \u001b[43mIntKeyTable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_int_keys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 147\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdf \u001b[38;5;241m=\u001b[39m df\n", + "File \u001b[0;32m~/Dev/heavylight/src/heavylight/heavytables.py:79\u001b[0m, in \u001b[0;36mIntKeyTable.__init__\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 77\u001b[0m expected_rows \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mprod(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mranges)\n\u001b[1;32m 78\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expected_rows \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues):\n\u001b[0;32m---> 79\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mInput `df` is not rectangular, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexpected_rows\u001b[38;5;132;01m=}\u001b[39;00m\u001b[38;5;124m != \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvalues)\u001b[38;5;132;01m=}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mValueError\u001b[0m: Input `df` is not rectangular, expected_rows=8 != len(self.values)=6" ] } ], @@ -1227,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 60, "metadata": {}, "outputs": [ { @@ -1369,7 +1369,7 @@ "[53560 rows x 5 columns]" ] }, - "execution_count": 32, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -1383,7 +1383,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 61, "metadata": {}, "outputs": [ { @@ -1392,7 +1392,7 @@ "106090" ] }, - "execution_count": 40, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -1403,7 +1403,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -1477,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -1486,7 +1486,7 @@ "['Age|int', 'Duration|int', 'Underwriting|str', 'Sex|str']" ] }, - "execution_count": 44, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1498,7 +1498,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -1628,7 +1628,7 @@ "[106090 rows x 4 columns]" ] }, - "execution_count": 72, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1646,7 +1646,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 65, "metadata": {}, "outputs": [ { @@ -1788,7 +1788,7 @@ "[106090 rows x 5 columns]" ] }, - "execution_count": 74, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -1801,7 +1801,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 66, "metadata": {}, "outputs": [ { @@ -1810,7 +1810,7 @@ "52530" ] }, - "execution_count": 76, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -1821,7 +1821,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -1830,7 +1830,7 @@ "True" ] }, - "execution_count": 79, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -1842,7 +1842,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1851,7 +1851,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 69, "metadata": {}, "outputs": [ { @@ -1909,7 +1909,7 @@ "1 19 2 NS_P Female 0.00033" ] }, - "execution_count": 81, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -1920,7 +1920,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 70, "metadata": {}, "outputs": [ { @@ -1929,7 +1929,7 @@ "0.00033" ] }, - "execution_count": 82, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } @@ -1940,7 +1940,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 71, "metadata": {}, "outputs": [ { @@ -1973,44 +1973,44 @@ " \n", " \n", " \n", - " 25180\n", - " 117\n", - " 54\n", + " 27628\n", + " 78\n", + " 53\n", " NS_SP\n", - " Female\n", - " 0.85143\n", - " \n", - " \n", - " 17335\n", - " 37\n", - " 7\n", - " NS_R\n", " Male\n", - " 0.00047\n", + " 0.03014\n", " \n", " \n", - " 32688\n", - " 70\n", - " 48\n", - " S_P\n", + " 46812\n", + " 107\n", + " 40\n", + " S_R\n", " Female\n", - " 0.02309\n", + " 0.49811\n", " \n", " \n", - " 53529\n", - " 118\n", - " 6\n", + " 47039\n", + " 84\n", + " 12\n", " S_R\n", - " Male\n", - " 0.89977\n", + " Female\n", + " 0.09072\n", " \n", " \n", - " 45607\n", - " 80\n", - " 32\n", + " 44495\n", + " 67\n", + " 33\n", " S_R\n", " Female\n", - " 0.06138\n", + " 0.02112\n", + " \n", + " \n", + " 39691\n", + " 45\n", + " 4\n", + " S_P\n", + " Male\n", + " 0.00160\n", " \n", " \n", " ...\n", @@ -2021,44 +2021,44 @@ " ...\n", " \n", " \n", - " 52443\n", - " 85\n", - " 12\n", + " 50522\n", + " 86\n", + " 44\n", " S_R\n", " Male\n", - " 0.09892\n", + " 0.12620\n", " \n", " \n", - " 25609\n", - " 78\n", - " 6\n", + " 23848\n", + " 115\n", + " 72\n", " NS_SP\n", " Female\n", - " 0.01140\n", + " 0.76487\n", " \n", " \n", - " 35805\n", - " 87\n", - " 25\n", - " S_P\n", + " 1806\n", + " 57\n", + " 21\n", + " NS_P\n", " Female\n", - " 0.12402\n", + " 0.00277\n", " \n", " \n", - " 49964\n", - " 95\n", - " 60\n", - " S_R\n", - " Male\n", - " 0.25410\n", + " 14096\n", + " 102\n", + " 45\n", + " NS_R\n", + " Female\n", + " 0.38258\n", " \n", " \n", - " 2035\n", - " 40\n", - " 1\n", + " 2339\n", + " 107\n", + " 65\n", " NS_P\n", " Female\n", - " 0.00016\n", + " 0.49811\n", " \n", " \n", "\n", @@ -2067,22 +2067,22 @@ ], "text/plain": [ " Age|int Duration|int Underwriting|str Sex|str vals\n", - "25180 117 54 NS_SP Female 0.85143\n", - "17335 37 7 NS_R Male 0.00047\n", - "32688 70 48 S_P Female 0.02309\n", - "53529 118 6 S_R Male 0.89977\n", - "45607 80 32 S_R Female 0.06138\n", + "27628 78 53 NS_SP Male 0.03014\n", + "46812 107 40 S_R Female 0.49811\n", + "47039 84 12 S_R Female 0.09072\n", + "44495 67 33 S_R Female 0.02112\n", + "39691 45 4 S_P Male 0.00160\n", "... ... ... ... ... ...\n", - "52443 85 12 S_R Male 0.09892\n", - "25609 78 6 NS_SP Female 0.01140\n", - "35805 87 25 S_P Female 0.12402\n", - "49964 95 60 S_R Male 0.25410\n", - "2035 40 1 NS_P Female 0.00016\n", + "50522 86 44 S_R Male 0.12620\n", + "23848 115 72 NS_SP Female 0.76487\n", + "1806 57 21 NS_P Female 0.00277\n", + "14096 102 45 NS_R Female 0.38258\n", + "2339 107 65 NS_P Female 0.49811\n", "\n", "[100000 rows x 5 columns]" ] }, - "execution_count": 93, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -2096,7 +2096,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 72, "metadata": {}, "outputs": [ { @@ -2130,49 +2130,49 @@ " \n", " \n", " \n", - " 25180\n", - " 117\n", - " 54\n", + " 27628\n", + " 78\n", + " 53\n", " NS_SP\n", - " Female\n", - " 0.85143\n", - " 0.85143\n", - " \n", - " \n", - " 17335\n", - " 37\n", - " 7\n", - " NS_R\n", " Male\n", - " 0.00047\n", - " 0.00047\n", + " 0.03014\n", + " 0.03014\n", " \n", " \n", - " 32688\n", - " 70\n", - " 48\n", - " S_P\n", + " 46812\n", + " 107\n", + " 40\n", + " S_R\n", " Female\n", - " 0.02309\n", - " 0.02309\n", + " 0.49811\n", + " 0.49811\n", " \n", " \n", - " 53529\n", - " 118\n", - " 6\n", + " 47039\n", + " 84\n", + " 12\n", " S_R\n", - " Male\n", - " 0.89977\n", - " 0.89977\n", + " Female\n", + " 0.09072\n", + " 0.09072\n", " \n", " \n", - " 45607\n", - " 80\n", - " 32\n", + " 44495\n", + " 67\n", + " 33\n", " S_R\n", " Female\n", - " 0.06138\n", - " 0.06138\n", + " 0.02112\n", + " 0.02112\n", + " \n", + " \n", + " 39691\n", + " 45\n", + " 4\n", + " S_P\n", + " Male\n", + " 0.00160\n", + " 0.00160\n", " \n", " \n", " ...\n", @@ -2184,49 +2184,49 @@ " ...\n", " \n", " \n", - " 52443\n", - " 85\n", - " 12\n", + " 50522\n", + " 86\n", + " 44\n", " S_R\n", " Male\n", - " 0.09892\n", - " 0.09892\n", + " 0.12620\n", + " 0.12620\n", " \n", " \n", - " 25609\n", - " 78\n", - " 6\n", + " 23848\n", + " 115\n", + " 72\n", " NS_SP\n", " Female\n", - " 0.01140\n", - " 0.01140\n", + " 0.76487\n", + " 0.76487\n", " \n", " \n", - " 35805\n", - " 87\n", - " 25\n", - " S_P\n", + " 1806\n", + " 57\n", + " 21\n", + " NS_P\n", " Female\n", - " 0.12402\n", - " 0.12402\n", + " 0.00277\n", + " 0.00277\n", " \n", " \n", - " 49964\n", - " 95\n", - " 60\n", - " S_R\n", - " Male\n", - " 0.25410\n", - " 0.25410\n", + " 14096\n", + " 102\n", + " 45\n", + " NS_R\n", + " Female\n", + " 0.38258\n", + " 0.38258\n", " \n", " \n", - " 2035\n", - " 40\n", - " 1\n", + " 2339\n", + " 107\n", + " 65\n", " NS_P\n", " Female\n", - " 0.00016\n", - " 0.00016\n", + " 0.49811\n", + " 0.49811\n", " \n", " \n", "\n", @@ -2235,22 +2235,22 @@ ], "text/plain": [ " Age|int Duration|int Underwriting|str Sex|str vals val_from_tab7\n", - "25180 117 54 NS_SP Female 0.85143 0.85143\n", - "17335 37 7 NS_R Male 0.00047 0.00047\n", - "32688 70 48 S_P Female 0.02309 0.02309\n", - "53529 118 6 S_R Male 0.89977 0.89977\n", - "45607 80 32 S_R Female 0.06138 0.06138\n", + "27628 78 53 NS_SP Male 0.03014 0.03014\n", + "46812 107 40 S_R Female 0.49811 0.49811\n", + "47039 84 12 S_R Female 0.09072 0.09072\n", + "44495 67 33 S_R Female 0.02112 0.02112\n", + "39691 45 4 S_P Male 0.00160 0.00160\n", "... ... ... ... ... ... ...\n", - "52443 85 12 S_R Male 0.09892 0.09892\n", - "25609 78 6 NS_SP Female 0.01140 0.01140\n", - "35805 87 25 S_P Female 0.12402 0.12402\n", - "49964 95 60 S_R Male 0.25410 0.25410\n", - "2035 40 1 NS_P Female 0.00016 0.00016\n", + "50522 86 44 S_R Male 0.12620 0.12620\n", + "23848 115 72 NS_SP Female 0.76487 0.76487\n", + "1806 57 21 NS_P Female 0.00277 0.00277\n", + "14096 102 45 NS_R Female 0.38258 0.38258\n", + "2339 107 65 NS_P Female 0.49811 0.49811\n", "\n", "[100000 rows x 6 columns]" ] }, - "execution_count": 95, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -2266,7 +2266,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 73, "metadata": {}, "outputs": [ { @@ -2275,7 +2275,7 @@ "True" ] }, - "execution_count": 99, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -2284,6 +2284,213 @@ "np.allclose(df7_test['vals'], df7_test['val_from_tab7'])" ] }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "tab7b = Table(Table.rectify(df7))" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.03014, 0.49811, 0.09072, ..., 0.00277, 0.38258, 0.49811])" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tab7b[\n", + " df7_test['Age|int'].values,\n", + " df7_test['Duration|int'].values,\n", + " df7_test['Underwriting|str'].values,\n", + " df7_test['Sex|str'].values\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Age|intDuration|intUnderwriting|strSex|strvals
10182NS_PFemaleNaN
20183NS_PFemaleNaN
1050193NS_PFemaleNaN
30184NS_PFemaleNaN
1060194NS_PFemaleNaN
..................
100939115103S_RMaleNaN
101969116103S_RMaleNaN
102999117103S_RMaleNaN
104029118103S_RMaleNaN
105059119103S_RMaleNaN
\n", + "

52530 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " Age|int Duration|int Underwriting|str Sex|str vals\n", + "10 18 2 NS_P Female NaN\n", + "20 18 3 NS_P Female NaN\n", + "1050 19 3 NS_P Female NaN\n", + "30 18 4 NS_P Female NaN\n", + "1060 19 4 NS_P Female NaN\n", + "... ... ... ... ... ...\n", + "100939 115 103 S_R Male NaN\n", + "101969 116 103 S_R Male NaN\n", + "102999 117 103 S_R Male NaN\n", + "104029 118 103 S_R Male NaN\n", + "105059 119 103 S_R Male NaN\n", + "\n", + "[52530 rows x 5 columns]" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tab7b.df[tab7b.df.vals.isna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tab7b[18, 2, 'NS_P', 'Female']" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/src/heavylight/heavytables.py b/src/heavylight/heavytables.py index 489aef5..2f02121 100644 --- a/src/heavylight/heavytables.py +++ b/src/heavylight/heavytables.py @@ -1,6 +1,7 @@ # Heavytables # Provides a high performance multiple-index -> single index table +import itertools import pandas as pd import numpy as np @@ -74,7 +75,8 @@ def __init__(self, df: pd.DataFrame): self.values = self.df[self.value_col].values expected_rows = np.prod(self.ranges) - assert expected_rows == len(self.values) + if expected_rows != len(self.values): + raise ValueError(f'Input `df` is not rectangular, {expected_rows=} != {len(self.values)=}') def get_index(self, *keys): index = 0 @@ -159,6 +161,21 @@ def __getitem__(self, keys): def __repr__(self): # TODO: return a nice representation of the table, e.g. head/keys etc. return repr(self.df) + + @staticmethod + def rectify(df: pd.DataFrame) -> pd.DataFrame: + """Convert a triangular (incomplete) dataframe into a valid rectangular dataframe""" + key_cols = list(df.columns[:-1]) + df_unique_keys = [list(df[col_name].unique()) for col_name in key_cols] + # TODO: check that any |int and |int_bound keys form a range. (e.g. use min and max, and then arange(+1)) + + # construct a dataframe from the cartesian product of these + df_rect_keys = pd.DataFrame(itertools.product(*df_unique_keys), columns=key_cols) + + # fill out the table + df_rect = df_rect_keys.merge(right=df, how='left', on = key_cols) + + return df_rect @classmethod def read_excel(cls, spreadsheet_path, sheet_name):