
Commit 2971be2

unfinished linear
1 parent a351f46 · commit 2971be2

7 files changed, +587 -9121 lines

ConceptExplain/Scipy_Linear_Regression.ipynb (+180)
@@ -0,0 +1,387 @@
In [2]:
import tensorflow as tf
import tempfile
import pandas as pd
In [3]:
import urllib.request
In [5]:
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()
urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
urllib.request.urlretrieve("http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)

Out[5]:
('/var/folders/vk/hdq2y0l55v72c696m28nyymr0000gp/T/tmpzrz55s82',
 <http.client.HTTPMessage at 0x119553358>)
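If the mlr.cs.umass.edu mirror is unreachable, the same "Adult" census files are also hosted by the UCI Machine Learning Repository. A fallback sketch; the UCI URLs below are an assumption and are not used by the notebook itself:

# Hypothetical fallback: fetch the census splits from the UCI repository instead.
urllib.request.urlretrieve(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    train_file.name)  # training split
urllib.request.urlretrieve(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    test_file.name)   # test split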
In [6]:
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "gender",
           "capital_gain", "capital_loss", "hours_per_week", "native_country",
           "income_bracket"]
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
# adult.test begins with a non-data comment line, hence skiprows=1.
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)
In [7]:
LABEL_COLUMN = "label"
# Labels in adult.test carry a trailing period (">50K."), so membership
# (">50K" in x) is used rather than an equality test.
df_train[LABEL_COLUMN] = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
df_test[LABEL_COLUMN] = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
In [8]:
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
                       "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
In [9]:
df_train.head(5)

Out[9]:
   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13
1   50  Self-emp-not-inc   83311  Bachelors             13
2   38           Private  215646    HS-grad              9
3   53           Private  234721       11th              7
4   28           Private  338409  Bachelors             13

       marital_status         occupation   relationship   race  gender  \
0       Never-married       Adm-clerical  Not-in-family  White    Male
1  Married-civ-spouse    Exec-managerial        Husband  White    Male
2            Divorced  Handlers-cleaners  Not-in-family  White    Male
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female

   capital_gain  capital_loss  hours_per_week native_country income_bracket  \
0          2174             0              40  United-States          <=50K
1             0             0              13  United-States          <=50K
2             0             0              40  United-States          <=50K
3             0             0              40  United-States          <=50K
4             0             0              40           Cuba          <=50K

   label
0      0
1      0
2      0
3      0
4      0
In [10]:
def input_fn(df):
    # Creates a dictionary mapping from each continuous feature column name (k)
    # to the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values)
                       for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {k: tf.SparseTensor(
        indices=[[i, 0] for i in range(df[k].size)],
        values=df[k].values,
        dense_shape=[df[k].size, 1])
        for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one. (The original
    # dict(a.items() + b.items()) only works in Python 2; this notebook runs
    # on a Python 3 kernel.)
    feature_cols = {**continuous_cols, **categorical_cols}
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)
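The input functions can be exercised directly in a TF 1.x graph session as a quick sanity check. This is a minimal sketch, not part of the notebook, assuming the cells above have been run:

# Hypothetical sanity check: build the input tensors and inspect a few values.
features, label = train_input_fn()
with tf.Session() as sess:
    print(sess.run(tf.shape(label)))      # number of training rows in adult.data
    print(sess.run(features["age"])[:5])  # first five ages, matching df_train.head()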
In [11]:
gender = tf.contrib.layers.sparse_column_with_keys(
    column_name="gender", keys=["Female", "Male"])
In [12]:
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
In [13]:
race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100)
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)
The notebook ends with an empty code cell. Notebook metadata: kernel "Python [conda env:snakes]", Python 3.6.1, nbformat 4.
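Consistent with the commit message ("unfinished linear"), no model is defined yet. A minimal sketch of the usual next steps with the TF 1.x tf.contrib.learn API, in the style of the TensorFlow wide/linear tutorial this notebook appears to follow; the continuous-column definitions, model_dir, and step counts below are assumptions, not part of this commit:

# Hypothetical continuation: wrap the continuous features as real-valued columns
# and train a plain linear (wide) classifier over all the columns defined above.
age = tf.contrib.layers.real_valued_column("age")
education_num = tf.contrib.layers.real_valued_column("education_num")
capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")

model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(
    feature_columns=[gender, education, race, marital_status, relationship,
                     workclass, occupation, native_country,
                     age, education_num, capital_gain, capital_loss, hours_per_week],
    model_dir=model_dir)

m.fit(input_fn=train_input_fn, steps=200)
results = m.evaluate(input_fn=eval_input_fn, steps=1)
print(results["accuracy"])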
