Skip to content

Commit e8c79f7

Browse files
committed
Add datapackages
1 parent 5558b42 commit e8c79f7

File tree

3 files changed

+403
-2
lines changed

3 files changed

+403
-2
lines changed
File renamed without changes.

solutions/src/data/00_add-metadata.ipynb

+284-2
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,296 @@
99
"import datapackage"
1010
]
1111
},
12+
{
13+
"cell_type": "markdown",
14+
"metadata": {},
15+
"source": [
16+
"The `datapackage` library allows you to work with data packages, so we start by creating a blank data package like so:"
17+
]
18+
},
1219
{
1320
"cell_type": "code",
14-
"execution_count": null,
21+
"execution_count": 20,
1522
"metadata": {},
1623
"outputs": [],
1724
"source": [
18-
"package = "
25+
"package = datapackage.Package()"
26+
]
27+
},
28+
{
29+
"cell_type": "markdown",
30+
"metadata": {},
31+
"source": [
32+
"We can now add useful metadata by adding keys to the package's `descriptor` dictionary. We will start by adding the `name` key and the human-readable `title` key. For a list of the supported keys, check the [Data Package spec](https://frictionlessdata.io/specs/data-package/#metadata)."
1933
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 21,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"package.descriptor['name'] = 'winemag-reviews'\n",
42+
"package.descriptor['title'] = 'Winemag wine reviews dataset'"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 4,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"data": {
52+
"text/html": [
53+
"<div>\n",
54+
"<style scoped>\n",
55+
" .dataframe tbody tr th:only-of-type {\n",
56+
" vertical-align: middle;\n",
57+
" }\n",
58+
"\n",
59+
" .dataframe tbody tr th {\n",
60+
" vertical-align: top;\n",
61+
" }\n",
62+
"\n",
63+
" .dataframe thead th {\n",
64+
" text-align: right;\n",
65+
" }\n",
66+
"</style>\n",
67+
"<table border=\"1\" class=\"dataframe\">\n",
68+
" <thead>\n",
69+
" <tr style=\"text-align: right;\">\n",
70+
" <th></th>\n",
71+
" <th>country</th>\n",
72+
" <th>description</th>\n",
73+
" <th>designation</th>\n",
74+
" <th>points</th>\n",
75+
" <th>price</th>\n",
76+
" <th>province</th>\n",
77+
" <th>region_1</th>\n",
78+
" <th>region_2</th>\n",
79+
" <th>taster_name</th>\n",
80+
" <th>taster_twitter_handle</th>\n",
81+
" <th>title</th>\n",
82+
" <th>variety</th>\n",
83+
" <th>winery</th>\n",
84+
" </tr>\n",
85+
" </thead>\n",
86+
" <tbody>\n",
87+
" <tr>\n",
88+
" <th>0</th>\n",
89+
" <td>Italy</td>\n",
90+
" <td>Aromas include tropical fruit, broom, brimston...</td>\n",
91+
" <td>Vulkà Bianco</td>\n",
92+
" <td>87</td>\n",
93+
" <td>NaN</td>\n",
94+
" <td>Sicily &amp; Sardinia</td>\n",
95+
" <td>Etna</td>\n",
96+
" <td>NaN</td>\n",
97+
" <td>Kerin O’Keefe</td>\n",
98+
" <td>@kerinokeefe</td>\n",
99+
" <td>Nicosia 2013 Vulkà Bianco (Etna)</td>\n",
100+
" <td>White Blend</td>\n",
101+
" <td>Nicosia</td>\n",
102+
" </tr>\n",
103+
" <tr>\n",
104+
" <th>1</th>\n",
105+
" <td>Portugal</td>\n",
106+
" <td>This is ripe and fruity, a wine that is smooth...</td>\n",
107+
" <td>Avidagos</td>\n",
108+
" <td>87</td>\n",
109+
" <td>15.0</td>\n",
110+
" <td>Douro</td>\n",
111+
" <td>NaN</td>\n",
112+
" <td>NaN</td>\n",
113+
" <td>Roger Voss</td>\n",
114+
" <td>@vossroger</td>\n",
115+
" <td>Quinta dos Avidagos 2011 Avidagos Red (Douro)</td>\n",
116+
" <td>Portuguese Red</td>\n",
117+
" <td>Quinta dos Avidagos</td>\n",
118+
" </tr>\n",
119+
" <tr>\n",
120+
" <th>2</th>\n",
121+
" <td>US</td>\n",
122+
" <td>Tart and snappy, the flavors of lime flesh and...</td>\n",
123+
" <td>NaN</td>\n",
124+
" <td>87</td>\n",
125+
" <td>14.0</td>\n",
126+
" <td>Oregon</td>\n",
127+
" <td>Willamette Valley</td>\n",
128+
" <td>Willamette Valley</td>\n",
129+
" <td>Paul Gregutt</td>\n",
130+
" <td>@paulgwine</td>\n",
131+
" <td>Rainstorm 2013 Pinot Gris (Willamette Valley)</td>\n",
132+
" <td>Pinot Gris</td>\n",
133+
" <td>Rainstorm</td>\n",
134+
" </tr>\n",
135+
" <tr>\n",
136+
" <th>3</th>\n",
137+
" <td>US</td>\n",
138+
" <td>Pineapple rind, lemon pith and orange blossom ...</td>\n",
139+
" <td>Reserve Late Harvest</td>\n",
140+
" <td>87</td>\n",
141+
" <td>13.0</td>\n",
142+
" <td>Michigan</td>\n",
143+
" <td>Lake Michigan Shore</td>\n",
144+
" <td>NaN</td>\n",
145+
" <td>Alexander Peartree</td>\n",
146+
" <td>NaN</td>\n",
147+
" <td>St. Julian 2013 Reserve Late Harvest Riesling ...</td>\n",
148+
" <td>Riesling</td>\n",
149+
" <td>St. Julian</td>\n",
150+
" </tr>\n",
151+
" <tr>\n",
152+
" <th>4</th>\n",
153+
" <td>US</td>\n",
154+
" <td>Much like the regular bottling from 2012, this...</td>\n",
155+
" <td>Vintner's Reserve Wild Child Block</td>\n",
156+
" <td>87</td>\n",
157+
" <td>65.0</td>\n",
158+
" <td>Oregon</td>\n",
159+
" <td>Willamette Valley</td>\n",
160+
" <td>Willamette Valley</td>\n",
161+
" <td>Paul Gregutt</td>\n",
162+
" <td>@paulgwine</td>\n",
163+
" <td>Sweet Cheeks 2012 Vintner's Reserve Wild Child...</td>\n",
164+
" <td>Pinot Noir</td>\n",
165+
" <td>Sweet Cheeks</td>\n",
166+
" </tr>\n",
167+
" </tbody>\n",
168+
"</table>\n",
169+
"</div>"
170+
],
171+
"text/plain": [
172+
" country description \\\n",
173+
"0 Italy Aromas include tropical fruit, broom, brimston... \n",
174+
"1 Portugal This is ripe and fruity, a wine that is smooth... \n",
175+
"2 US Tart and snappy, the flavors of lime flesh and... \n",
176+
"3 US Pineapple rind, lemon pith and orange blossom ... \n",
177+
"4 US Much like the regular bottling from 2012, this... \n",
178+
"\n",
179+
" designation points price province \\\n",
180+
"0 Vulkà Bianco 87 NaN Sicily & Sardinia \n",
181+
"1 Avidagos 87 15.0 Douro \n",
182+
"2 NaN 87 14.0 Oregon \n",
183+
"3 Reserve Late Harvest 87 13.0 Michigan \n",
184+
"4 Vintner's Reserve Wild Child Block 87 65.0 Oregon \n",
185+
"\n",
186+
" region_1 region_2 taster_name \\\n",
187+
"0 Etna NaN Kerin O’Keefe \n",
188+
"1 NaN NaN Roger Voss \n",
189+
"2 Willamette Valley Willamette Valley Paul Gregutt \n",
190+
"3 Lake Michigan Shore NaN Alexander Peartree \n",
191+
"4 Willamette Valley Willamette Valley Paul Gregutt \n",
192+
"\n",
193+
" taster_twitter_handle title \\\n",
194+
"0 @kerinokeefe Nicosia 2013 Vulkà Bianco (Etna) \n",
195+
"1 @vossroger Quinta dos Avidagos 2011 Avidagos Red (Douro) \n",
196+
"2 @paulgwine  Rainstorm 2013 Pinot Gris (Willamette Valley) \n",
197+
"3 NaN St. Julian 2013 Reserve Late Harvest Riesling ... \n",
198+
"4 @paulgwine  Sweet Cheeks 2012 Vintner's Reserve Wild Child... \n",
199+
"\n",
200+
" variety winery \n",
201+
"0 White Blend Nicosia \n",
202+
"1 Portuguese Red Quinta dos Avidagos \n",
203+
"2 Pinot Gris Rainstorm \n",
204+
"3 Riesling St. Julian \n",
205+
"4 Pinot Noir Sweet Cheeks "
206+
]
207+
},
208+
"execution_count": 4,
209+
"metadata": {},
210+
"output_type": "execute_result"
211+
}
212+
],
213+
"source": [
214+
"## Loading the dataset\n",
215+
"import pandas as pd\n",
216+
"wine = pd.read_csv('../../data/raw/winemag-data-130k-v2.csv', index_col=0)\n",
217+
"wine.head()"
218+
]
219+
},
220+
{
221+
"cell_type": "markdown",
222+
"metadata": {},
223+
"source": [
224+
"## Inferring the data schema"
225+
]
226+
},
227+
{
228+
"cell_type": "code",
229+
"execution_count": 27,
230+
"metadata": {},
231+
"outputs": [
232+
{
233+
"ename": "DataPackageException",
234+
"evalue": "Local path \"..\\..\\data\\data.csv\" is not safe",
235+
"output_type": "error",
236+
"traceback": [
237+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
238+
"\u001b[1;31mDataPackageException\u001b[0m Traceback (most recent call last)",
239+
"\u001b[1;32m<ipython-input-27-f323e9bb32d7>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mpackage\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minfer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'../../*/*.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
240+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\package.py\u001b[0m in \u001b[0;36minfer\u001b[1;34m(self, pattern)\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[0moptions\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'recursive'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m}\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;34m'**'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mpattern\u001b[0m \u001b[1;32melse\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 191\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mpath\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mglob\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mglob\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__base_path\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpattern\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 192\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_resource\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m{\u001b[0m\u001b[1;34m'path'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrelpath\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__base_path\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 193\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 194\u001b[0m \u001b[1;31m# Resources\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
241+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\package.py\u001b[0m in \u001b[0;36madd_resource\u001b[1;34m(self, descriptor)\u001b[0m\n\u001b[0;32m 161\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__current_descriptor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'resources'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 162\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__current_descriptor\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'resources'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdescriptor\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 163\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__build\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 164\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__resources\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 165\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
242+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\package.py\u001b[0m in \u001b[0;36m__build\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 305\u001b[0m \u001b[0mbase_path\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__base_path\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 306\u001b[0m \u001b[0mstorage\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__storage\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 307\u001b[1;33m package=self)\n\u001b[0m\u001b[0;32m 308\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mresource\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__resources\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mupdated_resource\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
243+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\resource.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, descriptor, base_path, strict, storage, package, **options)\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 66\u001b[0m \u001b[1;31m# Build resource\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 67\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__build\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 68\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 69\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
244+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\resource.py\u001b[0m in \u001b[0;36m__build\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 319\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__current_descriptor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'path'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 320\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__base_path\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 321\u001b[1;33m self.__storage)\n\u001b[0m\u001b[0;32m 322\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[1;31m# Instantiate profile\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
245+
"\u001b[1;32m~\\Anaconda3\\envs\\reproPython\\lib\\site-packages\\datapackage\\resource.py\u001b[0m in \u001b[0;36m_inspect_source\u001b[1;34m(data, path, base_path, storage)\u001b[0m\n\u001b[0;32m 479\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mhelpers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_safe_path\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 480\u001b[0m raise exceptions.DataPackageException(\n\u001b[1;32m--> 481\u001b[1;33m 'Local path \"%s\" is not safe' % path[0])\n\u001b[0m\u001b[0;32m 482\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 483\u001b[0m \u001b[1;31m# Not base path\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
246+
"\u001b[1;31mDataPackageException\u001b[0m: Local path \"..\\..\\data\\data.csv\" is not safe"
247+
]
248+
}
249+
],
250+
"source": [
251+
"package.infer('../../*/*.csv')"
252+
]
253+
},
254+
{
255+
"cell_type": "code",
256+
"execution_count": 26,
257+
"metadata": {},
258+
"outputs": [
259+
{
260+
"data": {
261+
"text/plain": [
262+
"{'profile': 'data-package',\n",
263+
" 'name': 'winemag-reviews',\n",
264+
" 'title': 'Winemag wine reviews dataset'}"
265+
]
266+
},
267+
"execution_count": 26,
268+
"metadata": {},
269+
"output_type": "execute_result"
270+
}
271+
],
272+
"source": [
273+
"package.descriptor"
274+
]
275+
},
276+
{
277+
"cell_type": "code",
278+
"execution_count": 19,
279+
"metadata": {},
280+
"outputs": [
281+
{
282+
"data": {
283+
"text/plain": [
284+
"{'path': '..\\\\..\\\\data\\\\chile.csv', 'profile': 'data-resource'}"
285+
]
286+
},
287+
"execution_count": 19,
288+
"metadata": {},
289+
"output_type": "execute_result"
290+
}
291+
],
292+
"source": [
293+
"package.descriptor['resources'][0]"
294+
]
295+
},
296+
{
297+
"cell_type": "code",
298+
"execution_count": null,
299+
"metadata": {},
300+
"outputs": [],
301+
"source": []
20302
}
21303
],
22304
"metadata": {

0 commit comments

Comments
 (0)