diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b60d528 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +listings.csv +password.txt +secretpassword.txt + +# MAC OS +.DS_Store \ No newline at end of file diff --git a/7.seven.txt b/7.seven.txt new file mode 100644 index 0000000..fc8d788 --- /dev/null +++ b/7.seven.txt @@ -0,0 +1 @@ +you are thinking about the number 7 diff --git a/HB_hat_keine_Idee.txt b/HB_hat_keine_Idee.txt new file mode 100644 index 0000000..c43181e --- /dev/null +++ b/HB_hat_keine_Idee.txt @@ -0,0 +1,2 @@ + +nee, wirklich nicht :o) diff --git a/LegolasWhatDoYourElfEyesSee.txt b/LegolasWhatDoYourElfEyesSee.txt new file mode 100644 index 0000000..cb15601 --- /dev/null +++ b/LegolasWhatDoYourElfEyesSee.txt @@ -0,0 +1 @@ +They are taking the hobbits to Isengard!!! diff --git a/OPENME - Kopie.txt b/OPENME - Kopie.txt new file mode 100644 index 0000000..e69de29 diff --git a/OPENME.txt b/OPENME.txt new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 7df0001..dc41a28 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,37 @@ -# Introduction to Python For Data Science -This repo contains the teaching material for the Introduction to Python (and useful libraries) masterclass at the [Data Science Retreat](http://datascienceretreat.com/). +# Intro to git For Data Science modified :) -## Table of Content +This repo is used as a test target for the DSR course on git. + +The original content was material for Python, hence the name, nowadays it acts as a practice target and the +Python files are vestigial. + +## Intro in English +This big repo contains the teaching material for the Introduction to Python (and useful libraries) masterclass at the [Data Science Retreat](http://datascienceretreat.com/), it does not cover Pandas. 
+ +## Вступ українською +Це велике сховище містить навчальний матеріал для майстер-класу «Вступ до Python» (і корисних бібліотек) на [Data Science Retreat](http://datascienceretreat.com/), воно не стосується Pandas. + +## Intro in Finnish +Tämä repo sisältää Suomalaisen version....This big repo contains the teaching material for the Introduction to Python (and useful libraries) masterclass at the [Data Science Retreat](http://datascienceretreat.com/), it does not cover Pandas. + +# some other change that will cause conflicts +here it is + +# new change +blablabla + +# new change 2 +merve was here + +# Intro auf Deutsch +Dieses Repo ist über den Inhalt der ursprünglichen Lektion. + +## Intro in italian +Questo repository contiene il materiale originale del corso di Python per DSR, ora esperimenti su git. +## Intro in Cherokee +ᎯᎠ ᎡᏆ ᎣᏍᏓ ᎨᏒ, ᎬᏂᎨᏒ ᎢᏧᏩᏁᏗᏱ ᎬᏂᎨᏒ ᏅᏓᏍᎩᏴᏁᎵ. + +## Table of contents * [About Me](#about-me) * [The Python Programming Language](#the-python-programming-language) @@ -12,32 +42,31 @@ This repo contains the teaching material for the Introduction to Python (and use * [Running the IPython interpreter and a python file](#running-the-ipython-interpreter-and-a-python-file) * [Jupyter Notebook](#jupyter-notebook) * [Python basics](#python-basics) -* [Pandas](#pandas) - * [Intro tutorial on pandas basics](#intro-tutorials-on-pandas-basics) - * [Data Munging with Pandas](#data-munging-with-pandas) * [NumPy and Matplotlib](#numpy-and-matplotlib) * [NumPy](#numpy) * [Matplotlib](#matplotlib) -* [Scikit-learn and your first Data Science case](#scikit-learn-and-your-first-data-science-case) - * [Scikit-learn](#scikit-learn) - * [Your first Data Science case](#your-first-data-science-case) * [SciPy](#scipy) - +## Hi +Ciao ## About me -Slides for this section can be found [here](https://slides.com/utstikkar/introtopython-aboutme). 
+Slides for this section can be found [here](https://docs.google.com/presentation/d/e/2PACX-1vTbd4eONN5nSiNaTWW3uM2RM3O0jsoVT8gQ9byqa0X5vStBZGUBfiUSM7-HegCjymaDbaUzQ-9yyvMR/pub). ## The Python Programming Language -Slide deck for this entire section is available [here](https://slides.com/utstikkar/introtopython-pythonproglanguage). +Complete slides [here](https://docs.google.com/presentation/d/e/2PACX-1vRPV8i3pQw7MCa6eG-9y9LgIFREJF_3sN4opFDXQ2r_NJgea9ObLJQfj4S_CiM6Ptxs7t0WU6lCa-QH/pub?start=false&loop=false&delayms=3000), inclusive of exercises. -### Why Python? -Slides on this topic start [here](http://slides.com/utstikkar/introtopython-pythonproglanguage#/1) +Extra links: + * [The SciPy Lectures -- The Python Language](http://scipy-lectures.github.io/intro/language/python_language.html). -### Python for DS Components -Slides on this topic start [here](http://slides.com/utstikkar/introtopython-pythonproglanguage#/5) +Practice those examples using alternatively python files, the IPython interpreter and an IPython Notebook. + +To practice: +* [Python interactive exercises](http://codingbat.com/python) +* [Join the codewars competitions](http://www.codewars.com/?language=python) ### Python 2 vs. Python 3 -Slides on this topic start [here](http://slides.com/utstikkar/introtopython-pythonproglanguage#/6) + +Note: as explained in the lesson you should now just go with Python 3. These links are from more than 2 years ago but still useful if you need to use old libraries. A great [notebook](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/key_differences_between_python_2_and_3.ipynb) covering the main differences has been written by Sebastian Raschka. @@ -54,23 +83,28 @@ The most basic interactive Python command line, where each line starts with a `> #### IDLE Standard editor in Python distributions, easy to use but very basic. +###ADD-ME +added add me + #### IPython A more sophisticated interactive Python command line. 
It incorporates tab-completion, interactive help and regular shell commands. Also look up the `%`-magic commands. + #### Spyder **Spyder** is part of the **Anaconda** Python distribution. It is a small IDE mostly for data analysis, similar to RStudio. It automatically highlights Syntax errors, contains a variable explorer, debugging functionality and other useful things. + #### Jupyter Notebooks Interactive environment for the web browser. A Jupyter notebook contains Python code, text, images and any output from your program (including plots!). It is a great tool for exploratory data analysis. #### Sublime2 A general-purpose text editor that works on all systems. There are many plugins for Python available. There are a free and a commercial version available. -#### Atom -The Open Source cousin of Sublime2. +#### Visual Studio Code +The Open Source cousin of Sublime2, similar to Atom. #### PyCharm -PyCharm is probably the most luxurious IDE for Python. It contains tons of functions that are a superset of all the above. PyCharm is a great choice for bigger Python projects. +PyCharm is probably the most luxurious IDE for Python. It contains tons of functions that are a superset of all the above. PyCharm is a great choice for bigger Python projects. Free for non-commercial use. #### Notepad++ If you must use a text editor on Windows to edit Python code, refuse to use anything worse than **Notepad++**. @@ -85,33 +119,20 @@ I know people who are successfully using Emacs to write Python code, but haven't Slides on this topic start [here](http://slides.com/utstikkar/introtopython-pythonproglanguage#/12) ### Jupyter Notebook -A live demo will be given during the masterclass. +A live demo will be given during the masterclass. 
Here just a [warning note](https://docs.google.com/presentation/d/e/2PACX-1vR2ntOr6vWHgHoC0X3arDtim9fIhaoF7r6Vl5fVjxSXeXpD2NRykOSR_UyQzbtjppD2tiqwkw2peMfQ/pub?start=false&loop=false&delayms=3000) Experiment further with the IPython Notebook environment with [this Jupyter Notebook](http://nbviewer.ipython.org/github/ipython/ipython/blob/2.x/examples/Notebook/Running%20Code.ipynb). Try to clone or download it, before opening it, running and modifying its cells. Many more Jupyter features in [this blog post](http://arogozhnikov.github.io/2016/09/10/jupyter-features.html). -### Python basics -Times to get your hands dirty. Read and test for yourself the examples provided in: [The SciPy Lectures -- The Python Language](http://scipy-lectures.github.io/intro/language/python_language.html). - -Practice those examples using alternatively python files, the IPython interpreter and an IPython Notebook. - -To practice: -* [Python interactive exercises](http://codingbat.com/python) -* [Join the codewars competitions](http://www.codewars.com/?language=python) - -## Pandas +And of course, be aware of the fact Jupyter is NOT an IDE and can bite you in various ways: [See this presentation](https://docs.google.com/presentation/d/1n2RlMdmv1p25Xy5thJUhkKGvjtV-dkAIsUXP-AL4ffI/edit#slide=id.g3cb1319227_1_388) -### Intro tutorials on pandas basics +## Git +Slides are [here](https://docs.google.com/presentation/d/e/2PACX-1vSRDWRpbJpNmtPk5SufekG8bSbBSJGjsua-nf-BxTzS_F2qMkHwmFPzjQlnR6op2pwa0QzL-PTFGikx/pub?start=false&loop=false&delayms=3000) - * [Tutorial: Data structures](https://github.com/utstikkar/pandas-tutorial/blob/master/intro-to-pandas-1-Data-Structures.ipynb) - * [Tutorial: Working with dataframes](https://github.com/utstikkar/pandas-tutorial/blob/master/intro-to-pandas-2-Working-With-DataFrames.ipynb) - * [Tutorial: Using pandas on the MovieLens dataset](https://github.com/utstikkar/pandas-tutorial/blob/master/intro-to-pandas-3-Pandas-On-MovieLens.ipynb) - -### Data munging with 
pandas - - * [Exercises](https://github.com/utstikkar/pandas-tutorial/blob/master/data-munging-with-pandas.ipynb) +## What is machine learning +A brief introduction/recap of ML and its terminology. Slides [here](https://docs.google.com/presentation/d/e/2PACX-1vRfxH8TbgtOQy24JBu28i12kYrbUquXKu6VZhZC3wyCUdiLW1HqF75mgnLI-EjKHFQUdPeZ-6OYD8G7/pub?start=false&loop=false&delayms=3000) ## NumPy and Matplotlib @@ -121,26 +142,21 @@ Start with the official [NumPy Tutorial](http://wiki.scipy.org/Tentative_NumPy_T Move on to these [exercises](http://scipy-lectures.github.io/intro/numpy/exercises.html). ### Matplotlib -Learn the basics and some more advanced plotting tricks in Matplotlib with this [hands-on tutorial](http://scipy-lectures.github.io/intro/matplotlib/matplotlib.html). +Learn the basics and some more advanced plotting tricks in Matplotlib with this [hands-on tutorial](http://scipy-lectures.github.io/intro/matplotlib/matplotlib.html). + +It's also very useful to look at the [gallery](https://matplotlib.org/gallery.html) to find examples of every possible chart you may want. 
-## Scikit-learn and your first Data Science case +## Scikit-learn and your first ML case +Slides are [here](https://docs.google.com/presentation/d/e/2PACX-1vTjCOfNagJZzOjovAPgNBkVxcddNlKbWZ5oxEjicbuFyEwpAbMjG8m7x0tx3xjqUyKkoYFh0rysWRNL/pub?start=false&loop=false&delayms=3000) ### Scikit-learn * Introduction to machine learning with scikit-learn [slides](http://slides.com/luciasantamaria/intro-machine-learning-scikit-learn#/) * Doing machine learning with scikit-learn [slides](https://github.com/luciasantamaria/pandas-tutorial/blob/master/scikit-learn.pdf) * [Tutorial: Introduction to scikit-learn](https://github.com/utstikkar/pandas-tutorial/blob/master/intro-to-scikit-learn-1-Basics.ipynb) * [To go further](http://nbviewer.jupyter.org/github/jakevdp/sklearn_tutorial/blob/master/notebooks/Index.ipynb) - -## Your first data science case - -A great source of data problems nowadays is the Kaggle platform. We'll be starting today with a simple but representative dataset: [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic). - - * [Guide](https://github.com/luciasantamaria/pandas-tutorial/blob/master/titanic-machine-learning.ipynb) for orientation to approach the problem - -IMPORTANT: you will find plenty of materials to analyze this data, however you'll learn the most if you give the problem some thought and try out several things before resorting to ready-made answers. -## SciPy +## SciPy SciPy is a collection of mathematical algorithms and convenience functions built on the Numpy extension of Python. [Here](http://scipy-lectures.github.io/intro/scipy.html) is a hands-on overview of this collection, together with practical exercises and more advanced problems. @@ -152,3 +168,8 @@ This repository contains a variety of content: some developed by Amélie Anglade The third-party content is distributed under the license provided by those parties. Any derivative work respects the original licenses, and credits its initial authors. 
Original content developed by Amélie Anglade is distributed under the MIT license. + + +## New section + +New content \ No newline at end of file diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..1502d70 --- /dev/null +++ b/README.txt @@ -0,0 +1,10 @@ +kd;flkdsl;fkds + +sdfkl;dskfds +# + +dsfdsf# + +jlkjhey hey + +asdsa diff --git a/aaa.txt b/aaa.txt new file mode 100644 index 0000000..47cacb1 --- /dev/null +++ b/aaa.txt @@ -0,0 +1,4 @@ +new file here +Hallo world new file + +Ggg diff --git a/blurb.txt b/blurb.txt new file mode 100644 index 0000000..8bd6648 --- /dev/null +++ b/blurb.txt @@ -0,0 +1 @@ +asdf diff --git a/bruh.txt b/bruh.txt new file mode 100644 index 0000000..abe1f07 --- /dev/null +++ b/bruh.txt @@ -0,0 +1,3 @@ +BRUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUH +WHAAAAAAAAAAAAAAAAAAAAAAAAAAAAT +THAT IS NOT POLITE! diff --git a/coolestReadmeEver.txt b/coolestReadmeEver.txt new file mode 100644 index 0000000..58500b1 --- /dev/null +++ b/coolestReadmeEver.txt @@ -0,0 +1,11 @@ +Cool file, lame content. + +Seriously. + +Totally lame. + + +... + +and it is not getting any better. 
+ diff --git a/fancy_file.txt b/fancy_file.txt new file mode 100644 index 0000000..e69de29 diff --git a/hello.tsdf b/hello.tsdf new file mode 100644 index 0000000..0b8684f --- /dev/null +++ b/hello.tsdf @@ -0,0 +1,2 @@ +asdfdsds sdgdfg +sdfds diff --git a/henlo.txt b/henlo.txt new file mode 100644 index 0000000..e69de29 diff --git a/ho_ho_ho.txt b/ho_ho_ho.txt new file mode 100644 index 0000000..6b47fcc --- /dev/null +++ b/ho_ho_ho.txt @@ -0,0 +1,7 @@ +ho ho ho + +:-P + + + +another HOHO diff --git a/iva.txt b/iva.txt new file mode 100644 index 0000000..00dbedf --- /dev/null +++ b/iva.txt @@ -0,0 +1,3 @@ +hello + +new text diff --git a/jacobo.txt b/jacobo.txt new file mode 100644 index 0000000..a95c79b --- /dev/null +++ b/jacobo.txt @@ -0,0 +1,3 @@ +Name: +Adress: +Zip Code: diff --git a/jansfile.txt b/jansfile.txt new file mode 100644 index 0000000..81eb182 --- /dev/null +++ b/jansfile.txt @@ -0,0 +1 @@ +thats my file and no ones other! diff --git a/jean.py b/jean.py new file mode 100644 index 0000000..b26a9cb --- /dev/null +++ b/jean.py @@ -0,0 +1,3 @@ +def my_input(): + print('collaborating on Git is fun') +my_input() diff --git a/lord_of_the_files.file b/lord_of_the_files.file new file mode 100644 index 0000000..9873e8b --- /dev/null +++ b/lord_of_the_files.file @@ -0,0 +1,3 @@ +|\/\/\/\/| +| | +---------- diff --git a/marco_file.txt b/marco_file.txt new file mode 100644 index 0000000..547c0b2 --- /dev/null +++ b/marco_file.txt @@ -0,0 +1 @@ +hi batch 35 diff --git a/michele_file.txt b/michele_file.txt new file mode 100644 index 0000000..2b308d9 --- /dev/null +++ b/michele_file.txt @@ -0,0 +1,2 @@ +This is the file I added + diff --git a/msg.txt b/msg.txt new file mode 100644 index 0000000..76a0c57 --- /dev/null +++ b/msg.txt @@ -0,0 +1,3 @@ +Some message which I have expanded on. 
+ +another message \ No newline at end of file diff --git a/my-feature b/my-feature new file mode 100644 index 0000000..d7d69a4 --- /dev/null +++ b/my-feature @@ -0,0 +1 @@ +hdaskdhasklhdlask diff --git a/myfile_anand.txt b/myfile_anand.txt new file mode 100644 index 0000000..0125a2e --- /dev/null +++ b/myfile_anand.txt @@ -0,0 +1 @@ +Great! diff --git a/mynewfeature.txt b/mynewfeature.txt new file mode 100644 index 0000000..dc5a9ee --- /dev/null +++ b/mynewfeature.txt @@ -0,0 +1,2 @@ +Fork process: +fork, clone, new-branch, make changes, add changes, commit changes, push diff --git a/mynewfile.txt b/mynewfile.txt new file mode 100644 index 0000000..5ab6646 --- /dev/null +++ b/mynewfile.txt @@ -0,0 +1,4 @@ + +this is a completely new file +bla bla bla + diff --git a/new-lesson-file b/new-lesson-file new file mode 100644 index 0000000..1c333fb --- /dev/null +++ b/new-lesson-file @@ -0,0 +1 @@ +Hello this is the new lesson diff --git a/new_file.txt b/new_file.txt new file mode 100644 index 0000000..b30fee8 --- /dev/null +++ b/new_file.txt @@ -0,0 +1 @@ +New file in new branch diff --git a/newfile.txt b/newfile.txt new file mode 100644 index 0000000..fa49b07 --- /dev/null +++ b/newfile.txt @@ -0,0 +1 @@ +new file diff --git a/password.txt b/password.txt new file mode 100644 index 0000000..d61cb51 --- /dev/null +++ b/password.txt @@ -0,0 +1 @@ +super secret passowrd diff --git a/pead.md b/pead.md new file mode 100644 index 0000000..1b7e447 --- /dev/null +++ b/pead.md @@ -0,0 +1,5 @@ +# Post-Earnings Announcement Drift + +$$PEAD = \frac{SUE}{\sqrt{Var(SUE)}} \cdot \frac{1}{N} \sum_{i=1}^{N} AR_i$$ + +where SUE is the standardized unexpected earnings, Var(SUE) is the variance of SUE, N is the number of firms in the sample, and AR_i is the abnormal return of firm i. 
diff --git a/pets.csv b/pets.csv new file mode 100644 index 0000000..f2310cb --- /dev/null +++ b/pets.csv @@ -0,0 +1,7 @@ +ID;Name;Age;Breed +01;Cube;6;BorderCollie +02;Eden;6;BelgianMallinois +03;Maze;4;BorderCollie +04;Halo;3;LabradorRetriever +05;Luzi;3;EuropeanSledgeHound +06;Isla;2;LabradorRetriever diff --git a/print_ciao.py b/print_ciao.py new file mode 100644 index 0000000..cca8e56 --- /dev/null +++ b/print_ciao.py @@ -0,0 +1 @@ +print("ciao") \ No newline at end of file diff --git a/scikit_demo.ipynb b/scikit_demo.ipynb new file mode 100644 index 0000000..c09a762 --- /dev/null +++ b/scikit_demo.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A first example of machine learning\n", + "==\n", + "In this notebook we'll apply a scikit-learn pipeline to a simple dataset (the listing of apartments in Airbnb of Berlin), and see how overfitting looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:01.195267Z", + "start_time": "2019-01-05T21:59:01.003209Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can invoke system commands by prepending them with a `!`, commands like `head`, `tail`, `wc` can be useful to quickly inspect a text file. Most of them are not present on Windows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:01.345919Z", + "start_time": "2019-01-05T21:59:01.197499Z" + } + }, + "outputs": [], + "source": [ + "!head listings.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "numpy provides the function `loadtxt` to load simple CSV files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:01.355499Z", + "start_time": "2019-01-05T21:59:01.349177Z" + } + }, + "outputs": [], + "source": [ + "#np.loadtxt('listings.csv', delimiter=',', usecols=(54, 59, 48, 49, 79 ), skiprows=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It does not work because this file contains newlines inside the fields. Luckily the Python CSV module can still process it.\n", + "\n", + "This code loads some columns from the CSV into separate numpy arrays.\n", + "\n", + "First, we create plain Python lists, then replace them with proper arrays (faster and smaller).\n", + "\n", + "Don't worry: with Pandas this kind of operation becomes much easier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:03.589649Z", + "start_time": "2019-01-05T21:59:01.360913Z" + } + }, + "outputs": [], + "source": [ + "from csv import DictReader\n", + "\n", + "review_scores_rating = []\n", + "price = []\n", + "latitude = []\n", + "longitude = []\n", + "bathrooms = []\n", + "\n", + "for l in DictReader(open('listings.csv')):\n", + " price.append(l['price'])\n", + " review_scores_rating.append(l['review_scores_rating'])\n", + " latitude.append(l['latitude'])\n", + " longitude.append(l['longitude'])\n", + " bathrooms.append(l['bathrooms'])\n", + "\n", + "latitude = np.array([float(l) for l in latitude])\n", + "longitude = np.array([float(l) for l in longitude])\n", + "price = np.array([float(l[1:].replace(',', '')) for l in price])\n", + "\n", + "# We assume the rating is 0 if not specified\n", + "review_scores_rating = np.array([int(l) if l != '' else 0 for l in review_scores_rating])\n", + "\n", + "# We assume there's 1 bathroom if not stated otherwise\n", + "bathrooms = np.array([float(l) if l != '' else 1 for l in bathrooms])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's very useful to have a look at the shape of the numpy arrays." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:03.596806Z", + "start_time": "2019-01-05T21:59:03.591360Z" + } + }, + "outputs": [], + "source": [ + "print(latitude.shape)\n", + "print(bathrooms.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:05.079158Z", + "start_time": "2019-01-05T21:59:03.601693Z" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "# change the figure size\n", + "from matplotlib.pyplot import figure\n", + "figure(num=None, figsize=(8, 6), dpi=80)\n", + "\n", + "# reshape is needed to create a second dimension of size 1\n", + "X = price.T.reshape(-1, 1)\n", + "Y = review_scores_rating.T\n", + "model = LinearRegression()\n", + "model.fit(X, Y)\n", + "model.score(X,Y)\n", + "\n", + "\n", + "plt.scatter(X, Y, marker='X')\n", + "\n", + "x_plot = np.linspace(0, 9000, 200)\n", + "y_plot = model.predict(x_plot.reshape(-1, 1))\n", + "plt.plot(x_plot, y_plot, color='red')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turns out there are prices much much greater than the rest, making the visualization and the model less effective. 
Let's ignore them by placing a cap of 500 on the price.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:05.091151Z", + "start_time": "2019-01-05T21:59:05.081299Z" + } + }, + "outputs": [], + "source": [ + "too_high = np.argwhere(price > 500)\n", + "print(f'shape before: {price.shape}')\n", + "Ylow = np.delete(Y, too_high)\n", + "Xlow = np.delete(price, too_high).reshape(-1, 1)\n", + "print(f'shape after: {Xlow.shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T21:59:05.486107Z", + "start_time": "2019-01-05T21:59:05.098969Z" + } + }, + "outputs": [], + "source": [ + "model = LinearRegression()\n", + "model.fit(Xlow, Ylow)\n", + "model.score(Xlow, Ylow)\n", + "\n", + "\n", + "plt.scatter(Xlow, Ylow, marker='X')\n", + "\n", + "x_plot = np.linspace(0, 500, 200)\n", + "y_plot = model.predict(x_plot.reshape(-1, 1))\n", + "plt.plot(x_plot, y_plot, color='red')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-04T14:40:29.991524Z", + "start_time": "2019-01-04T14:40:29.977525Z" + } + }, + "source": [ + "In scikit you can combine models using `make_pipeline`, in this case we combine `PolynomialFeatures` with `LinearRegression`, to run a linear regression on the features generated by the first step, which are the original ones multiplied together and raised to various powers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T22:00:05.692525Z", + "start_time": "2019-01-05T21:59:05.488667Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "from sklearn.preprocessing import PolynomialFeatures\n", + "from sklearn.pipeline import make_pipeline\n", + "\n", + "X = np.vstack((latitude, longitude, bathrooms)).T\n", + "print(f'the shape of X is {X.shape}')\n", + "Y = review_scores_rating.T\n", + "print(f'the shape of Y is {Y.shape}')\n", + "\n", + "\n", + "for degree in range(1, 20):\n", + " model = make_pipeline(PolynomialFeatures(degree), LinearRegression())\n", + " model.fit(X, Y)\n", + " score = model.score(X, Y)\n", + " print(f'with degree {degree} the score was {score:.5f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-04T15:50:18.004263Z", + "start_time": "2019-01-04T15:50:17.991237Z" + } + }, + "source": [ + "The model reaches the best score at degree 11 (notice it could change with other cities). This seems the best result, but what is happening here is that we have overfitting. The dataset we use to check the model is the same we used to train it.\n", + "\n", + "Let's try instead by partitioning the data in train and test datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T22:00:58.558258Z", + "start_time": "2019-01-05T22:00:05.694891Z" + } + }, + "outputs": [], + "source": [ + "train_X = X[:21000,:]\n", + "test_X = X[21000:,:]\n", + "\n", + "train_Y = Y[:21000]\n", + "test_Y = Y[21000:]\n", + "\n", + "for degree in range(1, 20):\n", + " model = make_pipeline(PolynomialFeatures(degree), LinearRegression())\n", + " model.fit(train_X, train_Y)\n", + " score = model.score(test_X, test_Y)\n", + " print(f'with degree {degree} the score was {score}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T22:00:59.052579Z", + "start_time": "2019-01-05T22:00:58.560697Z" + } + }, + "outputs": [], + "source": [ + "# change the figure size\n", + "from matplotlib.pyplot import figure\n", + "figure(num=None, figsize=(8, 6), dpi=80)\n", + "\n", + "# reshape is needed to create a second dimension of size 1\n", + "X = price.T.reshape(-1, 1)\n", + "\n", + "\n", + "\n", + "model = make_pipeline(PolynomialFeatures(20), LinearRegression())\n", + "#model = LinearRegression()\n", + "model.fit(X, Y)\n", + "model.score(X,Y)\n", + "\n", + "\n", + "\n", + "plt.scatter(X, Y, marker='X')\n", + "\n", + "x_plot = np.linspace(0, 9000, 200)\n", + "y_plot = model.predict(x_plot.reshape(-1, 1))\n", + "plt.plot(x_plot, y_plot, color='red')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turns out there are prices much much greater than the rest, making the visualization and the model pointless. Let's ignore them by placing a cap of 500 on the data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T22:00:59.627490Z", + "start_time": "2019-01-05T22:00:59.055636Z" + } + }, + "outputs": [], + "source": [ + "figure(num=None, figsize=(8, 6), dpi=80)\n", + "\n", + "\n", + "model = make_pipeline(PolynomialFeatures(30), LinearRegression())\n", + "#model = LinearRegression()\n", + "model.fit(Xlow, Ylow)\n", + "model.score(Xlow, Ylow)\n", + "\n", + "\n", + "plt.scatter(Xlow, Ylow, marker='X')\n", + "\n", + "x_plot = np.linspace(0, 500, 200)\n", + "y_plot = model.predict(x_plot.reshape(-1, 1))\n", + "plt.plot(x_plot, y_plot, color='red')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just for fun, let's draw a map of prices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2019-01-05T22:01:00.875236Z", + "start_time": "2019-01-05T22:00:59.631315Z" + } + }, + "outputs": [], + "source": [ + "figure(num=None, figsize=(9, 7), dpi=80)\n", + "\n", + "plt.scatter(latitude, longitude, c=review_scores_rating, marker='.', cmap=plt.cm.get_cmap('inferno'))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/simonwashere.txt b/simonwashere.txt new file mode 100644 
index 0000000..ee88aa0 --- /dev/null +++ b/simonwashere.txt @@ -0,0 +1,2 @@ +simon was here +and modified this file diff --git a/somestuff.txt b/somestuff.txt new file mode 100644 index 0000000..9f52124 --- /dev/null +++ b/somestuff.txt @@ -0,0 +1,6 @@ +blah, hello why +sdsfs + + +this is nely added + diff --git a/test.py b/test.py new file mode 100644 index 0000000..333f496 --- /dev/null +++ b/test.py @@ -0,0 +1,9 @@ +import numpy +import pandas +import meaning_of_life + +def show_meaning(): + return 42 + +answer_meaning = show_meaning() +print(answer_meaning) diff --git a/unicorns.txt b/unicorns.txt new file mode 100644 index 0000000..7800d42 --- /dev/null +++ b/unicorns.txt @@ -0,0 +1,3 @@ +and some unicorns here + +unicorns are not bugs to be fixed diff --git a/viktest.txt b/viktest.txt new file mode 100644 index 0000000..24e8535 --- /dev/null +++ b/viktest.txt @@ -0,0 +1 @@ +This contains some test text. \ No newline at end of file