
Commit e23c458: "first time commit" (0 parents)

File tree: 1,720 files changed (+522487 -0 lines changed)


## Basic Scrapping.txt

+25

## Basic Scraping

# The scrapy shell is useful for testing XPath expressions:
scrapy shell <url>

...

scrapy shell http://news.ycombinator.com


response.xpath('//td[@class="title"]').extract()

response.xpath('//td[@class="title"]').extract()[0]

response.xpath('//td[@class="title"]/a').extract()[0]

response.xpath('//td[@class="title"]/a/text()').extract()[0]

response.xpath('//td[@class="title"]/a/@href').extract()[0]

response.xpath('//span[@class="yclinks"]/a[3]/@href').extract()  <-- library link at the bottom of Hacker News

response.xpath('//td[@class="subtext"]/a/@href').extract()[0]  <-- comment links

response.xpath('//td[@class="subtext"]/a/text()').extract()[0]  <-- comment link text
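To see how these pieces fit together, here is a sketch that pairs each title with its link inside the same scrapy shell session (it leans on the same 2014-era td[@class="title"] markup assumed above):

# Sketch: pair story titles with their links in the scrapy shell.
# Iterating avoids the IndexError a hard-coded [0] raises on empty results.
for row in response.xpath('//td[@class="title"]/a'):
    title = row.xpath('text()').extract()
    href = row.xpath('@href').extract()
    if title and href:
        print title[0], '->', href[0]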

### quick guide to heroku ###.txt

+35

### quick guide to heroku ###
heroku login -- enter your credentials

activate the virtualenv

pip freeze > requirements.txt -- so Heroku can understand
what your app needs

create a Procfile
add the line: web: python run.py and save it as Procfile, with no extension

update run.py (this adds the port; a sketch follows this guide)

git init
git add .
git commit -am "initial"

heroku create

git push heroku master
heroku ps:scale web=1

heroku ps <-- see if the app is serving

heroku open <-- launches the browser with the app

heroku logs <-- shows the logs

### in case of trouble ###
if the logs throw errors, make the changes, then
add and commit them,
and then push to heroku again.

Also watch out for errors on the rsa key
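For the "update run.py" step above, a minimal sketch of what the port change usually looks like, assuming run.py starts a Flask app (the commit does not include the actual run.py, so every name here is illustrative):

# run.py -- illustrative sketch only; assumes a Flask app
import os
from flask import Flask

app = Flask(__name__)

@app.route('/')
def index():
    return 'hello world'

if __name__ == '__main__':
    # Heroku supplies the port via the PORT environment variable;
    # bind to 0.0.0.0 so the dyno can route traffic to the process
    port = int(os.environ.get('PORT', 5000))
    app.run(host='0.0.0.0', port=port)

The matching Procfile is the single line web: python run.py, exactly as the step above says.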

.gitignore

+20

#python specific
*.pyc

## generic files to ignore
*~
*.lock
*.DS_Store
*.swp
*.out
*.pdf

# Virtualenv
.Python
bin
lib
include

# Mac OS X custom attribute files
.DS_Store

.gitignorels

Whitespace-only changes.

README

+1

hello world shit sheep ship

README.md

+1

Exercises from Real Python book 2

api_for_rotten.py

+43

# GET data from Rotten Tomatoes, parse and write to database

import json, requests, sqlite3

API_KEY = 'm5549a8w6d7z9w7d4yugpezf'
url = requests.get("http://api.rottentomatoes.com/api/public/v1.0/" +
                   "lists/movies/in_theaters.json?apikey={}".format(API_KEY))

# convert data from feed to binary
binary = url.content
#print "This is binary\n"
#print binary

# decode the json feed
output = json.loads(binary)
#print "\nthis is output\n"
#print output

# grab the list of movies
movies = output["movies"]

with sqlite3.connect("movies.db") as connection:
    c = connection.cursor()

    # iterate through each movie and write to the database
    for movie in movies:
        c.execute("INSERT INTO new_movies VALUES (?,?,?,?,?,?,?)",
                  (movie["title"], movie["year"],
                   movie["mpaa_rating"],
                   movie["release_dates"]["theater"],
                   movie["runtime"], movie["ratings"]["critics_score"],
                   movie["ratings"]["audience_score"]))

    # retrieve data
    c.execute("SELECT * FROM new_movies ORDER BY title ASC")

    # fetchall() retrieves all records from the query
    rows = c.fetchall()

    # output the rows to the screen, row by row
    for r in rows:
        print "title " + str(r[0]), r[1], r[2], r[4], r[5], r[6]
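The INSERT assumes a new_movies table already exists in movies.db; the script never creates it. A one-off setup sketch (the column names are assumptions; only the count of seven columns is fixed by the INSERT above):

# create_movies_db.py -- hypothetical one-off setup script.
# Column names are assumptions matching the seven inserted values.
import sqlite3

with sqlite3.connect("movies.db") as connection:
    c = connection.cursor()
    c.execute("""CREATE TABLE new_movies
                 (title TEXT, year INTEGER, mpaa_rating TEXT,
                  theater_release TEXT, runtime INTEGER,
                  critics_score INTEGER, audience_score INTEGER)""")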

api_sentiments.py

+6

import requests

url = 'http://text-processing.com/api/sentiment/'
data = {'text': 'great'}
r = requests.post(url, data=data)
print r.content
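The endpoint answers with JSON rather than plain text; a follow-on sketch for decoding it (the "label" and "probability" field names are from memory of this API, so treat them as assumptions):

# Sketch: decode the sentiment response instead of printing raw bytes.
# Assumed JSON shape: {"label": ..., "probability": {...}}.
import json

result = json.loads(r.content)
print result.get("label"), result.get("probability")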

cars.json

+19

{
    "CARS": [
        {
            "MAKE": "Ford",
            "MODEL": "Focus",
            "COST": "15000"
        },
        {
            "MAKE": "Honda",
            "MODEL": "Civic",
            "COST": "20000"
        },
        {
            "MAKE": "Toyota",
            "MODEL": "Camry",
            "COST": "25000"
        },
        {
            "MAKE": "Honda",
            "MODEL": "Accord",
            "COST": "22000"
        }
    ]
}
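A quick sketch of reading this file back in Python (only the file name above is taken from the commit):

# Sketch: load cars.json and print each car.
import json

with open("cars.json") as f:
    cars = json.load(f)["CARS"]

for car in cars:
    print car["MAKE"], car["MODEL"], car["COST"]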

chapter 3/bs_scraping.py

+9

from bs4 import BeautifulSoup
from urllib2 import urlopen

URL = 'http://web2py.com'
htmlPage = urlopen(URL)
htmlText = htmlPage.read()
mySoup = BeautifulSoup(htmlText)

# print the href of every anchor on the page; href=True skips anchors
# that have no href attribute, which would otherwise raise a KeyError
for link in mySoup.find_all('a', href=True):
    print link["href"]
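Many of the hrefs on a page are relative; a small variant that turns them into absolute URLs with the standard library (urljoin lives in urlparse on Python 2):

# Sketch: same scrape, but resolving relative links to absolute URLs.
from bs4 import BeautifulSoup
from urllib2 import urlopen
from urlparse import urljoin

URL = 'http://web2py.com'
soup = BeautifulSoup(urlopen(URL).read())

for link in soup.find_all('a', href=True):
    print urljoin(URL, link["href"])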

chapter 3/clientp.py

+20

# Download stock quotes in CSV

import requests
import time

i = 0
stock_list = ["GOOG", "YHOO", "AOL"]

while i < 1:
    base_url = 'http://download.finance.yahoo.com/d/quotes.csv'
    # retrieve data from the web server
    for stock in stock_list:
        data = requests.get(base_url, params={'s': stock, 'f': 'sl1d1t1c1ohgv', 'e': '.csv'})

        # append the data to the csv
        with open("stocks.csv", "a") as code:
            code.write(data.content)

    # as written, the counter makes the while loop run exactly once
    i += 1

    # pause for 3 seconds
    time.sleep(3)
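A sketch for reading the accumulated quotes back with the csv module. The field order follows the f= format string above: symbol, last price, date, time, change, open, high, low, volume. These labels are my recollection of the old Yahoo API, so treat them as assumptions:

# Sketch: parse the downloaded quotes back out of stocks.csv.
import csv

with open("stocks.csv") as f:
    for row in csv.reader(f):
        if row:  # skip any blank lines
            print row[0], row[1]  # symbol, last trade price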

chapter 3/hackernews/hackernews/__init__.py

Whitespace-only changes.
+62

url,title
,
http://projecteuler.net/news,Project Euler Returns
,
http://www.technologyreview.com/news/527051/the-man-who-really-built-bitcoin/,"Gavin Andresen, the Most Powerful Person in the World of Bitcoin"
,
https://code.google.com/p/chromium/issues/detail?id=162757#c64,Blink won’t implement pointer events
,
http://techcrunch.com/2014/08/15/zen99/,"Zen99 (YC S14) Makes Life Easier for Freelancers with Finance, Insurance Tools"
,
http://www.practicalguidetomobileapps.com/,Show HN: Practical Guide to Mobile Apps Using Phonegap and Famo.us
,
http://www.bloomberg.com/news/2014-08-14/san-francisco-office-rents-seen-topping-manhattan-in-2015.html,San Francisco Office Rents Seen Topping Manhattan in 2015
,
http://s3.thinkaurelius.com/docs/titan/0.5.0/,Titan Distributed Graph Database 0.5.0
,
http://www.shellcheck.net/,ShellCheck – Online shell script analyzer
,
http://acko.net/files/pres/siggraph-2014-bof/online.html,MathBox 2
,
http://simonschreibt.de/gat/renderhell/ ,Render Hell 1.0
,
http://kivy.org/,Kivy – Open-source Python library for rapid development of applications
,
http://datatracker.ietf.org/doc/draft-kirsch-ietf-tcp-stealth/,TCP Stealth
,
http://www.math.vanderbilt.edu/~schectex/commerrs/,The most common errors in undergraduate mathematics
,
http://www.newyorker.com/magazine/2012/12/24/utopian-for-beginners,An amateur linguist loses control of the language he invented (2012)
,
http://limscentral.com/,Show HN: I created a Laboratory Customer Relationship Management Tool
,
https://bugs.ruby-lang.org/issues/10137,Incremental GC in Ruby MRI
,
http://martiancraft.com/blog/2014/08/an-unreal-decision/,An Unreal Decision
,
http://www.theatlantic.com/technology/archive/2014/08/why-email-will-never-die/375973/,Email Is Still the Best Thing on the Internet
,
http://sebastien-gabriel.com/designers-guide-to-dpi/,Designers Guide To DPI
,
http://www.slate.com/blogs/bad_astronomy/2014/08/14/portraits_in_uv_thomas_leveritt_video_of_faces_in_ultraviolet.html,The Faces of Ultraviolet
,
http://www.wired.com/2014/08/a-hair-salon-gurus-next-big-thing-ending-the-12b-tyranny-of-shampoo/,A Hair Salon Guru’s Next Big Thing: Ending Shampoo
,
http://techcrunch.com/2014/08/14/y-combinator-and-mithril-invest-in-helion-a-nuclear-fusion-startup/,"Y Combinator And Mithril Invest In Helion, A Nuclear Fusion Startup"
,
http://www.bbc.com/news/business-28756059,The 30-year-old health sector billionaire
,
http://www.theatlantic.com/features/archive/2014/08/the-future-of-college/375071/,The Future of College?
,
http://www.theatlantic.com/technology/archive/2014/08/advertising-is-the-internets-original-sin/376041/,It's not too late to ditch the ad-based business model and build a better web
,
https://github.com/siddontang/mixer,Mixer – A MySQL Proxy powered by Go
,
http://bonsaiden.github.io/Tuff.gb/,Show HN: I'm building a game for the Nintendo GameBoy
,
https://itunes.apple.com/us/app/biodigital-human-anatomy-health/id771825569,BioDigital: A 3D Medical Anatomy App
,
https://careers.stackoverflow.com/jobs/65030/full-stack-junior-developer-unbabel,Unbabel (YC W14) is looking for a junior developer in Portugal
,
http://henrysmith.org/blog/2014/08/04/not-planning-any-nuclear-attacks/,I am not planning any nuclear attacks
news?p=2,More

+15

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class HackernewsItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = Field()
    url = Field()

+11

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class HackernewsPipeline(object):
    def process_item(self, item, spider):
        return item
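As the comment above says, the pipeline does nothing until it is registered in the ITEM_PIPELINES setting. A sketch of the entry for settings.py (the 300 is an arbitrary priority within Scrapy's 0-1000 range):

# settings.py -- register the pipeline; lower numbers run earlier
ITEM_PIPELINES = {
    'hackernews.pipelines.HackernewsPipeline': 300,
}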
+17

# -*- coding: utf-8 -*-

# Scrapy settings for hackernews project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'hackernews'

SPIDER_MODULES = ['hackernews.spiders']
NEWSPIDER_MODULE = 'hackernews.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'hackernews (+http://www.yourdomain.com)'
+4

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
+26

# spider.py

from scrapy.spider import BaseSpider
from hackernews.items import HackernewsItem


class MySpider(BaseSpider):
    # Naming the spider
    name = "wiki"

    # allowed domains to scrape
    allowed_domains = ["en.wikipedia.org"]
    start_urls = ["http://en.wikipedia.org/wiki/Category:2014_films"]

    def parse(self, response):
        titles = response.xpath('//tr[@style="vertical-align: top;"]//li')
        items = []
        for title in titles:
            # the original instantiated WikipediaItem, which is never imported
            # or defined; HackernewsItem is the item this project declares
            item = HackernewsItem()
            item["title"] = title.xpath("a/text()").extract()
            item["url"] = title.xpath("a/@href").extract()
            items.append(item)
        return items
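To try the spider, the standard Scrapy invocation from the project root applies; the output file name here is just an example:

scrapy crawl wiki -o films.csv -t csv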

chapter 3/hackernews/scrapy.cfg

+11

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = hackernews.settings

[deploy]
#url = http://localhost:6800/
project = hackernews

chapter 3/socrata/project.db

324 KB
Binary file not shown.

chapter 3/socrata/scrapy.cfg

+11

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = socrata.settings

[deploy]
#url = http://localhost:6800/
project = socrata

chapter 3/socrata/socrata/__init__.py

Whitespace-only changes.

chapter 3/socrata/socrata/db.py

+5

import sqlite3

conn = sqlite3.connect("project.db")
cursor = conn.cursor()
cursor.execute("""CREATE TABLE data (text TEXT, url TEXT, views TEXT)""")
conn.commit()  # make sure the schema change is flushed to project.db
conn.close()
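A small follow-on sketch of writing a row and reading it back (the sample values are placeholders; only the table and column layout come from the commit):

# Sketch: insert one placeholder record into the data table and read it back.
import sqlite3

with sqlite3.connect("project.db") as conn:
    c = conn.cursor()
    c.execute("INSERT INTO data VALUES (?, ?, ?)",
              ("sample dataset title", "http://example.com/dataset", "1234"))
    for row in c.execute("SELECT * FROM data"):
        print row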
