scraper.rb
require 'scraperwiki'
require 'mechanize'
require 'rest-client'
require 'time' # for Time.parse in parse_utc_time_or_nil
# TODO: Use https
# There's a problem with the site's SSL cert, which prevents the
# Wayback Machine from archiving it and forces us to skip SSL
# verification on our end. So for now, scrape the http version.
BASE_URL = 'http://minerals.org.au'
ORG_NAME = 'Minerals Council of Australia'
DEFAULT_AUTHOR = 'MCA National'
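
# Ask the Wayback Machine to save a snapshot of the page and return the
# archived copy's URL. Returns nil on a Bad Gateway from archive.org, so
# a failed archive attempt doesn't abort the whole scrape.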
def web_archive(page)
  url = "https://web.archive.org/save/#{page.uri}"
  begin
    archive_request_response = RestClient.get(url)
    "https://web.archive.org" + archive_request_response.headers[:content_location]
  rescue RestClient::BadGateway => e
    puts "archive.org ping returned error response for #{url}: #{e}"
    nil
  end
end
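
# Find the first <meta> tag whose attribute `key` (e.g. :property) equals
# `value`, and return its content attribute, or nil if there's no match.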
def find_meta_tag_content(page, key, value)
  tag = page.search(:meta).find { |t| t[key] == value }
  tag['content'] if tag
end
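
# Articles carry a byline in a Drupal-style field block; fall back to the
# generic MCA byline when it's missing.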
def extract_author_or_default(page)
  page.at('.field-name-field-pbundle-title')&.text || DEFAULT_AUTHOR
end
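
# The body markup varies between article layouts: try the standard body
# field first, then fall back to the generic content wrapper. The second
# lookup isn't nil-safe, so the scraper raises if neither selector
# matches instead of silently saving an empty article.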
def extract_article_body(page)
  page.at('.field-name-body > div > div')&.inner_html ||
    page.at('article .content > div > div > div').inner_html
end
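
# Normalise a timestamp string to UTC, passing nil straight through so
# callers can hand us meta tag lookups that found nothing.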
def parse_utc_time_or_nil(string)
  Time.parse(string).utc.to_s if string
end
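
# Scrape a single article page into the data table, keyed on url. An
# article we've already saved is skipped unless its og:updated_time has
# changed since the last scrape.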
def save_article(page)
  url = page.uri.to_s
  published = parse_utc_time_or_nil(
    find_meta_tag_content(page, :property, 'article:published_time')
  )
  updated = parse_utc_time_or_nil(
    find_meta_tag_content(page, :property, 'og:updated_time')
  )
  # Skip if we already have the current version of the article. The
  # rescue covers the first run, before the data table exists.
  saved_article = ScraperWiki.select('* FROM data WHERE url=?', [url]).last rescue nil
  if saved_article && saved_article['updated'].eql?(updated)
    puts "Skipping #{url}, already saved"
  else
    puts "Saving: #{url}, #{published}"
    article = {
      'name' => find_meta_tag_content(page, :property, 'og:title'),
      'url' => url,
      'scraped_at' => Time.now.utc.to_s,
      'published' => published,
      'updated' => updated,
      'author' => extract_author_or_default(page),
      'summary' => find_meta_tag_content(page, :property, 'og:description'),
      'content' => extract_article_body(page),
      'syndication' => web_archive(page),
      'org' => ORG_NAME,
      'photo' => find_meta_tag_content(page, :property, 'og:image')
    }
    ScraperWiki.save_sqlite(['url'], article)
  end
end
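
# Walk the paginated news listing: save every article linked from this
# index page, then follow the 'next' link recursively until the pager
# runs out.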
def save_articles_and_click_next_while_articles(agent, index_page)
  web_archive(index_page)
  puts "Collecting articles on #{index_page.uri}"
  articles = index_page.search('.view-news-listings .item-list > ul li')
  articles.each do |article_item|
    sleep 1 # be polite to the server between article requests
    save_article(agent.get(BASE_URL + article_item.at(:a)['href']))
  end
  next_page_link = index_page.links.select { |link| link.text.eql?('next') }.last
  if next_page_link
    puts "Clicking through to the next page"
    save_articles_and_click_next_while_articles(agent, next_page_link.click)
  else
    puts "That's the last page my friends, no more articles to collect."
  end
end
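
# Entry point: SSL verification is disabled because of the broken cert
# noted in the TODO at the top; everything is fetched over http anyway.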
agent = Mechanize.new
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
initial_index_page = agent.get(BASE_URL + "/media?page=0")
save_articles_and_click_next_while_articles(agent, initial_index_page)