-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstat.rb
executable file
·292 lines (249 loc) · 9.64 KB
/
stat.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/env ruby
require "bundler"
Bundler.require
require "open-uri"
require "timeout"
# Opens +uri+ through open-uri with the quirks some boards need.
#
# Some weird polish boards have an invalid SSL certificate, require a cookie,
# etc., so certificate verification is deliberately disabled and an
# "accept=1" cookie is always sent.
# NOTE(review): VERIFY_NONE makes the connection vulnerable to MITM; this is
# acceptable only because the fetched data is public board statistics.
#
# uri   - String URL to fetch.
# block - optional block; when given it is yielded the opened IO and its
#         value is returned, otherwise the opened IO itself is returned.
def open_uri uri, &block
  options = {ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE, "Cookie" => "accept=1"}
  if URI.respond_to?(:open)
    # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open is the
    # supported entry point since Ruby 2.5.
    URI.open uri, options, &block
  else
    # Older Rubies: open-uri patches Kernel#open and URI.open is not public.
    open uri, options, &block
  end
end
# How many times a fetch is re-sent after a SocketError before giving up.
# Without a bound a dead network would make the script spin forever.
STAT_SOCKET_RETRIES = 5

# Estimates the activity of a board in posts per second.
#
# The 4chan-style JSON index (+uri+/0.json) is tried first; if that fails for
# any reason the board's HTML front page is scraped instead.
#
# uri - String base URL of the board, without a trailing slash.
#
# Returns a Float. 0.0 is returned when the board has no (non-sticky) posts
# or when both strategies fail; in the latter case the exception is stored in
# the global $error (inspected at the end of the script) and printed.
def stat_of uri
  # Use 4chan api if it's possible
  stat_from_api uri
rescue StandardError # OpenURI::HTTPError, JSON::ParserError, SocketError...
  # 4chan api doesn't work here, let's try manually parsing HTML
  begin
    stat_from_html uri
  rescue StandardError => e
    # Some error happened. Let's save it for further inspection
    $error = e
    puts "Error: #{e}: #{uri}"
    0.0
  end
end

# Converts "post_count post ids were issued since oldest_time" into posts per
# second. A non-positive elapsed time (clock skew, mis-parsed date) yields 0.0
# instead of Infinity or a negative rate.
def stat_rate post_count, oldest_time
  elapsed = (Time.now - oldest_time).to_f
  elapsed > 0 ? post_count.to_f / elapsed : 0.0
end

# Computes the rate from the JSON index page (+uri+/0.json).
# Raises whatever the fetch/parse raises; SocketError only after
# STAT_SOCKET_RETRIES immediate retries.
def stat_from_api uri
  attempts ||= 0 # survives `retry`, which restarts the method body
  open_uri "#{uri}/0.json" do |io|
    index = JSON.parse io.read
    # Select all threads that are not sticky. A missing "sticky" key is
    # treated as "not sticky" (some APIs only set the field on sticky threads).
    threads = index["threads"].select do |thread|
      sticky = thread["posts"][0]["sticky"]
      sticky.nil? || sticky == 0
    end
    # Take the leading threads of the page (0..10 is inclusive, so up to 11)
    # and collect all of their posts into one flat array.
    posts = threads[0..10].flat_map { |thread| thread["posts"] }
    # If a board doesn't have any posts, then return 0 posts per second
    return 0.0 if posts.empty?
    oldest = posts.min_by { |post| post["no"] }
    newest = posts.max_by { |post| post["no"] }
    # Number of post ids issued between the oldest visible post and the
    # newest one, over the time elapsed since the oldest post.
    stat_rate newest["no"] - oldest["no"] + 1, Time.at(oldest["time"])
  end
rescue SocketError
  # If network fails, retry - but only a bounded number of times
  attempts += 1
  retry if attempts <= STAT_SOCKET_RETRIES
  raise
end

# Computes the rate by scraping the board's HTML front page (+uri+/).
# Raises RuntimeError "Not supported: ..." for an unrecognised board engine
# and propagates fetch/parse errors; SocketError only after
# STAT_SOCKET_RETRIES immediate retries.
def stat_from_html uri
  attempts ||= 0 # survives `retry`, which restarts the method body
  # Block form so the downloaded IO is closed once parsed.
  doc = open_uri("#{uri}/") { |io| Nokogiri::HTML io }
  # Let's detect an engine of a board. All flags are strict booleans.
  # mitsuba is for 4chan and karachan, since they are compatible
  mitsuba = !doc.css('.absBotDisclaimer').empty?
  kusaba = !doc.css('.footer a[href$="cultnet.net/"], .footer a[href$="kusabax.org/"], #footer a[href$="cultnet.net/"]').empty? ||
    !(uri =~ /heretyk\.org/).nil?
  tinyboard = !doc.css('footer a[href$="tinyboard.org/"]').empty?
  northboard = !doc.css('#software a[href$="NorthBoard/"]').empty?
  krautchan = !(uri =~ /krautchan\.net/).nil?
  fourtwenty = !(uri =~ /420chan\.org/).nil?
  unless mitsuba || tinyboard || kusaba || northboard || krautchan || fourtwenty
    raise "Not supported: #{uri}"
  end
  # More than one flag may be set for a page, so the order of the checks in
  # each chain below is significant and intentionally differs per selector.
  # A CSS selector that would give us every thread
  thread_selector = (mitsuba || northboard || krautchan) ? ".thread" :
    (kusaba || tinyboard) ? 'div[id^="thread"]:not(#thread_controls)' :
    fourtwenty ? 'div[id*="thread"]' :
    false
  # A CSS selector that would determine, if a given thread is sticky or not
  sticky_selector = mitsuba ? "img.stickyIcon" :
    kusaba ? 'img[src="pin.png"]' :
    tinyboard ? "i.fa-thumb-tack" :
    northboard ? 'img[src$="attach.png"]' :
    krautchan ? 'img[src$="sticky.png"]' :
    fourtwenty ? 'FIXME' : # placeholder: matches nothing, so no thread counts as sticky
    false
  # This one should give us a part of a post, that would apply both to
  # a thread and posts inside.
  postinfo_selector = mitsuba ? ".postInfo" :
    kusaba ? ".reply" :
    tinyboard ? ".intro" :
    northboard ? ".postinfo" :
    krautchan ? ".postheader" :
    fourtwenty ? ".thread_header, .replyheader" :
    false
  # A selector to give us a post number
  postid_selector = mitsuba ? '.quotePost, a[title="Reply to this post"]' :
    (kusaba || fourtwenty) ? ".reflink>a:last" :
    tinyboard ? ">.post_no" :
    northboard ? ".post_number > a[onclick]" :
    krautchan ? ".postnumber > .quotelink:last" :
    false
  # A selector to give us a date of a post
  date_selector = mitsuba ? ".dateTime" :
    kusaba ? "label:first, .post_header" :
    tinyboard ? "time" :
    northboard ? ".post_time" :
    krautchan ? ".postdate" :
    fourtwenty ? ".idhighlight" :
    false
  # Select all threads that are not sticky; 0..10 is inclusive (up to 11)
  threads = doc.css(thread_selector).select { |t| t.css(sticky_selector).empty? }[0..10]
  # Select both threads and posts
  nodes = threads.map { |t| t.css(postinfo_selector) }
  # Unfortunately, we aren't able to do it via CSS selectors on kusaba,
  # so let's manually add the threads to the posts array
  nodes += threads if kusaba
  # Flatten, so that [[1,2,3],[4,5,6]] becomes [1,2,3,4,5,6], then parse
  # every node into a [post id, Time] pair
  posts = nodes.flatten.map do |node|
    ids = node.css(postid_selector)
    # A minor difference in tinyboard handling
    id_node = tinyboard ? ids.last : ids.first
    pid = id_node.text.sub("No.", "").strip.to_i
    date_node = node.css(date_selector).first
    # If it can be done without parsing a date, let's do it.
    date = if date_node["datetime"] # Tinyboard
      date_node["datetime"]
    elsif date_node["data-utc"] # 4chan
      Time.at(date_node["data-utc"].to_i)
    else # It can't be selected cleanly on kusaba, so a little hackery here:
      # use only the element's direct text children
      date_node.children.select do |child|
        child.instance_of?(Nokogiri::XML::Text)
      end.map(&:text).join
    end
    # Anything that is still a String goes through Time.parse
    date = Time.parse(date) unless date.is_a?(Time)
    [pid, date]
  end
  # Return 0, if there are no posts
  return 0.0 if posts.empty?
  # Oldest post = lowest id; newest = highest id (same rule as the API path,
  # and independent of how precisely the dates could be parsed)
  oldest = posts.min_by(&:first)
  newest_id = posts.map(&:first).max
  # How many post ids have passed between the oldest post and now
  stat_rate newest_id - oldest[0] + 1, oldest[1]
rescue SocketError
  # If network fails, retry - but only a bounded number of times
  attempts += 1
  retry if attempts <= STAT_SOCKET_RETRIES
  raise
end
# Select a chanset: build the list of board URLs to measure from the first
# command line argument. `chans` stays nil when the argument is missing or
# not a known chanset name.
chans = nil
# Prepends a site's base URL to each of its board names.
with_base = lambda { |base, boards| boards.map { |board| base + board } }
case ARGV[0]
# Polish chans
when "pl"
  vi = %w[b cp r+oc id waifu wiz veto int slav
          sci psl h c c++ vg lsd ku fso btc trv
          a az ac mu tv lit vp x hk fr
          sr swag sex pro med soc trap pr psy
          meta chan mit 3 fem synch homo dump]
  kara = %w[4 b fz z r id $ c co a edu f fa
            h kib ku l med mil mu oc p po pony
            sci sp tech thc trv v8 vg wall x og
            int kara g hen s dew]
  kiwi = %w[b a co hob kul tec v wc kiwi]
  wilchan = %w[b a art mf vg porn lsd h o admin chancraft]
  heretyk = %w[b t meta]
  sis = %w[sis int thud admin]
  _8chan = %w[rzabczan flutter karachan vichan gakuran kuco 4]
  # The 8ch.pl board list is downloaded at run time. It is remote data, so it
  # is parsed with JSON.parse (JSON.load is meant for trusted input only), and
  # a failure to fetch or parse it only skips 8ch.pl instead of aborting the
  # whole run.
  _8chpl = begin
    JSON.parse(Net::HTTP.get("8ch.pl", "/boards.json")).map { |board| board["uri"] }
  rescue StandardError => e
    warn "Could not fetch the 8ch.pl board list (#{e.class}: #{e.message}); skipping 8ch.pl"
    []
  end
  misc = []
  vi = with_base.call("https://pl.vichan.net/", vi)
  kara = with_base.call("https://kara.8ch.net/", kara)
  kiwi = with_base.call("https://kiwiszon.org/boards/", kiwi)
  wilchan = with_base.call("https://wilchan.org/", wilchan)
  heretyk = with_base.call("http://heretyk.org/", heretyk)
  sis = with_base.call("http://sischan.pl/", sis)
  _8chan = with_base.call("https://8ch.net/", _8chan)
  _8chpl = with_base.call("http://8ch.pl/", _8chpl)
  chans = misc + vi + kiwi + wilchan + _8chan + _8chpl + kara + heretyk + sis
# 4chan vs 8chan revolution
when "v"
  _4chan = %w[b vg v int pol a co tg sp fit g mu]
  _8chan = %w[b v int pol a co tg sp tech gg fit mu]
  misc = %w[https://krautchan.net/int
            http://boards.420chan.org/b]
  _4chan = with_base.call("https://boards.4chan.org/", _4chan)
  _8chan = with_base.call("https://8chan.co/", _8chan)
  chans = _4chan + _8chan + misc
end
require "thread"
require "pp"
# ERB::Util.html_escape is used for the HTML report below.
require "erb"

# Without a valid chanset there is nothing to measure; stop with a usage
# message instead of failing later with a NoMethodError on nil.
abort "Usage: #{$0} pl|v [json|sql]" if chans.nil?

# Collected [board url, posts per second] pairs
results = []
ths = []
# This mutex synchronizes screen output and array writes
mut = Mutex.new
# How many times a board is attempted before it is given up on. Retrying
# without a bound would hang the whole run on a board that never answers.
max_attempts = 3
chans.each do |i|
  # Spawn a thread for each board...
  ths << Thread.new do
    attempts = 0
    begin
      attempts += 1
      # Give 5 minutes for the fetching and calculations to happen. Only the
      # fetch is under the timeout, so it can never fire while the mutex is
      # held or after a result has already been recorded.
      k = Timeout::timeout(300) { stat_of i }
      mut.synchronize do
        # Feedback that a given board has been handled
        puts "Got #{i}"
        # Push stats of a given board to our array
        results << [i, k]
      end
    rescue Timeout::Error
      mut.synchronize do
        puts "Timeout: #{i}"
      end
      retry if attempts < max_attempts
      # The board is left out of the results rather than recorded as 0
      mut.synchronize do
        puts "Giving up: #{i}"
      end
    end
  end
end
# Wait until all threads are done
ths.each(&:join)
case ARGV[1]
when "json" # JSON output
  # Make sure the target directory exists before writing into it
  Dir.mkdir("history") unless Dir.exist?("history")
  File.open("history/#{Time.now.to_i}.json", "w") do |f|
    f << results.to_json
  end
when "sql" # output to a database
  # setup_db is expected to define DB - it is not part of this file
  require_relative "setup_db"
  time_now = Time.now
  results.each do |board, pps|
    DB[:stats].insert(timestamp: time_now, board: board, pps: pps)
  end
else # HTML output
  # Output the results, sorted by the activity, in a posts per hour format
  File.open("out.html", "w") do |f|
    f << "<!DOCTYPE html><html><body><table><tr><th>Board<th>Posts per hour</tr>"
    results.sort_by { |_board, pps| -pps }.each do |board, pps|
      # Board names may come from a remote board list, so escape them before
      # putting them into markup
      link = ERB::Util.html_escape(board)
      f << "<tr><td><a href='#{link}'>#{link}</a><td>#{pps*3600}</tr>"
    end
    f << "</table><hr><address>Generated with <a href='https://github.com/czaks/chanstats'>chanstats</a></address></body></html>"
  end
end
# Run an interactive console if there were any errors
binding.pry if $error