forked from pdurbin/scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinthenews
executable file
·53 lines (39 loc) · 1.41 KB
/
inthenews
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/perl
use strict;
use warnings;
use MIME::Lite;
use HTML::TreeBuilder;
use LWP::Simple qw(get $ua);
use YAML;
use Data::Dumper;
#binmode STDOUT, ":utf8";
# Fetch Wikipedia's "In the news" template, extract its first story list,
# make the links absolute, and mail the result as an HTML message.

# Wikipedia blocks LWP::Simple's default User-Agent
# http://stackoverflow.com/questions/24546/why-cant-i-fetch-wikipedia-pages-with-lwpsimple
$ua->agent('Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.18) Gecko/2010020219 Firefox/3.0.18');

# Page to scrape, and the site root used to absolutize its relative links.
my $url  = 'http://en.wikipedia.org/wiki/Template:In_the_news';
my $base = 'http://en.wikipedia.org';

#my $file = '/tmp/Template:In_the_news';
#my $tree = HTML::TreeBuilder->new->parse_file($file);

# LWP::Simple::get returns undef on failure; treat an empty body as a failure too.
my $content = get($url);
die "Couldn't download $url" unless defined $content and length $content;

my $tree = HTML::TreeBuilder->new_from_content($content);

# look_down in scalar context yields the first match or undef, so check each
# step instead of chaining: a layout change then fails with a clear message
# rather than "Can't call method ... on an undefined value".
my $container = $tree->look_down( id => 'mw-content-text' )
    or die "No element with id 'mw-content-text' found in $url";
my $stories = $container->look_down( _tag => 'ul' )
    or die "No <ul> found inside 'mw-content-text' in $url";

# Rewrite links on the parsed tree rather than with a regex over the
# serialized HTML: a textual match on '<a href="' depends on href being the
# first attribute written out, and it would also prefix URLs that are already
# absolute.
for my $link ( $stories->look_down( _tag => 'a' ) ) {
    my $href = $link->attr('href');
    next unless defined $href;

    if ( $href =~ m{\A//} ) {
        # Protocol-relative URL: it only lacks a scheme.
        $link->attr( 'href', "http:$href" );
    }
    elsif ( $href =~ m{\A/} ) {
        # Site-relative path: anchor it to the Wikipedia host.
        $link->attr( 'href', $base . $href );
    }
}

# Serialize the list once. look_down() also matches the element it starts
# from, so looping over every <ul> beneath (and including) this one would
# emit nested lists a second time.
my $html = $stories->as_HTML;
$tree->delete();

# NOTE(review): the addresses below look like redacted placeholders in this
# copy of the script -- confirm the real sender/recipient before deploying.
my $msg = MIME::Lite->new(
    From       => '"greptilian.com" <[email protected]>',
    'Reply-To' => '[email protected]',
    To         => '[email protected]',
    Subject    => 'Wikipedia - In the News',
    Type       => 'text/html',
    Data       => $html,
);

# Surface delivery failures instead of exiting 0 with nothing sent.
$msg->send() or die "Couldn't send 'Wikipedia - In the News' email";