Skip to content

Commit a5d668c

Browse files
committed
Added scraper for toi
1 parent 9cfd396 commit a5d668c

File tree

1 file changed

+81
-0
lines changed

1 file changed

+81
-0
lines changed

scraping/toiscraper.php

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
<?
2+
3+
// // include ("../lib/phphtmlparser/src/htmlparser.inc");
4+
include('../lib/simplehtmldom_1_5/simple_html_dom.php');
5+
include('../config/settings.php');
6+
// include('html2text.inc');
7+
//
8+
// $html = file_get_html('http://www.google.com/');
9+
10+
// // Find all images
11+
// foreach($html->find('img') as $element)
12+
// echo $element->src . '<br>';
13+
14+
// // Find all links
15+
// foreach($html->find('a') as $element)
16+
// echo $element->href . '<br>';
17+
//
18+
19+
/**
 * Scraper for Times of India search results, article pages and comment pages.
 *
 * All pages are fetched and parsed with simple_html_dom's file_get_html().
 * The CSS selectors used below are tied to the site's markup at the time of
 * writing; when the markup does not match, the methods return empty/null
 * values instead of dereferencing a missing node.
 */
class ToiScraper {

    /**
     * Collects the href of every search-result title link for a keyword.
     *
     * The search URL prefix is read from the global
     * $articleSources["toi"]["search"] setting and the keyword is appended
     * to it.
     *
     * @param string $keyword Search term, NOT url-encoded (it is encoded here).
     * @return array List of href values exactly as they appear in the page.
     * @throws RuntimeException If the setting is missing or the page cannot be loaded.
     */
    public function getArticleLinks($keyword)
    {
        global $articleSources;

        if (!isset($articleSources["toi"]["search"])) {
            throw new RuntimeException('Missing $articleSources["toi"]["search"] setting');
        }

        // Encode the keyword so spaces, '&', '#', ... cannot break the URL.
        // rawurlencode() is used because "%20" is valid in both a path
        // segment and a query string, whereas "+" is only valid in the latter.
        $url = $articleSources["toi"]["search"] . rawurlencode((string) $keyword);
        $html = $this->fetchDom($url);

        $results = array();
        foreach ($html->find('div.title a') as $element) {
            $results[] = $element->href;
        }

        return $results;
    }

    /**
     * Extracts title, body HTML and the comment-page URL of an article.
     *
     * @param string $url Absolute URL of the article page.
     * @return array Array with the keys:
     *               "title"      - inner HTML of the headline, or null when not found;
     *               "content"    - concatenated inner HTML of the article paragraphs ("" when none);
     *               "commentUrl" - URL of the comment listing, or null when no
     *                              article id could be derived from an iframe.
     * @throws RuntimeException If the page cannot be loaded.
     */
    public function getArticleContent($url)
    {
        $html = $this->fetchDom($url);

        $result = array(
            "title" => null,
            "content" => "",
            "commentUrl" => null,
        );

        // find(..., 0) yields null when the selector matches nothing.
        $heading = $html->find('h1.multi-line-title-1', 0);
        if (is_object($heading)) {
            $result["title"] = $heading->innertext;
        }

        foreach ($html->find('div.mod-articletext p') as $paragraph) {
            $result["content"] .= $paragraph->innertext;
        }

        $id = $this->extractArticleId($html);
        if ($id !== null) {
            $result["commentUrl"] = "http://timesofindia.indiatimes.com/opinions/$id?commenttype=mostrecommended&sorttype=bycount";
        }

        return $result;
    }

    /**
     * Collects the text of the comments listed on a comment page.
     *
     * @param string $url URL of the comment listing (see getArticleContent()).
     * @return array List of comment inner-HTML strings; empty when $url is
     *               empty or no comment matches the expected structure.
     * @throws RuntimeException If the page cannot be loaded.
     */
    public function getComments($url)
    {
        // getArticleContent() reports "no comment page" as null.
        if (empty($url)) {
            return array();
        }

        $html = $this->fetchDom($url, true);

        $comments = array();
        foreach ($html->find('div.cmtBox div div.box') as $box) {
            // The comment is expected at: last child -> first child -> first child.
            // Each step may be missing, so walk the chain defensively.
            $node = $box->last_child();
            $node = is_object($node) ? $node->first_child() : null;
            $node = is_object($node) ? $node->first_child() : null;

            if (is_object($node) && $node->tag == "span") {
                $comments[] = $node->innertext;
            }
        }

        return $comments;
    }

    /**
     * Fetches and parses a page, turning a failed load into an exception.
     *
     * file_get_html() returns false when the download fails or the document
     * is empty/too large; calling find() on that value would be a fatal error.
     *
     * @param string $url            Page to load.
     * @param bool   $useIncludePath Forwarded as file_get_html()'s second argument.
     * @return object Parsed DOM.
     * @throws RuntimeException If no DOM could be built.
     */
    private function fetchDom($url, $useIncludePath = false)
    {
        $dom = file_get_html($url, $useIncludePath);
        if (!is_object($dom)) {
            throw new RuntimeException("Unable to load page: $url");
        }

        return $dom;
    }

    /**
     * Derives the article id from the first iframe whose src has a non-empty
     * segment after its last "/".
     *
     * @param object $html Parsed article DOM.
     * @return string|null The id, or null when no iframe provides one.
     */
    private function extractArticleId($html)
    {
        foreach ($html->find('iframe') as $frame) {
            $src = (string) $frame->src;
            $pos = strrpos($src, '/');

            // Compare against false explicitly: a slash at offset 0 is a valid hit.
            if ($pos === false) {
                continue;
            }

            $id = (string) substr($src, $pos + 1);
            if ($id !== '') {
                return $id;
            }
        }

        return null;
    }
}
65+
66+
// Demo driver: search for a fixed keyword, scrape one article and its
// comments, then dump both.
$scraper = new ToiScraper();
$links = $scraper->getArticleLinks('modi');

// Defaults keep the dumps below well-defined when nothing could be scraped.
$articleContent = array();
$comments = array();

// The loop bounds deliberately select only the second search hit (index 1);
// the extra count() check avoids reading a link that does not exist.
for ($i = 1; $i < 2 && $i < count($links); $i++)
{
    $articleContent = $scraper->getArticleContent($articleSources["toi"]["base"] . $links[$i]);

    // Skip the comment fetch when no comment URL was produced for the article.
    $comments = empty($articleContent["commentUrl"])
        ? array()
        : $scraper->getComments($articleContent["commentUrl"]);
}

print_r($articleContent);
echo "<br><br>";
print_r($comments);
77+
78+
79+
80+
?>
81+

0 commit comments

Comments
 (0)