|
| 1 | +<? |
| 2 | + |
| 3 | +// // include ("../lib/phphtmlparser/src/htmlparser.inc"); |
| 4 | +include('../lib/simplehtmldom_1_5/simple_html_dom.php'); |
| 5 | +include('../config/settings.php'); |
| 6 | +// include('html2text.inc'); |
| 7 | +// |
| 8 | +// $html = file_get_html('http://www.google.com/'); |
| 9 | + |
| 10 | +// // Find all images |
| 11 | +// foreach($html->find('img') as $element) |
| 12 | +// echo $element->src . '<br>'; |
| 13 | + |
| 14 | +// // Find all links |
| 15 | +// foreach($html->find('a') as $element) |
| 16 | +// echo $element->href . '<br>'; |
| 17 | +// |
| 18 | + |
/**
 * Scrapes article links, article bodies and reader comments from the
 * "toi" source (timesofindia.indiatimes.com) using simple_html_dom.
 *
 * Every method fetches a remote page with file_get_html(). That call
 * returns false when the page cannot be fetched or is empty, so each
 * method checks for that before touching the DOM instead of raising a
 * fatal "call to a member function on a non-object" error.
 */
class ToiScraper {

    /**
     * Collects the href of every link found under "div.title a" on the
     * search page for a keyword.
     *
     * The keyword is appended verbatim to $articleSources["toi"]["search"],
     * so the caller must pass it in a URL-safe form.
     *
     * @param string $keyword Search term appended to the search URL.
     * @return array List of href values; empty when the page could not be
     *               fetched or contains no matching links.
     */
    public function getArticleLinks($keyword)
    {
        global $articleSources;

        $results = array();
        $url = $articleSources["toi"]["search"];

        $html = file_get_html("$url$keyword");
        if (!$html) {
            return $results;
        }

        foreach ($html->find('div.title a') as $element) {
            $results[] = $element->href;
        }

        return $results;
    }

    /**
     * Extracts the title, the body text and the comments-page URL of an
     * article.
     *
     * @param string $url Absolute URL of the article page.
     * @return array Associative array with the keys:
     *               - "title":      inner HTML of "h1.multi-line-title-1",
     *                               or "" when that heading is missing;
     *               - "content":    concatenated inner HTML of every
     *                               "div.mod-articletext p" element;
     *               - "commentUrl": URL of the comments page, or null when
     *                               no article id could be derived from an
     *                               iframe on the page.
     */
    public function getArticleContent($url)
    {
        $result = array(
            "title" => "",
            "content" => "",
            "commentUrl" => null,
        );

        $html = file_get_html($url);
        if (!$html) {
            return $result;
        }

        // find(..., 0) yields null when nothing matches the selector.
        $titleNode = $html->find('h1.multi-line-title-1', 0);
        if ($titleNode) {
            $result["title"] = $titleNode->innertext;
        }

        foreach ($html->find('div.mod-articletext p') as $paragraph) {
            $result["content"] .= $paragraph->innertext;
        }

        // The article id is taken to be the last path segment of the first
        // iframe src that has one.
        $id = "";
        foreach ($html->find('iframe') as $frame) {
            $src = (string) $frame->src;
            $pos = strrpos($src, '/');
            // Compare against false explicitly: a slash at offset 0 is a
            // valid match even though 0 is falsy.
            if ($pos === false) {
                continue;
            }
            $segment = (string) substr($src, $pos + 1);
            if ($segment !== "") {
                $id = $segment;
                break;
            }
        }

        // Only build the URL when an id was found; otherwise it would point
        // at a non-existent page.
        if ($id !== "") {
            $result["commentUrl"] = "http://timesofindia.indiatimes.com/opinions/$id?commenttype=mostrecommended&sorttype=bycount";
        }

        return $result;
    }

    /**
     * Collects the text of the reader comments on a comments page.
     *
     * For every "div.cmtBox div div.box" element the method walks to
     * last child -> first child -> first child and keeps its inner HTML
     * when that node is a <span>.
     *
     * @param string|null $url Comments-page URL, typically the "commentUrl"
     *                         value returned by getArticleContent().
     * @return array List of comment strings; empty when $url is empty, the
     *               page could not be fetched, or nothing matched.
     */
    public function getComments($url)
    {
        $comments = array();

        if (empty($url)) {
            return $comments;
        }

        $html = file_get_html($url, true);
        if (!$html) {
            return $comments;
        }

        foreach ($html->find('div.cmtBox div div.box') as $box) {
            // Each step returns null when the node has no children, so the
            // chain is walked one hop at a time.
            $ele = $box->last_child();
            if ($ele) {
                $ele = $ele->first_child();
            }
            if ($ele) {
                $ele = $ele->first_child();
            }

            if ($ele && $ele->tag == "span") {
                $comments[] = $ele->innertext;
            }
        }

        return $comments;
    }
}
| 65 | + |
// Demo driver: search the "toi" source for a keyword, scrape one of the
// resulting articles together with its comments, and dump both.
$scraper = new ToiScraper();
$links = $scraper->getArticleLinks('modi');

// Initialised up front so the print_r() calls below never read an undefined
// variable when the search returned too few links.
$articleContent = array();
$comments = array();

// NOTE(review): the loop bounds select only the link at index 1 (the second
// search result) - confirm whether index 0 is skipped on purpose.
for ($i = 1; $i < 2; $i++)
{
    if (!isset($links[$i])) {
        continue;
    }

    $articleContent = $scraper->getArticleContent($articleSources["toi"]["base"] . $links[$i]);
    // The original line had an extra closing parenthesis here, which made
    // the whole file fail to parse.
    $comments = $scraper->getComments($articleContent["commentUrl"]);
}

print_r($articleContent);
echo "<br><br>";
print_r($comments);
| 77 | + |
| 78 | + |
| 79 | + |
| 80 | + ?> |
| 81 | + |
0 commit comments