Skip to content

Commit 9cfd396

Browse files
committed
Added settings for scraping and simple html dom library
1 parent f847372 commit 9cfd396

File tree

149 files changed

+30036
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

149 files changed

+30036
-0
lines changed

config/settings.php

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?php
// Scraping settings: one entry per article source, keyed by a short source id.
// Each entry holds the site's "base" URL and the "search" URL prefix for keyword lookups.
$articleSources = array(
    "toi" => array(
        "base"   => "http://articles.timesofindia.indiatimes.com/",
        "search" => "http://articles.timesofindia.indiatimes.com/keyword/",
    ),
);
?>
+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
K 25
2+
svn:wc:ra_dav:version-url
3+
V 45
4+
/svnroot/simplehtmldom/!svn/ver/179/trunk/app
5+
END
6+
google.htm
7+
K 25
8+
svn:wc:ra_dav:version-url
9+
V 56
10+
/svnroot/simplehtmldom/!svn/ver/179/trunk/app/google.htm
11+
END
12+
index.php
13+
K 25
14+
svn:wc:ra_dav:version-url
15+
V 55
16+
/svnroot/simplehtmldom/!svn/ver/176/trunk/app/index.php
17+
END
+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
10
2+
3+
dir
4+
182
5+
https://simplehtmldom.svn.sourceforge.net/svnroot/simplehtmldom/trunk/app
6+
https://simplehtmldom.svn.sourceforge.net/svnroot/simplehtmldom
7+
8+
9+
10+
2009-02-23T09:04:02.699587Z
11+
179
12+
me578022
13+
14+
15+
16+
17+
18+
19+
20+
21+
22+
23+
24+
25+
26+
27+
d0e60b4b-9046-0410-940c-b97530268c78
28+
29+
google.htm
30+
file
31+
32+
33+
34+
35+
2010-08-17T19:13:52.500237Z
36+
72a16a066c5a729cd5ddaf6e39082b05
37+
2009-02-23T09:04:02.699587Z
38+
179
39+
me578022
40+
41+
42+
43+
44+
45+
46+
47+
48+
49+
50+
51+
52+
53+
54+
55+
56+
57+
58+
59+
60+
61+
60128
62+
63+
js
64+
dir
65+
66+
index.php
67+
file
68+
69+
70+
71+
72+
2010-08-17T19:13:52.502237Z
73+
3793524c0c846be09274b428d4972b60
74+
2008-12-14T19:02:48.351607Z
75+
176
76+
me578022
77+
78+
79+
80+
81+
82+
83+
84+
85+
86+
87+
88+
89+
90+
91+
92+
93+
94+
95+
96+
97+
98+
4496
99+

lib/simplehtmldom_1_5/app/.svn/text-base/google.htm.svn-base

+891
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
<?php
// Interactive test page: loads a sample document, runs the selector submitted
// in the "query" request field against it, and exposes the results to the
// template further down through $html, $lang, $charset, $query and $target.
error_reporting(E_ALL);
include_once('../simple_html_dom.php');

// Sample document to inspect; swap in one of the alternatives to try other inputs.
$html = file_get_html('google.htm');
//$html = file_get_html('youtube.htm');
//$html = file_get_html('Product.ibatis.xml');

// Everything below calls methods on $html, so stop with a readable message
// instead of a fatal "member function on a non-object" error when loading failed.
// NOTE(review): the library is expected to return false for an empty/unreadable file - confirm.
if (!is_object($html)) {
    exit('Unable to load the sample document.');
}

// Build the lang="..." attribute for the page's own <html> tag from the
// inspected document's <html> element, if it has one.
$lang = '';
$l = $html->find('html', 0);
if ($l !== null)
    $lang = $l->lang;
// Loose comparison on purpose: any empty-ish value means "no lang attribute".
if ($lang != '')
    $lang = 'lang="'.$lang.'"';

// First content-type <meta> element of the inspected document (if any).
$charset = $html->find('meta[http-equiv*=content-type]', 0);
$target = array();
$query = '';

// Only accept a plain string selector; array-valued input (query[]=...) would
// otherwise reach find() and htmlspecialchars() and trigger errors.
if (isset($_REQUEST['query']) && is_string($_REQUEST['query'])) {
    $query = $_REQUEST['query'];
    $target = $html->find($query);
}
25+
26+
/**
 * Echo a one-line summary of how many nodes of each type a parsed document holds.
 *
 * Walks $dom->nodes once and tallies every node whose nodetype matches one of
 * the HDOM_TYPE_* constants. Nodes of any other type are included in "Total"
 * only. The summary is written to output; nothing is returned.
 *
 * @param object $dom Object exposing a `nodes` list whose items have a `nodetype`.
 * @return void
 */
function stat_dom($dom) {
    $count_text = 0;
    $count_comm = 0;
    $count_elem = 0;
    $count_tag_end = 0;
    $count_unknown = 0;

    foreach ($dom->nodes as $n) {
        // switch compares loosely, same as the == checks this replaces.
        switch ($n->nodetype) {
            case HDOM_TYPE_TEXT:
                ++$count_text;
                break;
            case HDOM_TYPE_COMMENT:
                ++$count_comm;
                break;
            case HDOM_TYPE_ELEMENT:
                ++$count_elem;
                break;
            case HDOM_TYPE_ENDTAG:
                ++$count_tag_end;
                break;
            case HDOM_TYPE_UNKNOWN:
                ++$count_unknown;
                break;
        }
    }

    echo 'Total: '.count($dom->nodes).
        ', Text: '.$count_text.
        ', Comment: '.$count_comm.
        ', Tag: '.$count_elem.
        ', End Tag: '.$count_tag_end.
        ', Unknown: '.$count_unknown;
}
53+
54+
/**
 * Echo a node and all of its descendants as nested <li>/<ul> markup.
 *
 * Each node becomes one <li> showing its tag name and, optionally, its
 * attributes. Nodes with children get the "expandable" classes plus a hit-area
 * <div>, and their children are rendered inside a <ul> that starts hidden.
 * Nodes tagged 'text' or 'comment' print their escaped innertext and are not
 * descended into. Every <li> that is opened is also closed.
 *
 * @param object $node      Node exposing `tag`, `nodes`, `attr` and `innertext`.
 * @param bool   $show_attr Whether to print the node's attributes.
 * @param int    $deep      Current depth; passed down the recursion but not otherwise used.
 * @param bool   $last      Whether this node is the last child of its parent.
 * @return void
 */
function dump_my_html_tree($node, $show_attr=true, $deep=0, $last=true) {
    $count = count($node->nodes);

    // Opening <li>: the class set depends on whether the node has children and
    // whether it is the last sibling.
    if ($count > 0) {
        if ($last)
            $open = '<li class="expandable lastExpandable"><div class="hitarea expandable-hitarea lastExpandable-hitarea"></div>';
        else
            $open = '<li class="expandable"><div class="hitarea expandable-hitarea"></div>';
    }
    else {
        // Only a strict false suppresses the class here, as before.
        $open = ($last === false) ? '<li>' : '<li class="last">';
    }
    echo $open.'&lt;<span class="tag">'.htmlspecialchars($node->tag).'</span>';

    if ($show_attr) {
        foreach ($node->attr as $k => $v) {
            // The value is read back through $node->$k rather than $v.
            // NOTE(review): presumably so the node's property accessor can post-process it - confirm.
            echo ' '.htmlspecialchars($k).'="<span class="attr">'.htmlspecialchars($node->$k).'</span>"';
        }
    }
    echo '&gt;';

    // Leaf content nodes: print the text and close the item. The closing tag
    // was previously missing on this path, leaving the <li> open.
    if ($node->tag === 'text' || $node->tag === 'comment') {
        echo htmlspecialchars($node->innertext);
        echo "</li>\n";
        return;
    }

    if ($count > 0) {
        echo "\n<ul style=\"display: none;\">\n";
        $i = 0;
        foreach ($node->nodes as $c) {
            ++$i;
            dump_my_html_tree($c, $show_attr, $deep + 1, $i == $count);
        }
        echo "</ul>\n";
    }

    echo "</li>\n";
}
91+
?>
92+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html <?php echo $lang; ?>>
<head>
<?php
// Charset declaration for this page: utf-8 when the inspected document carries
// a lang attribute; otherwise the inspected document's own content-type <meta>
// element if one was found (echoed via its string form); otherwise iso-8859-1.
if ($lang!='')
    echo '<meta http-equiv="content-type" content="text/html; charset=utf-8"/>';
else if ($charset)
    echo $charset;
else
    echo '<meta http-equiv="content-type" content="text/html; charset=iso-8859-1"/>';
?>
<title>Simple HTML DOM Query Test</title>
<link rel="stylesheet" href="js/jquery.treeview.css" />
<link rel="stylesheet" href="js/screen.css" />
<style>
.tag { color: blue; }
.attr { color: #990033; }
</style>
<script src="js/jquery.js" type="text/javascript"></script>
<script src="js/jquery.treeview.js" type="text/javascript"></script>
<script type="text/javascript">
$(document).ready(function(){
    $("#html_tree").treeview({
        control:"#sidetreecontrol",
        collapsed: true,
        prerendered: true
    });
});
</script>
</head>
<body>
<div id="main">
<h4>Simple HTML DOM Test</h4>
<form name="form1" method="post" action="">
find: <input name="query" type="text" size="60" maxlength="60" value="<?php echo htmlspecialchars($query); ?>">
<input type="submit" name="Submit" value="Go">
</form>
<br>
HTML STAT (<?php stat_dom($html); ?>)<br>
<br>
<div id="sidetreecontrol"><a href="?#">Collapse All</a> | <a href="?#">Expand All</a></div><br>
<ul class="treeview" id="html_tree">
<?php
// Render every node matched by the submitted selector as a collapsible tree.
ob_start();
foreach($target as $e)
    dump_my_html_tree($e, true);
ob_end_flush();
?>
</ul>
</div>

</body></html>

0 commit comments

Comments
 (0)