Skip to content

Commit 2340acf

Browse files
committed
Moved stopwords handling to '_filterCloud()'. Fixes #18.
Also, stopwords are now loaded in the constructor of the plugin. This prevents accessing the stopwords files multiple times if a DokuWiki page contains more than one cloud.
1 parent 5b856e3 commit 2340acf

File tree

1 file changed

+46
-17
lines changed

1 file changed

+46
-17
lines changed

syntax.php

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,14 @@
1515
require_once(DOKU_PLUGIN.'syntax.php');
1616

1717
class syntax_plugin_cloud extends DokuWiki_Syntax_Plugin {
18+
protected $stopwords = null;
19+
20+
/**
 * Constructor. Loads the stopwords once per plugin instance.
 *
 * _getStopwords() returns null when no stopwords were found. That result is
 * normalised to an empty array here, so that code iterating over
 * $this->stopwords (the foreach in _filterCloud()) never receives null,
 * which would raise a PHP warning on every rendered cloud.
 */
public function __construct() {
    $stopwords = $this->_getStopwords();
    $this->stopwords = is_array($stopwords) ? $stopwords : array();
}
1826

1927
/**
 * Returns the fixed type string of this plugin.
 *
 * @return string always 'substition' (runtime identifier, spelling kept as-is)
 */
function getType() {
    $type = 'substition';
    return $type;
}
2028
/**
 * Returns the fixed PType string of this plugin.
 *
 * @return string always 'block'
 */
function getPType() {
    $ptype = 'block';
    return $ptype;
}
@@ -131,6 +139,32 @@ function render($mode, Doku_Renderer $renderer, $data) {
131139
return false;
132140
}
133141

142+
/**
 * Helper function for loading and returning the array with stopwords.
 *
 * Stopwords files are loaded from two locations:
 * - inc/lang/"actual language"/stopwords.txt
 * - conf/stopwords.txt
 *
 * If both files exist, both are used and their contents are merged
 * (language file first, then the local conf file).
 *
 * @return array|null list of stopwords (one entry per line of the files),
 *                    or null when no stopwords were found
 */
protected function _getStopwords() {
    // $conf lives in the global scope; without this declaration
    // $conf['lang'] would be undefined here and the language-specific
    // stopwords file would never be found.
    global $conf;

    $swfiles = array(
        DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt', // language stopwords
        DOKU_CONF.'stopwords.txt',                           // extra local stopwords
    );

    $stopwords = array();
    foreach ($swfiles as $swfile) {
        if (!@file_exists($swfile)) continue;

        // file() returns false on failure; only merge real results so that
        // array_merge() never receives a non-array argument.
        $lines = file($swfile, FILE_IGNORE_NEW_LINES);
        if (is_array($lines)) {
            $stopwords = array_merge($stopwords, $lines);
        }
    }

    if (count($stopwords) == 0) {
        return null;
    }

    return $stopwords;
}
167+
134168
/**
135169
* Applies filters on the cloud:
136170
* - removes all short words, see config option 'minimum_word_length'
@@ -144,6 +178,12 @@ function _filterCloud(&$cloud, $balcklistName) {
144178
unset($cloud[$key]);
145179
}
146180

181+
// Remove stopwords
182+
foreach ($this->stopwords as $word) {
183+
if (isset($cloud[$word]))
184+
unset($cloud[$word]);
185+
}
186+
147187
// Remove words which are on the blacklist
148188
$blacklist = $this->getConf($balcklistName);
149189
if(!empty($blacklist)) {
@@ -163,15 +203,6 @@ function _filterCloud(&$cloud, $balcklistName) {
163203
function _getWordCloud($num, &$min, &$max) {
164204
global $conf;
165205

166-
// load stopwords
167-
$swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
168-
if (@file_exists($swfile)) $stopwords = file($swfile, FILE_IGNORE_NEW_LINES);
169-
else $stopwords = array();
170-
171-
// load extra local stopwords
172-
$swfile = DOKU_CONF.'stopwords.txt';
173-
if (@file_exists($swfile)) $stopwords = array_merge($stopwords, file($swfile, FILE_IGNORE_NEW_LINES));
174-
175206
$cloud = array();
176207

177208
if (@file_exists($conf['indexdir'].'/page.idx')) { // new word-length based index
@@ -182,14 +213,14 @@ function _getWordCloud($num, &$min, &$max) {
182213
$idx = idx_getIndex('i', $len);
183214
$word_idx = idx_getIndex('w', $len);
184215

185-
$this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);
216+
$this->_addWordsToCloud($cloud, $idx, $word_idx);
186217
}
187218

188219
} else { // old index
189220
$idx = file($conf['cachedir'].'/index.idx');
190221
$word_idx = file($conf['cachedir'].'/word.idx');
191222

192-
$this->_addWordsToCloud($cloud, $idx, $word_idx, $stopwords);
223+
$this->_addWordsToCloud($cloud, $idx, $word_idx);
193224
}
194225

195226
$this->_filterCloud($cloud, 'word_blacklist');
@@ -200,17 +231,15 @@ function _getWordCloud($num, &$min, &$max) {
200231
/**
201232
* Adds all words in given index as $word => $freq to $cloud array
202233
*/
203-
function _addWordsToCloud(&$cloud, $idx, $word_idx, &$stopwords) {
234+
function _addWordsToCloud(&$cloud, $idx, $word_idx) {
204235
$wcount = count($word_idx);
205236

206237
// collect the frequency of the words
207238
for ($i = 0; $i < $wcount; $i++) {
208239
$key = trim($word_idx[$i]);
209-
if (!is_int(array_search($key, $stopwords))) {
210-
$value = explode(':', $idx[$i]);
211-
if (!trim($value[0])) continue;
212-
$cloud[$key] = count($value);
213-
}
240+
$value = explode(':', $idx[$i]);
241+
if (!trim($value[0])) continue;
242+
$cloud[$key] = count($value);
214243
}
215244
}
216245

0 commit comments

Comments
 (0)