Skip to content

Commit cb5b31c

Browse files
authored
Fix errors in stemming factory for W/ASM build with newest ZIMs #92 (#93)
This introduces a whitelist of supported languages for the Xapian stemming module.
1 parent 5531753 commit cb5b31c

File tree

5 files changed

+141
-16
lines changed

5 files changed

+141
-16
lines changed

Makefile

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ build/lib/liblzma.so :
4141

4242
build/lib/libz.a :
4343
# Version not yet available in dev.kiwix.org
44-
wget -N https://zlib.net/zlib-1.3.tar.gz
44+
wget -N https://zlib.net/zlib-1.3.1.tar.gz
4545
tar xf zlib-*.tar.gz
4646
cd zlib-*/ ; emconfigure ./configure --prefix=`pwd`/../build
4747
cd zlib-*/ ; emmake make
@@ -67,18 +67,32 @@ build/lib/libicudata.so :
6767

6868
build/lib/libxapian.a : build/lib/libz.a
6969
# Origin: https://oligarchy.co.uk/xapian/1.4.18/xapian-core-1.4.18.tar.xz
70-
[ ! -f xapian-*.tar.gz ] && wget -N https://dev.kiwix.org/kiwix-build/xapian-core-1.4.23.tar.xz || true
70+
# Also: https://dev.kiwix.org/kiwix-build/xapian-core-1.4.23.tar.xz
71+
[ ! -f xapian-*.tar.gz ] && wget -N https://oligarchy.co.uk/xapian/1.4.29/xapian-core-1.4.29.tar.xz || true
7172
tar xf xapian-core-*.tar.xz
7273
# Some options coming from https://github.com/xapian/xapian/tree/master/xapian-core/emscripten
7374
# cd xapian-core-1.4.18; emconfigure ./configure --prefix=`pwd`/../build "CFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib" "CXXFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib" CPPFLAGS='-DFLINTLOCK_USE_FLOCK' CXXFLAGS='-Oz -s USE_ZLIB=1 -fno-rtti' --disable-backend-honey --disable-backend-inmemory --disable-shared --disable-backend-remote
7475
cd xapian-core-*/ ; emconfigure ./configure --prefix=`pwd`/../build "CFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib" "CXXFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib" --disable-shared --disable-backend-remote
75-
cd xapian-core-*/ ; emmake make "CFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib -std=c++14" "CXXFLAGS="-I`pwd`/../build/include -L`pwd`/../build/lib -std=c++14"
76+
cd xapian-core-*/ ; emmake make "CFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib -std=c++14" "CXXFLAGS=-I`pwd`/../build/include -L`pwd`/../build/lib -std=c++14"
7677
cd xapian-core-*/ ; emmake make install
7778

7879
build/lib/libzim.a : build/lib/liblzma.so build/lib/libz.a build/lib/libzstd.a build/lib/libicudata.so build/lib/libxapian.a
7980
# Origin: wget -N --content-disposition https://github.com/openzim/libzim/archive/7.2.2.tar.gz
80-
[ ! -f libzim-*.tar.xz ] && wget -N https://download.openzim.org/release/libzim/libzim-9.0.0.tar.xz || true
81+
[ ! -f libzim-*.tar.xz ] && wget -N https://download.openzim.org/release/libzim/libzim-9.3.0.tar.xz || true
8182
tar xf libzim-*.tar.xz
83+
@echo "=== APPLYING WHITELIST-BASED LIBZIM PATCHES ==="
84+
# Add required header for std::set
85+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-*/src/search.cpp
86+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-*/src/suggestion.cpp
87+
# SEARCH.CPP - Whitelist all Xapian-supported languages, use 'none' for all others
88+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"ar", "hy", "eu", "ca", "da", "nl", "en", "fi", "fr", "de", "el", "hi", "hu", "id", "ga", "it", "lt", "ne", "no", "pt", "ro", "ru", "sr", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-*/src/search.cpp
89+
# SUGGESTION.CPP - Whitelist all Xapian-supported languages, use 'none' for all others
90+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"ar", "hy", "eu", "ca", "da", "nl", "en", "fi", "fr", "de", "el", "hi", "hu", "id", "ga", "it", "lt", "ne", "no", "pt", "ro", "ru", "sr", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-*/src/suggestion.cpp
91+
@echo "=== VERIFYING PATCHES APPLIED ==="
92+
@echo "search.cpp - Headers added: $$(grep -c '#include <set>' libzim-*/src/search.cpp || echo '0')"
93+
@echo "suggestion.cpp - Headers added: $$(grep -c '#include <set>' libzim-*/src/suggestion.cpp || echo '0')"
94+
@echo "search.cpp - Whitelist added: $$(grep -c 'supportedLangs' libzim-*/src/search.cpp || echo '0')"
95+
@echo "suggestion.cpp - Whitelist added: $$(grep -c 'supportedLangs' libzim-*/src/suggestion.cpp || echo '0')"
8296
# It's no use trying to compile examples
8397
sed -i -e "s/^subdir('examples')//" libzim-*/meson.build
8498
cd libzim-*/ ; PKG_CONFIG_PATH=/src/build/lib/pkgconfig meson --prefix=`pwd`/../build --cross-file=../emscripten-crosscompile.ini . build -DUSE_MMAP=false
@@ -121,4 +135,4 @@ clean :
121135
rm -rf libzim_wasm-*
122136
rm -rf build
123137

124-
.PHONY : all clean
138+
.PHONY : all clean

complete-whitelist.mk

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/make -f
2+
3+
# Complete whitelist approach with all Xapian-supported languages
4+
# Run with: make -f complete-whitelist.mk
5+
6+
all: complete-whitelist
7+
8+
complete-whitelist:
9+
@echo "=== Downloading fresh libzim 9.3.0 ==="
10+
rm -rf libzim-9.3.0 9.3.0.tar.gz
11+
wget -q https://github.com/openzim/libzim/archive/9.3.0.tar.gz
12+
tar -xzf 9.3.0.tar.gz
13+
14+
@echo "=== APPLYING COMPLETE WHITELIST PATCHES ==="
15+
16+
# Add #include <set> to both files first
17+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-9.3.0/src/search.cpp
18+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-9.3.0/src/suggestion.cpp
19+
20+
# SEARCH.CPP - Complete list of all Xapian-supported languages
21+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"ar", "hy", "eu", "ca", "da", "nl", "en", "fi", "fr", "de", "el", "hi", "hu", "id", "ga", "it", "lt", "ne", "no", "pt", "ro", "ru", "sr", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-9.3.0/src/search.cpp
22+
23+
# SUGGESTION.CPP - Complete list of all Xapian-supported languages
24+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"ar", "hy", "eu", "ca", "da", "nl", "en", "fi", "fr", "de", "el", "hi", "hu", "id", "ga", "it", "lt", "ne", "no", "pt", "ro", "ru", "sr", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-9.3.0/src/suggestion.cpp
25+
26+
@echo "=== VERIFICATION ==="
27+
@echo "search.cpp - Include added: $$(grep -c '#include <set>' libzim-9.3.0/src/search.cpp || echo '0')"
28+
@echo "suggestion.cpp - Include added: $$(grep -c '#include <set>' libzim-9.3.0/src/suggestion.cpp || echo '0')"
29+
@echo "search.cpp - Complete whitelist added: $$(grep -c 'supportedLangs' libzim-9.3.0/src/search.cpp || echo '0')"
30+
@echo "suggestion.cpp - Complete whitelist added: $$(grep -c 'supportedLangs' libzim-9.3.0/src/suggestion.cpp || echo '0')"
31+
@echo ""
32+
@echo "Supported languages: ar, hy, eu, ca, da, nl, en, fi, fr, de, el, hi, hu, id, ga, it, lt, ne, no, pt, ro, ru, sr, es, sv, tr"
33+
@echo "All other languages will use 'none' stemmer"
34+
35+
clean:
36+
rm -rf libzim-9.3.0 9.3.0.tar.gz
37+
38+
.PHONY: all complete-whitelist clean

libzim_bindings.cpp

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ class SuggestionSearchWrapper {
150150
// Implement the suggest method (needs to be after SuggestionSearchWrapper definition)
151151
SuggestionSearchWrapper SuggestionSearcherWrapper::suggest(const std::string& query) {
152152
try {
153+
// Use suggest() method
153154
zim::SuggestionSearch search = searcher->suggest(query);
154155
// Use move constructor
155156
return SuggestionSearchWrapper(std::move(search));
@@ -173,23 +174,59 @@ std::unique_ptr<EntryWrapper> getEntryByPath(std::string url) {
173174
}
174175
}
175176

176-
// Search for a text, and returns the path of the first result
177+
// Full-text search: returns up to numResults matching entries (empty vector on error)
177178
std::vector<EntryWrapper> search(std::string text, int numResults) {
178-
auto searcher = zim::Searcher(*g_archive);
179-
auto query = zim::Query(text);
180-
auto search = searcher.search(query);
181-
auto searchResultSet = search.getResults(0, numResults);
182-
std::vector<EntryWrapper> ret;
183-
for(auto entry:searchResultSet) {
184-
ret.push_back(EntryWrapper(entry));
179+
try {
180+
auto searcher = zim::Searcher(*g_archive);
181+
182+
// FIX: Use proper Query construction
183+
zim::Query query; // Create empty query first
184+
query.setQuery(text); // Then set the query text
185+
186+
auto search = searcher.search(query);
187+
auto searchResultSet = search.getResults(0, numResults);
188+
std::vector<EntryWrapper> ret;
189+
for(auto entry:searchResultSet) {
190+
ret.push_back(EntryWrapper(entry));
191+
}
192+
return ret;
193+
} catch(const std::exception& e) {
194+
std::cout << "Search error: " << e.what() << std::endl;
195+
return std::vector<EntryWrapper>();
196+
}
197+
}
198+
199+
// Search variant that accepts a language argument; the language is not applied yet (see TODO in the body)
200+
std::vector<EntryWrapper> searchWithLanguage(std::string text, int numResults, std::string language = "") {
201+
try {
202+
auto searcher = zim::Searcher(*g_archive);
203+
zim::Query query;
204+
205+
// Set the query text using proper API
206+
query.setQuery(text);
207+
208+
// TODO: Add language control if libzim supports it
209+
// This might require additional Query methods or Searcher configuration
210+
211+
auto search = searcher.search(query);
212+
auto searchResultSet = search.getResults(0, numResults);
213+
std::vector<EntryWrapper> ret;
214+
for(auto entry:searchResultSet) {
215+
ret.push_back(EntryWrapper(entry));
216+
}
217+
return ret;
218+
} catch(const std::exception& e) {
219+
std::cout << "Search with language error: " << e.what() << std::endl;
220+
return std::vector<EntryWrapper>();
185221
}
186-
return ret;
187222
}
188223

189-
// Suggestion search function (alternative to class-based approach)
224+
// Suggestion search function: asks the suggestion searcher for up to numResults entries matching text
190225
std::vector<EntryWrapper> suggest(std::string text, int numResults) {
191226
try {
192227
auto suggestionSearcher = zim::SuggestionSearcher(*g_archive);
228+
229+
// Use suggest() method
193230
auto suggestionSearch = suggestionSearcher.suggest(text);
194231
auto resultSet = suggestionSearch.getResults(0, numResults);
195232
std::vector<EntryWrapper> ret;
@@ -218,6 +255,7 @@ EMSCRIPTEN_BINDINGS(libzim_module) {
218255
emscripten::function("getEntryByPath", &getEntryByPath);
219256
emscripten::function("getArticleCount", &getArticleCount);
220257
emscripten::function("search", &search);
258+
emscripten::function("searchWithLanguage", &searchWithLanguage);
221259
emscripten::function("suggest", &suggest);
222260
emscripten::register_vector<char>("vector<char>");
223261
emscripten::register_vector<EntryWrapper>("vector(EntryWrapper)");
@@ -243,4 +281,4 @@ EMSCRIPTEN_BINDINGS(libzim_module) {
243281
.function("getEstimatedMatches", &SuggestionSearchWrapper::getEstimatedMatches)
244282
.function("getResults", &SuggestionSearchWrapper::getResults)
245283
;
246-
}
284+
}

search-debug.patch

Whitespace-only changes.

whitelist-complete.mk

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/make -f
2+
3+
# Whitelist approach with proper headers
4+
# Run with: make -f whitelist-complete.mk
5+
6+
all: whitelist-complete
7+
8+
whitelist-complete:
9+
@echo "=== Downloading fresh libzim 9.3.0 ==="
10+
rm -rf libzim-9.3.0 9.3.0.tar.gz
11+
wget -q https://github.com/openzim/libzim/archive/9.3.0.tar.gz
12+
tar -xzf 9.3.0.tar.gz
13+
14+
@echo "=== APPLYING WHITELIST PATCHES ==="
15+
16+
# Add #include <set> to both files first
17+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-9.3.0/src/search.cpp
18+
sed -i '/#include <unicode\/locid.h>/a #include <set>' libzim-9.3.0/src/suggestion.cpp
19+
20+
# SEARCH.CPP - Only allow known supported languages
21+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-9.3.0/src/search.cpp
22+
23+
# SUGGESTION.CPP - Only allow known supported languages
24+
sed -i 's/m_stemmer = Xapian::Stem(languageLocale.getLanguage());/{ std::string stemLang = languageLocale.getLanguage(); static const std::set<std::string> supportedLangs = {"da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", "sv", "tr"}; if (supportedLangs.find(stemLang) != supportedLangs.end()) { m_stemmer = Xapian::Stem(stemLang); } else { m_stemmer = Xapian::Stem("none"); } }/' libzim-9.3.0/src/suggestion.cpp
25+
26+
@echo "=== VERIFICATION ==="
27+
@echo "search.cpp - Include added: $$(grep -c '#include <set>' libzim-9.3.0/src/search.cpp || echo '0')"
28+
@echo "suggestion.cpp - Include added: $$(grep -c '#include <set>' libzim-9.3.0/src/suggestion.cpp || echo '0')"
29+
@echo "search.cpp - Whitelist added: $$(grep -c 'supportedLangs' libzim-9.3.0/src/search.cpp || echo '0')"
30+
@echo "suggestion.cpp - Whitelist added: $$(grep -c 'supportedLangs' libzim-9.3.0/src/suggestion.cpp || echo '0')"
31+
32+
clean:
33+
rm -rf libzim-9.3.0 9.3.0.tar.gz
34+
35+
.PHONY: all whitelist-complete clean

0 commit comments

Comments
 (0)