diff --git a/.travis.yml b/.travis.yml index a75414cf..6f458cbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ python: - "3.7" install: - - pip install pytest nose codecov coverage cached-property + - pip install pytest nose codecov coverage cached-property jieba script: - nosetests --with-coverage diff --git a/setup.py b/setup.py index c4997749..d5accade 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,8 @@ #!python -import os.path, sys +import os.path +import sys + from setuptools import setup, find_packages from setuptools.command.test import test as TestCommand @@ -20,7 +22,7 @@ def finalize_options(self): self.test_suite = True def run_tests(self): - #import here, cause outside the eggs aren't loaded + # import here, cause outside the eggs aren't loaded import pytest pytest.main(self.test_args) @@ -44,18 +46,18 @@ def run_tests(self): zip_safe=True, install_requires=['cached-property'], - tests_require=['pytest'], + tests_require=['pytest', 'jieba'], cmdclass={'test': PyTest}, classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", - "Natural Language :: English", - "Operating System :: OS Independent", - "Programming Language :: Python :: 2.5", - "Programming Language :: Python :: 3", - "Topic :: Software Development :: Libraries :: Python Modules", - "Topic :: Text Processing :: Indexing", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 3", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: Text Processing :: Indexing", ], ) diff --git a/src/whoosh/highlight.py b/src/whoosh/highlight.py index 6fbbe6d0..212a7157 100644 --- a/src/whoosh/highlight.py +++ b/src/whoosh/highlight.py @@ -131,8 +131,8 @@ def 
__init__(self, text, matches, startchar=0, endchar=-1): self.matched_terms.add(t.text) def __repr__(self): - return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, - len(self.matches)) + return "<Fragment %d:%d %d>" % (self.startchar, self.endchar, + len(self.matches)) def __len__(self): return self.endchar - self.startchar @@ -695,7 +695,12 @@ def format_fragment(self, fragment, replace=False): index = fragment.startchar text = fragment.text - for t in fragment.matches: + # For overlapping tokens (such as in Chinese), sort by position, + # then by inverse of length. + # Because the formatter is sequential, it will only pick the first + # token for a given position to highlight. This makes sure it picks + # the longest overlapping token. + for t in sorted(fragment.matches, key=lambda token: (token.startchar, -(token.endchar - token.startchar))): if t.startchar is None: continue if t.startchar < index: diff --git a/tests/test_highlighting.py b/tests/test_highlighting.py index 523dff6b..1647d1bf 100644 --- a/tests/test_highlighting.py +++ b/tests/test_highlighting.py @@ -3,6 +3,7 @@ from __future__ import with_statement import pytest +from jieba.analyse import ChineseAnalyzer from whoosh import analysis, highlight, fields, qparser, query from whoosh.compat import u @@ -330,3 +331,24 @@ def test_whole_noterms(): hi = r[0].highlights("text", minscore=0) assert hi == u("alfa bravo charlie delta echo foxtrot golf") + + +def test_overlapping_tokens(): + query_string = u'马克思' + text = u'两次历史性飞跃与马克思主义中国化' + analyzer = ChineseAnalyzer() + formatter = highlight.HtmlFormatter() + + terms = [token.text for token in analyzer(query_string)] + + output = highlight.highlight( + text, + terms, + analyzer, + highlight.WholeFragmenter(), + formatter + ) + + assert output == u'两次历史性飞跃与<strong class="match term0">马克思</strong>主义中国化', \ + u'The longest overlapping token 马克思 was not selected by the highlighter' + # as opposed to '两次历史性飞跃与<strong class="match term0">马克</strong>思主义中国化'