
Commit e1e41a9

Add passthrough option to URLTokenFilter. Delegate to URLTokenizer for tokenization in URLTokenFilter.
1 parent 8ce9054 commit e1e41a9

8 files changed, 163 insertions(+), 36 deletions(-)

README.md (+6 -1)

````diff
@@ -9,6 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.
 
 | Elasticsearch Version | Plugin Version |
 |-----------------------|----------------|
+| 2.1.1 | 2.2.0 |
 | 2.1.1 | 2.1.1 |
 | 2.0.0 | 2.1.0 |
 | 1.6.x, 1.7.x | 2.0.0 |
@@ -18,7 +19,7 @@ This plugin enables URL tokenization and token filtering by URL part.
 
 ## Installation
 ```bash
-bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.1.1/elasticsearch-analysis-url-2.1.1.zip
+bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.2.0/elasticsearch-analysis-url-2.2.0.zip
 ```
 
 ## Usage
@@ -87,6 +88,10 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
 * `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
 * `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
 If the desired part cannot be found, no value will be indexed for that field.
+* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-url tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
+* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
+* `tokenize_path`: Defaults to `true`. If `true`, the path will be tokenized using a [path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `/`.
+* `tokenize_query`: Defaults to `true`. If `true`, the query string will be split on `&`.
 
 #### Example:
 Set up your index like so:
````
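To illustrate what `passthrough` changes at the Lucene level, here is a minimal standalone sketch (not code from this repository) that wires the filter behind a `WhitespaceTokenizer`, which is what the test analyzer defined later in this commit does; the sample input and the expected terms are taken from the integration test added below.

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenFilter;

import java.io.StringReader;

public class PassthroughSketch {
    public static void main(String[] args) throws Exception {
        // Whitespace tokenization first, then URL filtering of each token.
        WhitespaceTokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("http://foo.com:9200/foo.bar baz bat.blah"));

        // part=host, url_decode=false, allow_malformed=false, passthrough=true
        TokenStream stream = new URLTokenFilter(source, URLPart.HOST, false, false, true);

        stream.reset();
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
        // Expected per the integration test below: foo.com, com, baz, bat.blah
        // The host of the valid URL is tokenized (tokenize_host defaults to true),
        // while "baz" and "bat.blah" are non-URL tokens passed through unchanged.
    }
}
```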

pom.xml (+1 -1)

```diff
@@ -6,7 +6,7 @@
 
     <groupId>org.elasticsearch</groupId>
     <artifactId>elasticsearch-analysis-url</artifactId>
-    <version>2.1.1</version>
+    <version>2.2.0</version>
     <packaging>jar</packaging>
 
     <description>Elasticsearch URL token filter plugin</description>
```

src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java (+12 -1)

```diff
@@ -16,19 +16,30 @@
 public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
     private final URLPart part;
     private final boolean urlDecode;
+    private boolean tokenizeHost;
+    private boolean tokenizePath;
+    private boolean tokenizeQuery;
     private final boolean allowMalformed;
+    private final boolean passthrough;
 
     @Inject
     public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings.indexSettings(), name, settings);
 
         this.part = URLPart.fromString(settings.get("part", "whole"));
         this.urlDecode = settings.getAsBoolean("url_decode", false);
+        this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
+        this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
+        this.tokenizeQuery = settings.getAsBoolean("tokenize_query", true);
         this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
+        this.passthrough = settings.getAsBoolean("passthrough", false);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed);
+        return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
+                .setTokenizeHost(tokenizeHost)
+                .setTokenizePath(tokenizePath)
+                .setTokenizeQuery(tokenizeQuery);
     }
 }
```

src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java (+112 -27)

```diff
@@ -1,15 +1,20 @@
 package org.elasticsearch.index.analysis.url;
 
-import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableList;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
+import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.index.analysis.URLPart;
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.net.MalformedURLException;
-import java.net.URL;
-import java.net.URLDecoder;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -24,11 +29,29 @@ public final class URLTokenFilter extends TokenFilter {
 
     private final boolean urlDeocde;
 
+    /**
+     * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
+     */
+    private boolean tokenizeHost = true;
+
+    /**
+     * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
+     */
+    private boolean tokenizePath = true;
+
+    /**
+     * If true, the url's query string will be split on &
+     */
+    private boolean tokenizeQuery = true;
+
     private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
 
     private final boolean allowMalformed;
 
-    private boolean parsed;
+    private boolean passthrough;
+
+    private List<String> tokens;
+    private Iterator<String> iterator;
 
     public URLTokenFilter(TokenStream input, URLPart part) {
         this(input, part, false);
@@ -39,49 +62,111 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
     }
 
     public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed) {
+        this(input, part, urlDecode, allowMalformed, false);
+    }
+
+    public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
         super(input);
         this.part = part;
         this.urlDeocde = urlDecode;
         this.allowMalformed = allowMalformed;
+        this.passthrough = passthrough;
+    }
+
+
+    public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
+        this.tokenizeHost = tokenizeHost;
+        return this;
+    }
+
+    public URLTokenFilter setTokenizePath(boolean tokenizePath) {
+        this.tokenizePath = tokenizePath;
+        return this;
+    }
+
+    public URLTokenFilter setTokenizeQuery(boolean tokenizeQuery) {
+        this.tokenizeQuery = tokenizeQuery;
+        return this;
     }
 
+
     @Override
     public boolean incrementToken() throws IOException {
-        if (input.incrementToken() && !parsed) {
-            final String urlString = termAttribute.toString();
-            termAttribute.setEmpty();
-            if (Strings.isNullOrEmpty(urlString) || urlString.equals("null")) {
+        if(iterator == null || !iterator.hasNext()){
+            if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
+                return false;
+            }
+        }
+        clearAttributes();
+        String next = iterator.next();
+        if (allowMalformed) {
+            next = parseMalformed(next);
+        }
+        termAttribute.append(next);
+        return true;
+    }
+
+
+    /**
+     * Advance to the next token, if any
+     * @return true if more tokens are forthcoming, false otherwise
+     * @throws IOException
+     */
+    private boolean advance() throws IOException {
+        if (input.incrementToken()) {
+            String urlString = termAttribute.toString();
+            if ((Strings.isNullOrEmpty(urlString) || "null".equals(urlString)) && !allowMalformed && !passthrough) {
                 return false;
             }
-            String partString;
             try {
-                URL url = new URL(urlString);
-                partString = URLUtils.getPart(url, part);
-                parsed = !Strings.isNullOrEmpty(partString);
-            } catch (MalformedURLException e) {
-                if (allowMalformed) {
-                    partString = parseMalformed(urlString);
-                    if (Strings.isNullOrEmpty(partString)) {
-                        return false;
+                tokens = tokenize(urlString);
+            } catch (IOException e) {
+                if (e.getMessage().contains("Malformed URL")) {
+                    if (allowMalformed) {
+                        tokens = ImmutableList.of(urlString);
+                    } else {
+                        throw new MalformedURLException("Malformed URL: " + urlString);
                     }
-                    parsed = true;
-                } else {
-                    throw e;
                 }
+                throw e;
             }
-            if (urlDeocde) {
-                partString = URLDecoder.decode(partString, "UTF-8");
-            }
-            termAttribute.append(partString);
+            iterator = tokens.iterator();
             return true;
+        } else {
+            return false;
         }
-        return false;
     }
 
+
+    /**
+     * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
+     * will be passed along to the tokenizer.
+     * @param input a string to be tokenized
+     * @return a list of tokens extracted from the input string
+     * @throws IOException
+     */
+    private List<String> tokenize(String input) throws IOException {
+        List<String> tokens = new ArrayList<>();
+        URLTokenizer tokenizer = new URLTokenizer(part);
+        tokenizer.setUrlDecode(urlDeocde);
+        tokenizer.setTokenizeHost(tokenizeHost);
+        tokenizer.setTokenizePath(tokenizePath);
+        tokenizer.setTokenizeQuery(tokenizeQuery);
+        tokenizer.setAllowMalformed(allowMalformed || passthrough);
+        tokenizer.setReader(new StringReader(input));
+        tokenizer.reset();
+        while (tokenizer.incrementToken()) {
+            tokens.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
+        }
+        return tokens;
+    }
+
+
     @Override
     public void reset() throws IOException {
         super.reset();
-        parsed = false;
+        tokens = null;
+        iterator = null;
     }
 
     private static final Pattern REGEX_PROTOCOL = Pattern.compile("^([a-zA-Z]+)(?=://)");
@@ -104,7 +189,7 @@ private String parseMalformed(String urlString) {
             case WHOLE:
                 return urlString;
             default:
-                return null;
+                return urlString;
         }
     }
```
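The core of this change is that the filter now delegates to `URLTokenizer` (via the new `tokenize()` helper) and buffers the resulting terms in a list, replaying them through `iterator` on subsequent `incrementToken()` calls so a single input token can yield several output tokens. Below is a rough standalone sketch of that delegation (not code from the repository), using only the tokenizer setters shown above; the sample URL and the expected host terms are illustrative.

```java
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

public class TokenizeSketch {
    // Collect the terms URLTokenizer emits for one URL string, configured the
    // same way URLTokenFilter#tokenize configures it for the HOST part.
    static List<String> hostTokens(String url) throws IOException {
        List<String> terms = new ArrayList<>();
        URLTokenizer tokenizer = new URLTokenizer(URLPart.HOST);
        tokenizer.setUrlDecode(false);
        tokenizer.setTokenizeHost(true);    // reverse path hierarchy on "."
        tokenizer.setTokenizePath(true);    // path hierarchy on "/"
        tokenizer.setTokenizeQuery(true);   // split the query string on "&"
        tokenizer.setAllowMalformed(true);  // allowMalformed || passthrough
        tokenizer.setReader(new StringReader(url));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            terms.add(tokenizer.getAttribute(CharTermAttribute.class).toString());
        }
        tokenizer.end();
        tokenizer.close();
        return terms;
    }

    public static void main(String[] args) throws IOException {
        // Expected to print something like [foo.com, com] for the host part.
        System.out.println(hostTokens("http://foo.com:9200/foo.bar?baz=bat"));
    }
}
```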

src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java (+1 -1)

```diff
@@ -287,7 +287,7 @@ private int getEndIndex(int start, String partStringRaw) {
      * @return a list of tokens
      * @throws IOException
      */
-    List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
+    private List<Token> tokenize(URLPart part, Tokenizer tokenizer, int start) throws IOException {
         tokenizer.reset();
         List<Token> tokens = new ArrayList<>();
         OffsetAttribute offset;
```

src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java (+13)

```diff
@@ -12,6 +12,7 @@
 
 import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTPS_URL;
 import static org.elasticsearch.index.analysis.url.URLTokenFilterTest.TEST_HTTP_URL;
+import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasSize;
 
 /**
@@ -65,6 +66,18 @@ public void testMalformed() {
         assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
     }
 
+
+    @Test
+    public void testPassthrough() {
+        List<AnalyzeResponse.AnalyzeToken> tokens = analyzeURL("http://foo.com:9200/foo.bar baz bat.blah", "url_host_passthrough");
+        assertThat(tokens, hasSize(4));
+        assertThat(tokens.get(0).getTerm(), equalTo("foo.com"));
+        assertThat(tokens.get(1).getTerm(), equalTo("com"));
+        assertThat(tokens.get(2).getTerm(), equalTo("baz"));
+        assertThat(tokens.get(3).getTerm(), equalTo("bat.blah"));
+    }
+
+
     @Test
     public void testIndex() {
         Map<String, Object> doc = new HashMap<>();
```

src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java (+3 -3)

```diff
@@ -25,7 +25,7 @@ public void testFilterProtocol() throws IOException {
 
     @Test
     public void testFilterHost() throws IOException {
-        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST), "www.foo.bar.com");
+        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.HOST).setTokenizeHost(false), "www.foo.bar.com");
     }
 
     @Test
@@ -35,7 +35,7 @@ public void testFilterPort() throws IOException {
 
     @Test
    public void testFilterPath() throws IOException {
-        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH), "/index_name/type_name/_search.html");
+        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.PATH).setTokenizePath(false), "/index_name/type_name/_search.html");
     }
 
     @Test
@@ -45,7 +45,7 @@ public void testFilterRef() throws IOException {
 
     @Test
     public void testFilterQuery() throws IOException {
-        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY), "foo=bar&baz=bat");
+        assertTokenStreamContents(createFilter(TEST_HTTP_URL, URLPart.QUERY).setTokenizeQuery(false), "foo=bar&baz=bat");
     }
 
     @Test(expected = MalformedURLException.class)
```

src/test/resources/test-settings.json (+15 -2)

```diff
@@ -25,7 +25,8 @@
       },
       "url_host": {
         "type": "url",
-        "part": "host"
+        "part": "host",
+        "tokenize_host": false
       },
       "url_port": {
         "type": "url",
@@ -34,12 +35,18 @@
       "url_query": {
         "type": "url",
         "part": "query",
-        "url_decode": true
+        "url_decode": true,
+        "tokenize_query": false
       },
       "url_port_malformed": {
         "type": "url",
         "part": "port",
         "allow_malformed": true
+      },
+      "url_host_passthrough": {
+        "type": "url",
+        "part": "host",
+        "passthrough": "true"
       }
     },
     "analyzer": {
@@ -73,6 +80,12 @@
         ],
         "tokenizer": "whitespace"
       },
+      "url_host_passthrough": {
+        "filter": [
+          "url_host_passthrough"
+        ],
+        "tokenizer": "whitespace"
+      },
       "tokenizer_url_protocol": {
         "tokenizer": "url_protocol"
       },
```
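For completeness, this is roughly how the new `url_host_passthrough` analyzer could be exercised through the `_analyze` API from a test, assuming an Elasticsearch 2.x `Client` and an index created with the settings above; the helper mirrors the `analyzeURL` call in the integration test, and the class and method names here are illustrative, not from the repository.

```java
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;

import java.util.List;

public class AnalyzeSketch {
    // Run the url_host_passthrough analyzer from test-settings.json against some text.
    static List<AnalyzeResponse.AnalyzeToken> analyze(Client client, String index, String text) {
        AnalyzeResponse response = client.admin().indices()
                .prepareAnalyze(index, text)
                .setAnalyzer("url_host_passthrough")
                .get();
        return response.getTokens();
    }
    // For "http://foo.com:9200/foo.bar baz bat.blah" the integration test
    // expects the terms: foo.com, com, baz, bat.blah.
}
```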
