1
1
package org .elasticsearch .index .analysis .url ;
2
2
3
- import com .google .common .base . Strings ;
3
+ import com .google .common .collect . ImmutableList ;
4
4
import org .apache .lucene .analysis .TokenFilter ;
5
5
import org .apache .lucene .analysis .TokenStream ;
6
+ import org .apache .lucene .analysis .path .PathHierarchyTokenizer ;
7
+ import org .apache .lucene .analysis .path .ReversePathHierarchyTokenizer ;
6
8
import org .apache .lucene .analysis .tokenattributes .CharTermAttribute ;
9
+ import org .elasticsearch .common .Strings ;
7
10
import org .elasticsearch .index .analysis .URLPart ;
8
11
9
12
import java .io .IOException ;
13
+ import java .io .StringReader ;
10
14
import java .net .MalformedURLException ;
11
- import java .net .URL ;
12
- import java .net .URLDecoder ;
15
+ import java .util .ArrayList ;
16
+ import java .util .Iterator ;
17
+ import java .util .List ;
13
18
import java .util .regex .Matcher ;
14
19
import java .util .regex .Pattern ;
15
20
@@ -24,11 +29,29 @@ public final class URLTokenFilter extends TokenFilter {
24
29
25
30
private final boolean urlDeocde ;
26
31
32
+ /**
33
+ * If true, the url's host will be tokenized using a {@link ReversePathHierarchyTokenizer}
34
+ */
35
+ private boolean tokenizeHost = true ;
36
+
37
+ /**
38
+ * If true, the url's path will be tokenized using a {@link PathHierarchyTokenizer}
39
+ */
40
+ private boolean tokenizePath = true ;
41
+
42
+ /**
43
+ * If true, the url's query string will be split on &
44
+ */
45
+ private boolean tokenizeQuery = true ;
46
+
27
47
private final CharTermAttribute termAttribute = addAttribute (CharTermAttribute .class );
28
48
29
49
private final boolean allowMalformed ;
30
50
31
- private boolean parsed ;
51
+ private boolean passthrough ;
52
+
53
+ private List <String > tokens ;
54
+ private Iterator <String > iterator ;
32
55
33
56
public URLTokenFilter (TokenStream input , URLPart part ) {
34
57
this (input , part , false );
@@ -39,49 +62,111 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
39
62
}
40
63
41
64
/**
 * Creates a filter with passthrough disabled.
 *
 * @param input          the upstream token stream
 * @param part           the URL part handed to the tokenizer
 * @param urlDecode      forwarded to the tokenizer's url-decode setting
 * @param allowMalformed whether malformed URLs are tolerated
 */
public URLTokenFilter(TokenStream input,
                      URLPart part,
                      boolean urlDecode,
                      boolean allowMalformed) {
    this(input, part, urlDecode, allowMalformed, false);
}
67
+
68
/**
 * Creates a fully configured filter.
 *
 * @param input          the upstream token stream
 * @param part           the URL part handed to the tokenizer
 * @param urlDecode      forwarded to the tokenizer's url-decode setting
 * @param allowMalformed whether malformed URLs are tolerated
 * @param passthrough    if false, the filter stops once the tokens of the first
 *                       input term are exhausted; if true, it keeps pulling
 *                       further terms from {@code input}
 */
public URLTokenFilter(TokenStream input,
                      URLPart part,
                      boolean urlDecode,
                      boolean allowMalformed,
                      boolean passthrough) {
    super(input);
    this.passthrough = passthrough;
    this.allowMalformed = allowMalformed;
    this.urlDeocde = urlDecode;
    this.part = part;
}
75
+
76
+
77
+ public URLTokenFilter setTokenizeHost (boolean tokenizeHost ) {
78
+ this .tokenizeHost = tokenizeHost ;
79
+ return this ;
80
+ }
81
+
82
+ public URLTokenFilter setTokenizePath (boolean tokenizePath ) {
83
+ this .tokenizePath = tokenizePath ;
84
+ return this ;
85
+ }
86
+
87
+ public URLTokenFilter setTokenizeQuery (boolean tokenizeQuery ) {
88
+ this .tokenizeQuery = tokenizeQuery ;
89
+ return this ;
46
90
}
47
91
92
+
48
93
@ Override
49
94
public boolean incrementToken () throws IOException {
50
- if (input .incrementToken () && !parsed ) {
51
- final String urlString = termAttribute .toString ();
52
- termAttribute .setEmpty ();
53
- if (Strings .isNullOrEmpty (urlString ) || urlString .equals ("null" )) {
95
+ if (iterator == null || !iterator .hasNext ()){
96
+ if ((iterator != null && !iterator .hasNext () && !passthrough ) || !advance ()) {
97
+ return false ;
98
+ }
99
+ }
100
+ clearAttributes ();
101
+ String next = iterator .next ();
102
+ if (allowMalformed ) {
103
+ next = parseMalformed (next );
104
+ }
105
+ termAttribute .append (next );
106
+ return true ;
107
+ }
108
+
109
+
110
+ /**
111
+ * Advance to the next token, if any
112
+ * @return true if more tokens are forthcoming, false otherwise
113
+ * @throws IOException
114
+ */
115
+ private boolean advance () throws IOException {
116
+ if (input .incrementToken ()) {
117
+ String urlString = termAttribute .toString ();
118
+ if ((Strings .isNullOrEmpty (urlString ) || "null" .equals (urlString )) && !allowMalformed && !passthrough ) {
54
119
return false ;
55
120
}
56
- String partString ;
57
121
try {
58
- URL url = new URL (urlString );
59
- partString = URLUtils .getPart (url , part );
60
- parsed = !Strings .isNullOrEmpty (partString );
61
- } catch (MalformedURLException e ) {
62
- if (allowMalformed ) {
63
- partString = parseMalformed (urlString );
64
- if (Strings .isNullOrEmpty (partString )) {
65
- return false ;
122
+ tokens = tokenize (urlString );
123
+ } catch (IOException e ) {
124
+ if (e .getMessage ().contains ("Malformed URL" )) {
125
+ if (allowMalformed ) {
126
+ tokens = ImmutableList .of (urlString );
127
+ } else {
128
+ throw new MalformedURLException ("Malformed URL: " + urlString );
66
129
}
67
- parsed = true ;
68
- } else {
69
- throw e ;
70
130
}
131
+ throw e ;
71
132
}
72
- if (urlDeocde ) {
73
- partString = URLDecoder .decode (partString , "UTF-8" );
74
- }
75
- termAttribute .append (partString );
133
+ iterator = tokens .iterator ();
76
134
return true ;
135
+ } else {
136
+ return false ;
77
137
}
78
- return false ;
79
138
}
80
139
140
+
141
+ /**
142
+ * Tokenize the given input using a {@link URLTokenizer}. Settings which have been set on this {@link URLTokenFilter}
143
+ * will be passed along to the tokenizer.
144
+ * @param input a string to be tokenized
145
+ * @return a list of tokens extracted from the input string
146
+ * @throws IOException
147
+ */
148
+ private List <String > tokenize (String input ) throws IOException {
149
+ List <String > tokens = new ArrayList <>();
150
+ URLTokenizer tokenizer = new URLTokenizer (part );
151
+ tokenizer .setUrlDecode (urlDeocde );
152
+ tokenizer .setTokenizeHost (tokenizeHost );
153
+ tokenizer .setTokenizePath (tokenizePath );
154
+ tokenizer .setTokenizeQuery (tokenizeQuery );
155
+ tokenizer .setAllowMalformed (allowMalformed || passthrough );
156
+ tokenizer .setReader (new StringReader (input ));
157
+ tokenizer .reset ();
158
+ while (tokenizer .incrementToken ()) {
159
+ tokens .add (tokenizer .getAttribute (CharTermAttribute .class ).toString ());
160
+ }
161
+ return tokens ;
162
+ }
163
+
164
+
81
165
@ Override
82
166
public void reset () throws IOException {
83
167
super .reset ();
84
- parsed = false ;
168
+ tokens = null ;
169
+ iterator = null ;
85
170
}
86
171
87
172
private static final Pattern REGEX_PROTOCOL = Pattern .compile ("^([a-zA-Z]+)(?=://)" );
@@ -104,7 +189,7 @@ private String parseMalformed(String urlString) {
104
189
case WHOLE :
105
190
return urlString ;
106
191
default :
107
- return null ;
192
+ return urlString ;
108
193
}
109
194
}
110
195
0 commit comments