
Commit 648b4e1 (1 parent: 52c83fc)

add exact phrase match and LFU cache to Patricia decompounder, thanks to GBI Genios

File tree

18 files changed: +1214 −24 lines


CREDITS.txt

+5
@@ -26,3 +26,8 @@ The FSA in package org.xbib.elasticsearch.common.fsa which provides the dictionary for
 the baseform tokenizer is a derived version of

 https://github.com/morfologik/morfologik-stemming/tree/master/morfologik-fsa/src/main/java/morfologik/fsa
+
+Thanks to GBI-Genios Deutsche Wirtschaftsdatenbank GmbH for adding the caching functionality and the "exact phrase matches" feature.
+The implementation of an exact phrase match query can ignore/skip decompounded tokens while matching phrases.
+The LFU cache for the Patricia decompounder was inspired by the use of a ConcurrentHashMap cache
+in the original pull request: https://github.com/jprante/elasticsearch-analysis-decompound/pull/54/

README.adoc

+114-6
@@ -32,6 +32,7 @@ The plugin code in each plugin is equivalent to the code in this combined bundle
 [frame="all"]
 |===
 | Plugin version | Elasticsearch version | Release date
+| 6.3.2.2 | 6.3.2 | Oct 2, 2018
 | 5.4.1.0 | 5.4.0 | Jun 1, 2017
 | 5.4.0.1 | 5.4.0 | May 12, 2017
 | 5.4.0.0 | 5.4.0 | May 4, 2017
@@ -614,7 +615,26 @@ GET _analyze
 }
 }
-# Example
+# Decompound
+
+This is an implementation of a word decompounder plugin for link:http://github.com/elasticsearch/elasticsearch[Elasticsearch].
+
+Compounding several words into one word is a property not all languages share.
+Compounding is used in German, the Scandinavian languages, Finnish, and Korean.
+
+This code is a reworked implementation of the
+link:http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/Baseforms%20Tool.htm[Baseforms Tool]
+found in the http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/index.htm[ASV toolbox]
+of http://asv.informatik.uni-leipzig.de/staff/Chris_Biemann[Chris Biemann],
+Automatische Sprachverarbeitung group of Leipzig University.
+
+Lucene comes with two compound word token filters, a dictionary-based and a hyphenation-based variant.
+Both of them have a disadvantage: they require loading a word list into memory before they run.
+This decompounder does not require word lists; it can process German-language text out of the box.
+The decompounder uses prebuilt _Compact Patricia Tries_ for efficient word segmentation, provided
+by the ASV toolbox.
+
+## Decompound examples

 Try it out
 ----
@@ -630,7 +650,7 @@ GET _analyze
 }
 ----
-In the mapping, us a token filter of type "decompound"::
+In the mapping, use a token filter of type "decompound"::

 {
 "index":{
@@ -678,7 +698,7 @@ Also the Lucene German normalization token filter is provided::
 The input "Ein schöner Tag in Köln im Café an der Straßenecke" will be tokenized into
 "Ein", "schoner", "Tag", "in", "Koln", "im", "Café", "an", "der", "Strassenecke".

-# Threshold
+## Threshold

 The decomposing algorithm uses a threshold to decide whether a word has been decomposed successfully.
 If the threshold is too low, words could silently disappear from being indexed. In this case, you have to adapt the
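
The threshold is set on the decompound token filter in the index settings. A minimal fragment might look like this (a sketch; the filter name `decomp` is illustrative, `threshold` is the parameter described here):

```
{
  "index": {
    "analysis": {
      "filter": {
        "decomp": {
          "type": "decompound",
          "threshold": 0.6
        }
      }
    }
  }
}
```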
@@ -705,7 +725,7 @@ The default threshold value is 0.51. You can modify it in the settings::
 }
 }

-# Subwords
+## Subwords

 Sometimes only the decomposed subwords should be indexed. For this, you can use the parameter `"subwords_only": true`
@@ -729,7 +749,95 @@ Sometimes only the decomposed subwords should be indexed. For this, you can use
 }


-## Langdetect
+## Caching
+
+The time consumed by the decompound computation may increase your overall indexing time drastically when applied
+to billions of tokens. You can configure a least-frequently-used (LFU) cache that maps a token to its decompounded
+tokens with the following settings:
+
+`use_cache: true` - enables caching
+`cache_size` - sets the cache size, default: 100000
+`cache_eviction_factor` - sets the cache eviction factor, valid values are between 0.00 and 1.00, default: 0.90
+
+```
+{
+  "settings": {
+    "index": {
+      "number_of_shards": 1,
+      "number_of_replicas": 0,
+      "analysis": {
+        "filter": {
+          "decomp": {
+            "type": "decompound",
+            "use_payload": true,
+            "use_cache": true
+          }
+        },
+        "analyzer": {
+          "decomp": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+              "decomp",
+              "lowercase"
+            ]
+          },
+          "lowercase": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+              "lowercase"
+            ]
+          }
+        }
+      }
+    }
+  },
+  "mappings": {
+    "_doc": {
+      "properties": {
+        "text": {
+          "type": "text",
+          "analyzer": "decomp",
+          "search_analyzer": "lowercase"
+        }
+      }
+    }
+  }
+}
+```
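
The LFU eviction behavior can be sketched in a few lines (an illustrative Python model, not the plugin's Java implementation; it assumes `cache_eviction_factor` gives the fraction of entries kept after an eviction pass):

```python
from collections import Counter

class LfuCache:
    """Illustrative LFU cache: when full, keeps only the
    eviction_factor * size most frequently used entries."""

    def __init__(self, size=100_000, eviction_factor=0.90):
        self.size = size
        self.keep = max(1, int(size * eviction_factor))  # entries surviving an eviction pass
        self.data = {}          # token -> decompounded tokens
        self.freq = Counter()   # token -> access count

    def get(self, token, compute):
        if token in self.data:
            self.freq[token] += 1
            return self.data[token]
        value = compute(token)  # the expensive decompound computation
        if len(self.data) >= self.size:
            self._evict()
        self.data[token] = value
        self.freq[token] = 1
        return value

    def _evict(self):
        # drop everything except the most frequently used entries
        survivors = dict(self.freq.most_common(self.keep))
        self.data = {k: v for k, v in self.data.items() if k in survivors}
        self.freq = Counter(survivors)
```

Since natural-language corpora repeat most tokens heavily, a cache hit skips the Patricia-trie segmentation entirely, which is where the indexing-time savings come from.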
+
+## Exact phrase matches
+
+The usage of decompounds can lead to undesired results in phrase queries.
+After indexing, decompounded tokens cannot be distinguished from original tokens.
+The outcome of a phrase query "Deutsche Bank" could be `Deutsche Spielbankgesellschaft`,
+which is clearly an unexpected result. To enable "exact" phrase queries, each decompounded token is
+tagged with additional payload data.
+
+To evaluate this payload data, you can use `exact_phrase` as a wrapper around a query
+containing your phrase queries.
+
+`use_payload` - if set to true, enables payload creation. Default: false
+
+```
+{
+  "query": {
+    "exact_phrase": {
+      "query": {
+        "query_string": {
+          "query": "\"deutsche bank\"",
+          "fields": [
+            "message"
+          ]
+        }
+      }
+    }
+  }
+}
+```
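
The effect of the payload tagging can be modeled in a few lines (an illustrative Python model, not the plugin's Lucene implementation; integer positions and an `is_decompound` flag stand in for Lucene token positions and payloads):

```python
def phrase_match(tokens, phrase, use_payload=False):
    """tokens: (term, position, is_decompound) triples in index order;
    phrase: list of terms. With use_payload=True, decompounded tokens
    are ignored, mimicking the exact_phrase wrapper."""
    positions = {}
    for term, pos, is_decompound in tokens:
        if use_payload and is_decompound:
            continue  # skip tokens tagged as decompound products
        positions.setdefault(term, set()).add(pos)
    # anchor the phrase at every position of its first term
    for start in positions.get(phrase[0], ()):
        if all(start + i in positions.get(term, set())
               for i, term in enumerate(phrase)):
            return True
    return False

# "Deutsche Spielbankgesellschaft": the decompounder emits the parts
# at the same position as the original compound token
tokens = [
    ("deutsche", 0, False),
    ("spielbankgesellschaft", 1, False),
    ("spiel", 1, True),
    ("bank", 1, True),
    ("gesellschaft", 1, True),
]
```

Without the payload check, "deutsche" at position 0 followed by the decompounded "bank" at position 1 produces exactly the false positive described above; with it, only original tokens can satisfy the phrase.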
+
+# Langdetect

 curl -XDELETE 'localhost:9200/test'

@@ -797,7 +905,7 @@ Sometimes only the decomposed subwords should be indexed. For this, you can use
 }
 '

-## Standardnumber
+# Standardnumber

 Try it out
 ----

build.gradle

+7-3
@@ -72,9 +72,13 @@ test {
 exceptionFormat = 'full'
 }
 }
-randomizedTest.enabled = false
-esTest.enabled = true
-esTest.dependsOn jar
+randomizedTest {
+  enabled = false
+}
+esTest {
+  dependsOn jar
+  enabled = true
+}

 clean {
 delete fileTree('.') { include '.local*.log' }

gradle.properties

+1-1
@@ -1,6 +1,6 @@
 group = org.xbib.elasticsearch.plugin
 name = elasticsearch-plugin-bundle
-version = 6.3.2.1
+version = 6.3.2.2

 xbib-elasticsearch-test.version = 6.3.2.1
 elasticsearch.version = 6.3.2

gradle/wrapper/gradle-wrapper.jar

1.72 KB
Binary file not shown.
gradle/wrapper/gradle-wrapper.properties

+2 −2

@@ -1,6 +1,6 @@
-#Tue Jul 17 20:18:04 CEST 2018
+#Mon Oct 01 19:05:53 CEST 2018
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.8.1-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip

settings.gradle

+2-1
@@ -1 +1,2 @@
-rootProject.name = name
+rootProject.name = name
+enableFeaturePreview('STABLE_PUBLISHING')
