Merge pull request #108 from CDRH/feature/kw_normalize

wkdewey · web-flow · commit ea886a5d2541 · 2022-05-19T12:00:25.000-05:00
Feature/kw normalize
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,9 +2,12 @@
 
 All notable changes to Apium will be documented in this file.
 
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
-and this project adheres to [Semantic
-Versioning](https://semver.org/spec/v2.0.0.html).
+Starting from Apium v1.0.1, the format is based on [Keep a
+Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to
+[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+Please respect the 80-character text margin and follow the [GitHub Flavored
+Markdown Spec](https://github.github.com/gfm/).
 
 <!-- Template - Please preserve this order of sections
 ## [Unreleased] - Brief description
@@ -28,8 +31,30 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased] - updates in preparation for Habeas release
 [Unreleased]: https://github.com/CDRH/api/compare/v1.0.4...dev
 
+### Added
+- "api_version" added to all response "res" objects
+
 ### Changed
 - upgraded to Rails 6
+- Added support for aggregating buckets by normalized keyword and returning
+  the "top_hits" first document result for a non-normalized display
+- Changes response format of `facets` key
+  
+  From:
+  ```
+  "facets": {
+    "WILLA CATHER": 10,
+    "Willa Cather": 50
+  }
+  ```
+  To:
+  ```
+  "facets": {
+    "willa cather": { "num": 60, "source": "Willa Cather" }
+  }
+  ```
+  Not only is the response format itself different, but there may be fewer
+  facets returned since normalized values which match are combined.
 
 ## [v1.0.4](https://github.com/CDRH/api/compare/v1.0.3...v1.0.4) - Updates & license
 
@@ -38,7 +63,6 @@ Versioning](https://semver.org/spec/v2.0.0.html).
 license added
 
 ### Added
-
 - Documentation on facets and highlighting
 
 ## [v1.0.3](https://github.com/CDRH/api/compare/v1.0.2...v1.0.3) - gem updates
@@ -67,3 +91,8 @@ license added
 - version moved to initializer
 
 ## [v1.0.0](https://github.com/CDRH/api/tree/v1.0.0) - Initial Launch
+
+### Contributors
+
+- Jessica Dussault (jduss4)
+
diff --git a/README.md b/README.md
@@ -1,7 +1,8 @@
 # Apium
 
-Apium is an API to access all public Center for Digital Research in the Humanities resources. It is also an invasive weed in Nebraska.
+Apium is an API to access all public Center for Digital Research in the Humanities resources. It is also a genus of plants which includes celery, fool's water cress, and lesser marshwort.
 
 **[Apium Documentation](docs/README.md)**
+**[Changelog](CHANGELOG.md)**
 
 This project is licensed under the terms of the [MIT license](LICENSE.md).
diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
@@ -17,6 +17,7 @@ def display_error(error, req_body)
     render(status: 500, json: JSON.pretty_generate({
       "res" => {
         "code" => 500,
+        "api_version" => Api::Application::VERSION,
         "message" => "TODO",
         "info" => {
           "documentation" => "TODO",
diff --git a/app/controllers/collection_controller.rb b/app/controllers/collection_controller.rb
@@ -15,6 +15,7 @@ def show
         "query_string" => request.fullpath
       },
       "res" => {
+        "api_version" => Api::Application::VERSION,
         "code" => 200,
         "info" => {
           "collection" => {},
diff --git a/app/controllers/default_controller.rb b/app/controllers/default_controller.rb
@@ -26,7 +26,6 @@ def root
           "index_updated" => "TODO",
           "license" => METADATA["license"],
           "terms_of_service" => METADATA["terms_of_service"],
-          "version" => Api::Application::VERSION,
           # TODO should we be obtaining these from
           # Rails.application.routes or similar?
           "endpoints" => [
diff --git a/app/services/search_coll_res.rb b/app/services/search_coll_res.rb
@@ -16,14 +16,15 @@ def build_response
           "collection_name" => coll["key"],
           "description" => "TODO",
           "image_id" => "TODO",
+          "api_version" => Api::Application::VERSION,
           "uri" => "TODO",
           "collection" => coll["key"],
           "item_count" => coll["doc_count"],
           "endpoint" => route_paths.collection_path(coll["key"])
         }
       end
 
-      return {
+      {
         "code" => 200,
         "info" => {
           "count" => collections.length,
diff --git a/app/services/search_item_req.rb b/app/services/search_item_req.rb
@@ -117,26 +117,41 @@ def facets
                 "field" => f,
                 "order" => { type => dir },
                 "size" => size
+              },
+              "aggs" => {
+                "top_matches" => {
+                  "top_hits" => {
+                    "_source" => {
+                      "includes" => [ f ]
+                    },
+                    "size" => 1
+                  }
+                }
               }
             }
           }
         }
       else
         aggs[f] = {
           "terms" => {
-            # TODO if dataset is large, can implement partitions?
-            # "include" => {
-            #   "partition" => 0,
-            #   "num_partitions" => 10
-            # },
             "field" => f,
             "order" => { type => dir },
             "size" => size
+          },
+          "aggs" => {
+            "top_matches" => {
+              "top_hits" => {
+                "_source" => {
+                  "includes" => [ f ]
+                },
+                "size" => 1
+              }
+            }
           }
         }
       end
     end
-    return aggs
+    aggs
   end
 
   def filters
@@ -206,7 +221,7 @@ def filters
         filter_list << { "term" => { filter[0] => filter[1].gsub(/\r/, "") } }
       end
     end
-    return filter_list
+    filter_list
   end
 
   def highlights
@@ -228,7 +243,7 @@ def highlights
         end
       end
     end
-    return hl
+    hl
   end
 
   def sort
@@ -275,7 +290,7 @@ def sort
 
     end
 
-    return sort_obj
+    sort_obj
   end
 
   def source
@@ -285,7 +300,7 @@ def source
     criteria = {}
     criteria["includes"] = wlist if !wlist.empty?
     criteria["excludes"] = blist if !blist.empty?
-    return criteria
+    criteria
   end
 
   def text_search
@@ -309,7 +324,7 @@ def text_search
     else
       must = { "match_all" => {} }
     end
-    return must
+    must
   end
 
 end
diff --git a/app/services/search_item_res.rb b/app/services/search_item_res.rb
@@ -4,6 +4,7 @@ class SearchItemRes
 
   @@count = ["hits", "total"]
   @@facets = ["aggregations"]
+  @@facets_label = ["top_matches", "hits", "hits", "_source"]
   @@item = ["hits", "hits", 0, "_source"]
   @@items = ["hits", "hits"]
 
@@ -18,9 +19,10 @@ def build_response
     items = combine_highlights
     facets = reformat_facets
 
-    return {
+    {
       "code" => 200,
       "count" => count,
+      "api_version" => Api::Application::VERSION,
       "facets" => facets,
       "items" => items,
     }
@@ -29,45 +31,83 @@ def build_response
   def combine_highlights
     hits = @body.dig(*@@items)
     if hits
-      return hits.map do |hit|
+      hits.map do |hit|
         hit["_source"]["highlight"] = hit["highlight"] || {}
         hit["_source"]
       end
     else
-      return []
+      []
     end
   end
 
+  def find_source_from_top_hits(top_hits, field, key)
+    # elasticsearch stores nested source results without the "path"
+    nested_child = field.split(".").last
+    hit = top_hits.first.dig("_source", nested_child)
+    # if this is a multivalued field (for example: works or places),
+    # ALL of the values come back as the source, but we only want
+    # the single value from which the key was derived
+    if hit.class == Array
+      # I don't love this, because we will have to match exactly the logic
+      # that got us the key to get this to work
+      match_index = hit
+        .map { |s| remove_nonword_chars(s) }
+        .index(remove_nonword_chars(key))
+      # if nothing matches the original key, return the entire source hit
+      # should return a string, regardless
+      return match_index ? hit[match_index] : hit.join(" ")
+    else
+      # it must be single-valued and therefore we are good to go
+      return hit
+    end
+  end
+
+  def format_bucket_value(facets, field, bucket)
+    # dates return in wonktastic ways, so grab key_as_string instead of gibberish number
+    # but otherwise just grab the key if key_as_string unavailable
+    key = bucket.key?("key_as_string") ? bucket["key_as_string"] : bucket["key"]
+    val = bucket["doc_count"]
+    source = key
+    # top_matches is a top_hits aggregation which returns a list of terms
+    # which were used for the facet.
+    #   Example: "Willa Cather" and "WILLA CATHER"
+    # Those terms will both have been normalized as "willa cather" but
+    # we will want to display one of the non-normalized terms instead
+    top_hits = bucket.dig("top_matches", "hits", "hits")
+    if top_hits
+      source = find_source_from_top_hits(top_hits, field, key)
+    end
+    facets[field][key] = {
+      "num" => val,
+      "source" => source
+    }
+  end
+
   def reformat_facets
-    facets = @body.dig(*@@facets)
-    if facets
-      formatted = {}
-      facets.each do |field, info|
-        formatted[field] = {}
-        buckets = {}
-        # nested fields do not have buckets
-        # at this level in the response structure
-        if info.has_key?("buckets")
-          buckets = info["buckets"]
-        else
-          buckets = info.dig(field, "buckets")
-        end
+    raw_facets = @body.dig(*@@facets)
+    if raw_facets
+      facets = {}
+      raw_facets.each do |field, info|
+        facets[field] = {}
+        # nested fields do not have buckets at this level of response structure
+        buckets = info.key?("buckets") ? info["buckets"] : info.dig(field, "buckets")
         if buckets
-          buckets.each do |b|
-            # dates return in wonktastic ways, so grab key_as_string instead of gibberish number
-            # but otherwise just grab the key if key_as_string unavailable
-            key = b.has_key?("key_as_string") ? b["key_as_string"] : b["key"]
-            val = b["doc_count"]
-            formatted[field][key] = val
-          end
+          buckets.each { |b| format_bucket_value(facets, field, b) }
         else
-          formatted[field] = {}
+          facets[field] = {}
         end
       end
-      return formatted
+      facets
     else
-      return {}
+      {}
     end
   end
 
+  def remove_nonword_chars(term)
+    # transliterate to ascii (ø -> o)
+    transliterated = I18n.transliterate(term)
+    # remove html tags like em, u, and strong, then strip remaining non-alpha characters
+    transliterated.gsub(/<\/?(?:em|strong|u)>|\W/, "").downcase
+  end
+
 end
diff --git a/app/services/search_service.rb b/app/services/search_service.rb
@@ -12,9 +12,9 @@ def initialize(url, params={}, user_req)
 
   def post(url_ending, json)
     res = RestClient.post("#{@url}/#{url_ending}", json.to_json, { "content-type" => "json" } )
-    return JSON.parse(res.body)
+    JSON.parse(res.body)
   rescue => e
-    return e
+    e
   end
 
   def search_collections
@@ -108,7 +108,7 @@ def on_success(req, res)
     if @params["debug"].present?
       json["req"]["query_obj"] = req
     end
-    return json
+    json
   end
 
   def build_collections_response(res)
diff --git a/test/services/search_item_req_test.rb b/test/services/search_item_req_test.rb
diff --git a/test/services/search_item_res_test.rb b/test/services/search_item_res_test.rb