From cd347cbf7683875bb8322732fc4ddcb2ea861a8c Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 3 Jan 2020 13:22:22 -0600 Subject: [PATCH 01/12] adding support for filters with normalized values BREAKING CHANGE We need to support keyword sorting with normalization such as - case insensitivity - accents converted to ascii - characters like quotation marks, html tags stripped out In order to do that, we will normalize the keyword fields on ingest but the stored field displays as sent. Example: sending "Willa Cather" will result in "willa cather" normalized version but that's not what we want to display to users Therefore, the API needs to return the filter / aggregate used and the number of results AS WELL as the "top" original version There could be many keywords combined into a specific filter which is why we are relying on elasticsearch to determine the top hit and return it for display The breaking change is that instead of facets being a list of fields with keyword and number, now we have a list of fields with keyword listing the number and top source (non-normalized version) --- app/services/search_item_req.rb | 25 ++++++++++++--- app/services/search_item_res.rb | 57 ++++++++++++++++++++------------- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/app/services/search_item_req.rb b/app/services/search_item_req.rb index 117462d..e260263 100644 --- a/app/services/search_item_req.rb +++ b/app/services/search_item_req.rb @@ -117,6 +117,16 @@ def facets "field" => f, "order" => { type => dir }, "size" => size + }, + "aggs" => { + "top_matches" => { + "top_hits" => { + "_source" => { + "includes" => [ f ] + }, + "size" => 1 + } + } } } } @@ -124,14 +134,19 @@ def facets else aggs[f] = { "terms" => { - # TODO if dataset is large, can implement partitions? 
- # "include" => { - # "partition" => 0, - # "num_partitions" => 10 - # }, "field" => f, "order" => { type => dir }, "size" => size + }, + "aggs" => { + "top_matches" => { + "top_hits" => { + "_source" => { + "includes" => [ f ] + }, + "size" => 1 + } + } } } end diff --git a/app/services/search_item_res.rb b/app/services/search_item_res.rb index 4a2c5a7..25b2830 100644 --- a/app/services/search_item_res.rb +++ b/app/services/search_item_res.rb @@ -4,6 +4,7 @@ class SearchItemRes @@count = ["hits", "total"] @@facets = ["aggregations"] + @@facets_label = ["top_matches", "hits", "hits", "_source"] @@item = ["hits", "hits", 0, "_source"] @@items = ["hits", "hits"] @@ -38,33 +39,45 @@ def combine_highlights end end + def format_bucket_value(facets, field, bucket) + # dates return in wonktastic ways, so grab key_as_string instead of gibberish number + # but otherwise just grab the key if key_as_string unavailable + key = bucket.key?("key_as_string") ? bucket["key_as_string"] : bucket["key"] + val = bucket["doc_count"] + source = key + # top_matches is a top_hits aggregation which returns a list of terms + # which were used for the facet. 
+ # Example: "Willa Cather" and "WILLA CATHER" + # Those terms will both have been normalized as "willa cather" but + # we will want to display one of the non-normalized terms instead + matches = bucket.dig("top_matches", "hits", "hits") + if matches + # elasticsearch stores nested source results without the "path" + no_nesting = field.split(".").last + source = matches.first.dig("_source", no_nesting) + end + facets[field][key] = { + "num" => val, + "source" => source + } + end + def reformat_facets - facets = @body.dig(*@@facets) - if facets - formatted = {} - facets.each do |field, info| - formatted[field] = {} - buckets = {} - # nested fields do not have buckets - # at this level in the response structure - if info.has_key?("buckets") - buckets = info["buckets"] - else - buckets = info.dig(field, "buckets") - end + raw_facets = @body.dig(*@@facets) + if raw_facets + facets = {} + raw_facets.each do |field, info| + facets[field] = {} + # nested fields do not have buckets at this level of response structure + buckets = info.key?("buckets") ? info["buckets"] : info.dig(field, "buckets") + if buckets - buckets.each do |b| - # dates return in wonktastic ways, so grab key_as_string instead of gibberish number - # but otherwise just grab the key if key_as_string unavailable - key = b.has_key?("key_as_string") ? 
b["key_as_string"] : b["key"] - val = b["doc_count"] - formatted[field][key] = val - end + buckets.each { |b| format_bucket_value(facets, field, b) } else - formatted[field] = {} + facets[field] = {} end end - return formatted + return facets else return {} end From 668990407a0dc0d7eccc00b6801e924e31ae6300 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 3 Jan 2020 13:26:23 -0600 Subject: [PATCH 02/12] removes unnecessary returns --- app/services/search_coll_res.rb | 2 +- app/services/search_item_req.rb | 12 ++++++------ app/services/search_item_res.rb | 10 +++++----- app/services/search_service.rb | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/app/services/search_coll_res.rb b/app/services/search_coll_res.rb index 1958f4b..342f472 100644 --- a/app/services/search_coll_res.rb +++ b/app/services/search_coll_res.rb @@ -23,7 +23,7 @@ def build_response } end - return { + { "code" => 200, "info" => { "count" => collections.length, diff --git a/app/services/search_item_req.rb b/app/services/search_item_req.rb index e260263..e9260c0 100644 --- a/app/services/search_item_req.rb +++ b/app/services/search_item_req.rb @@ -151,7 +151,7 @@ def facets } end end - return aggs + aggs end def filters @@ -221,7 +221,7 @@ def filters filter_list << { "term" => { filter[0] => filter[1].gsub(/\r/, "") } } end end - return filter_list + filter_list end def highlights @@ -243,7 +243,7 @@ def highlights end end end - return hl + hl end def sort @@ -290,7 +290,7 @@ def sort end - return sort_obj + sort_obj end def source @@ -300,7 +300,7 @@ def source criteria = {} criteria["includes"] = wlist if !wlist.empty? criteria["excludes"] = blist if !blist.empty? 
- return criteria + criteria end def text_search @@ -324,7 +324,7 @@ def text_search else must = { "match_all" => {} } end - return must + must end end diff --git a/app/services/search_item_res.rb b/app/services/search_item_res.rb index 25b2830..f30f61f 100644 --- a/app/services/search_item_res.rb +++ b/app/services/search_item_res.rb @@ -19,7 +19,7 @@ def build_response items = combine_highlights facets = reformat_facets - return { + { "code" => 200, "count" => count, "facets" => facets, @@ -30,12 +30,12 @@ def build_response def combine_highlights hits = @body.dig(*@@items) if hits - return hits.map do |hit| + hits.map do |hit| hit["_source"]["highlight"] = hit["highlight"] || {} hit["_source"] end else - return [] + [] end end @@ -77,9 +77,9 @@ def reformat_facets facets[field] = {} end end - return facets + facets else - return {} + {} end end diff --git a/app/services/search_service.rb b/app/services/search_service.rb index 8724644..dbd8877 100644 --- a/app/services/search_service.rb +++ b/app/services/search_service.rb @@ -12,9 +12,9 @@ def initialize(url, params={}, user_req) def post(url_ending, json) res = RestClient.post("#{@url}/#{url_ending}", json.to_json, { "content-type" => "json" } ) - return JSON.parse(res.body) + JSON.parse(res.body) rescue => e - return e + e end def search_collections @@ -108,7 +108,7 @@ def on_success(req, res) if @params["debug"].present? 
json["req"]["query_obj"] = req end - return json + json end def build_collections_response(res) From 41765ebf12b8d1671029d64b4e669d64cfd77cab Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Tue, 7 Jan 2020 14:16:43 -0600 Subject: [PATCH 03/12] adds version number to responses as api_version --- app/controllers/application_controller.rb | 1 + app/controllers/collection_controller.rb | 1 + app/controllers/default_controller.rb | 2 +- app/services/search_coll_res.rb | 1 + app/services/search_item_res.rb | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 0404329..15ff0d8 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -17,6 +17,7 @@ def display_error(error, req_body) render(status: 500, json: JSON.pretty_generate({ "res" => { "code" => 500, + "api_version" => Api::Application::VERSION, "message" => "TODO", "info" => { "documentation" => "TODO", diff --git a/app/controllers/collection_controller.rb b/app/controllers/collection_controller.rb index 154caf8..cd1f9b9 100644 --- a/app/controllers/collection_controller.rb +++ b/app/controllers/collection_controller.rb @@ -15,6 +15,7 @@ def show "query_string" => request.fullpath }, "res" => { + "api_version" => Api::Application::VERSION, "code" => 200, "info" => { "collection" => {}, diff --git a/app/controllers/default_controller.rb b/app/controllers/default_controller.rb index c217c06..601fca1 100644 --- a/app/controllers/default_controller.rb +++ b/app/controllers/default_controller.rb @@ -18,6 +18,7 @@ def root }, "res" => { "code" => 200, + "api_version" => Api::Application::VERSION, "info" => { "api_updated" => METADATA["api_updated"], "contact" => METADATA["cdrh@unl.edu"], @@ -26,7 +27,6 @@ def root "index_updated" => "TODO", "license" => METADATA["license"], "terms_of_service" => METADATA["terms_of_service"], - "version" => 
Api::Application::VERSION, # TODO should we be obtaining these from # Rails.application.routes or similar? "endpoints" => [ diff --git a/app/services/search_coll_res.rb b/app/services/search_coll_res.rb index 342f472..6e44f60 100644 --- a/app/services/search_coll_res.rb +++ b/app/services/search_coll_res.rb @@ -16,6 +16,7 @@ def build_response "collection_name" => coll["key"], "description" => "TODO", "image_id" => "TODO", + "api_version" => Api::Application::VERSION, "uri" => "TODO", "collection" => coll["key"], "item_count" => coll["doc_count"], diff --git a/app/services/search_item_res.rb b/app/services/search_item_res.rb index f30f61f..d37df03 100644 --- a/app/services/search_item_res.rb +++ b/app/services/search_item_res.rb @@ -22,6 +22,7 @@ def build_response { "code" => 200, "count" => count, + "api_version" => Api::Application::VERSION, "facets" => facets, "items" => items, } From fec8509cf02b7467be4ffa7406de538663172c1a Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Tue, 7 Jan 2020 15:53:32 -0600 Subject: [PATCH 04/12] add to changelog file --- CHANGELOG.md | 39 ++++++++++++++++++++++++++++++++++----- README.md | 3 ++- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38f4e61..b2f1dc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,16 @@ All notable changes to Apium will be documented in this file. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic -Versioning](https://semver.org/spec/v2.0.0.html). +Starting from Open ONI v0.11, The format is based on [Keep a +Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to +[Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +Please respect the 80-character text margin and follow the [GitHub Flavored +Markdown Spec](https://github.github.com/gfm/).