From 8535b30cd3347e56d9b28255f4806f604ff668db Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Mon, 5 Dec 2022 10:03:06 +0100 Subject: [PATCH 01/62] remove forgot variables --- bin/ncbo_ontology_pull | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bin/ncbo_ontology_pull b/bin/ncbo_ontology_pull index 131ef543..a017e4d7 100755 --- a/bin/ncbo_ontology_pull +++ b/bin/ncbo_ontology_pull @@ -33,11 +33,7 @@ logger.info "Starting ncbo pull"; logger.flush puller = NcboCron::Models::OntologyPull.new begin puller.do_ontology_pull(ontology_acronym, logger: logger , enable_pull_umls:true ) -rescue NcboCron::Models::OntologyPull::RemoteFileException => e - logger.error "RemoteFileException: No submission file at pull location #{last.pullLocation.to_s} for ontology #{ont.acronym}." - logger.flush rescue StandardError => e - e.backtrace logger.error e.message logger.flush end From f7aa1beafbe13b31d0bd6fe62f88b77e2dbf5866 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Wed, 21 Dec 2022 16:38:55 -0800 Subject: [PATCH 02/62] fix for #61 - create contact instance if it doesn't exist - changed --from-api to --from-apikey - minor linting --- bin/ncbo_ontology_import | 50 ++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/bin/ncbo_ontology_import b/bin/ncbo_ontology_import index db2e90c5..3dfb7d57 100755 --- a/bin/ncbo_ontology_import +++ b/bin/ncbo_ontology_import @@ -20,11 +20,11 @@ require 'net/http' require 'optparse' ontologies_acronyms = '' ontology_source = '' -source_api = '' +source_apikey = '' username = '' opt_parser = OptionParser.new do |opts| opts.banner = 'Usage: ncbo_ontology_import [options]' - opts.on('-o', '--ontology ACRONYM', 'Ontologies acronyms which we want to import (separated by comma)') do |acronym| + opts.on('-o', '--ontologies ACRONYM1,ACRONYM2', 'Comma-separated list of ontologies to import') do |acronym| ontologies_acronyms = acronym end @@ -32,8 +32,8 @@ opt_parser = OptionParser.new do 
|opts| ontology_source = url.to_s end - opts.on('--from-api api', 'An apikey to acces the ontoportal api') do |api| - source_api = api.to_s + opts.on('--from-apikey apikey', 'An apikey to acces the ontoportal api') do |apikey| + source_apikey = apikey.to_s end opts.on('--admin-user username', 'The target admin user that will submit the ontology') do |user| @@ -46,11 +46,14 @@ opt_parser = OptionParser.new do |opts| end end opt_parser.parse! +if ontologies_acronyms.empty? + puts opts + exit(1) +end # URL of the API and APIKEY of the Ontoportal we want to import data FROM -SOURCE_API = ontology_source -SOURCE_APIKEY = source_api - +SOURCE_API = ontology_source +SOURCE_APIKEY = source_apikey # The username of the user that will have the administration rights on the ontology on the target portal TARGETED_PORTAL_USER = username @@ -58,17 +61,15 @@ TARGETED_PORTAL_USER = username # The list of acronyms of ontologies to import ONTOLOGIES_TO_IMPORT = ontologies_acronyms.split(',') || [] - def get_user(username) user = LinkedData::Models::User.find(username).first raise "The user #{username} does not exist" if user.nil? + user.bring_remaining end - # A function to create a new ontology (if already Acronym already existing on the portal it will return HTTPConflict) def create_ontology(ont_info) - new_ontology = LinkedData::Models::Ontology.new new_ontology.acronym = ont_info['acronym'] @@ -97,23 +98,29 @@ def upload_submission(sub_info, ontology) # Build the json body # hasOntologyLanguage options: OWL, UMLS, SKOS, OBO # status: alpha, beta, production, retired - attr_to_reject = %w[id submissionStatus hasOntologyLanguage metrics ontology @id @type contact] - to_copy = sub_info.select do |k,v| + attr_to_reject = %w[id submissionStatus hasOntologyLanguage metrics ontology @id @type contact uploadFilePath diffFilePath] + to_copy = sub_info.select do |k, v| !v.nil? && !v.is_a?(Hash) && !v.to_s.empty? 
&& !attr_to_reject.include?(k) end to_copy["ontology"] = ontology - to_copy["contact"] = [LinkedData::Models::Contact.where(email: USER.email).first] - to_copy["hasOntologyLanguage"] = LinkedData::Models::OntologyFormat.where(acronym: sub_info["hasOntologyLanguage"]).first + + unless LinkedData::Models::Contact.where(email: USER.email).to_a[0] + LinkedData::Models::Contact.new(name: USER.username, email: USER.email).save + puts "created a new contact; name: #{USER.username}, email: #{USER.email}" + end + + to_copy["contact"] = [LinkedData::Models::Contact.where(email: USER.email).first] + to_copy["hasOntologyLanguage"] = LinkedData::Models::OntologyFormat.where(acronym: sub_info["hasOntologyLanguage"]).first to_copy.each do |key, value| attribute_settings = new_submission.class.attribute_settings(key.to_sym) if attribute_settings - if attribute_settings[:enforce]&.include?(:date_time) + if attribute_settings[:enforce]&.include?(:date_time) value = DateTime.parse(value) elsif attribute_settings[:enforce]&.include?(:uri) && attribute_settings[:enforce]&.include?(:list) value = value.map { |v| RDF::IRI.new(v) } - elsif attribute_settings[:enforce]&.include?(:uri) + elsif attribute_settings[:enforce]&.include?(:uri) value = RDF::IRI.new(value) end end @@ -126,10 +133,10 @@ end USER = get_user username -#get apikey for admin user +# get apikey for admin user TARGET_APIKEY = USER.apikey -SOURCE_APIKEY == '' && abort('--from-api has to be set') +SOURCE_APIKEY == '' && abort('--from-apikey has to be set') SOURCE_API == '' && abort('--from has to be set') def result_log(ressource, errors) @@ -143,10 +150,11 @@ end # Go through all ontologies acronym and get their latest_submission informations ONTOLOGIES_TO_IMPORT.each do |ont| sub_info = JSON.parse(Net::HTTP.get(URI.parse("#{SOURCE_API}/ontologies/#{ont}/latest_submission?apikey=#{SOURCE_APIKEY}&display=all"))) - puts "Import #{ont} " , + puts "Import #{ont} ", "From #{SOURCE_API}" # if the ontology is already created then 
it will return HTTPConflict, no consequences raise "The ontology #{ont} does not exist" if sub_info['ontology'].nil? + new_ontology = create_ontology(sub_info['ontology']) errors = nil if new_ontology.valid? @@ -159,6 +167,7 @@ ONTOLOGIES_TO_IMPORT.each do |ont| new_ontology ||= LinkedData::Models::Ontology.where(acronym: ont).first new_submission = upload_submission(sub_info, new_ontology) + if new_submission.valid? new_submission.save errors = nil @@ -167,6 +176,3 @@ ONTOLOGIES_TO_IMPORT.each do |ont| end result_log(sub_info["id"], errors) end - - - From f03b2aa3710bb54f7676df5085ccd95168033970 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Mon, 9 Jan 2023 11:05:52 -0800 Subject: [PATCH 03/62] Restore branch specifier to develop --- Gemfile | 8 ++++---- Gemfile.lock | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Gemfile b/Gemfile index 747f3b5d..c552979f 100644 --- a/Gemfile +++ b/Gemfile @@ -21,10 +21,10 @@ gem 'sys-proctable' gem 'cube-ruby', require: 'cube' # NCBO -gem 'goo', github: 'ncbo/goo', branch: 'master' -gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'master' -gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'master' -gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'master' +gem 'goo', github: 'ncbo/goo', branch: 'develop' +gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'develop' +gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'develop' +gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'develop' group :test do gem 'email_spec' diff --git a/Gemfile.lock b/Gemfile.lock index db2885f2..14c9d7f3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 6fcab82f5abbaf08a4a0104d00ada0b40a322a31 - branch: master + revision: 9a5d60d6937658dbb62690dcb33039ac03fb7556 + branch: develop specs: goo (0.0.2) addressable (~> 2.8) @@ -15,8 +15,8 @@ GIT GIT remote: 
https://github.com/ncbo/ncbo_annotator.git - revision: 71d41e3afb35dafe29abfb6d9becaadc725bad36 - branch: master + revision: 75d1a8f302c8af4060bf62fc619f230084da521e + branch: develop specs: ncbo_annotator (0.0.1) goo @@ -26,8 +26,8 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 4f9139d870c3b1771af1127afa17b679bd0f60dc - branch: master + revision: 8698363f8d20ac715d8c96016197d70f2fe8f4b1 + branch: develop specs: ontologies_linked_data (0.0.1) activesupport @@ -47,7 +47,7 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 - branch: master + branch: develop specs: sparql-client (1.0.1) json_pure (>= 1.4) From e383b54e8bba2ca15dbe25563d8f0b341d54dd5d Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Sat, 14 Jan 2023 00:00:58 -0800 Subject: [PATCH 04/62] Optimization - remove repeated query --- bin/ncbo_ontology_import | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/ncbo_ontology_import b/bin/ncbo_ontology_import index 3dfb7d57..09b5a0a7 100755 --- a/bin/ncbo_ontology_import +++ b/bin/ncbo_ontology_import @@ -104,12 +104,13 @@ def upload_submission(sub_info, ontology) end to_copy["ontology"] = ontology - unless LinkedData::Models::Contact.where(email: USER.email).to_a[0] - LinkedData::Models::Contact.new(name: USER.username, email: USER.email).save + contact = LinkedData::Models::Contact.where(email: USER.email).first + unless contact + contact = LinkedData::Models::Contact.new(name: USER.username, email: USER.email).save puts "created a new contact; name: #{USER.username}, email: #{USER.email}" end - to_copy["contact"] = [LinkedData::Models::Contact.where(email: USER.email).first] + to_copy["contact"] = [contact] to_copy["hasOntologyLanguage"] = LinkedData::Models::OntologyFormat.where(acronym: sub_info["hasOntologyLanguage"]).first to_copy.each do |key, value| From a0297f2f48bcd44651dd73aad91bf60bf2719ee3 Mon Sep 17 00:00:00 2001 From: mdorf 
Date: Mon, 30 Jan 2023 16:09:49 -0800 Subject: [PATCH 05/62] Gemfile.lock update --- Gemfile.lock | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 221f0abc..adededb9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 091e0ca001244a7dbaed0644b0b1087a3a24d16a + revision: 15023141f6051d4fa6cba6081d082c720327b0c9 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: bad8c646205014b1fc82793fb9c07d7075b4c068 + revision: 75d1a8f302c8af4060bf62fc619f230084da521e branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: a00dac9a0825697cb7f6ac5e0f4db6ab8f4bae79 + revision: d7425e581d35b265a0263faba7c32c0fdb51cbba branch: develop specs: ontologies_linked_data (0.0.1) @@ -79,7 +79,7 @@ GEM bcrypt (3.1.18) builder (3.2.4) coderay (1.1.3) - concurrent-ruby (1.1.10) + concurrent-ruby (1.2.0) connection_pool (2.3.0) cube-ruby (0.0.3) dante (0.2.0) @@ -90,7 +90,7 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (1.10.2) + faraday (1.10.3) faraday-em_http (~> 1.0) faraday-em_synchrony (~> 1.0) faraday-excon (~> 1.1) @@ -116,7 +116,7 @@ GEM ffi (1.15.5) google-apis-analytics_v3 (0.12.0) google-apis-core (>= 0.9.1, < 2.a) - google-apis-core (0.9.1) + google-apis-core (0.10.0) addressable (~> 2.5, >= 2.5.1) googleauth (>= 0.16.2, < 2.a) httpclient (>= 2.8.1, < 3.a) @@ -139,13 +139,13 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.2) - json_pure (2.6.2) - jwt (2.5.0) - launchy (2.5.0) - addressable (~> 2.7) - libxml-ruby (3.2.4) - logger (1.5.1) + json (2.6.3) + json_pure (2.6.3) + jwt (2.6.0) + launchy (2.5.2) + addressable (~> 2.8) + libxml-ruby (4.0.0) + logger (1.5.3) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) @@ 
-160,7 +160,7 @@ GEM mlanett-redis-lock (0.2.7) redis multi_json (1.15.0) - multipart-post (2.2.3) + multipart-post (2.3.0) net-http-persistent (2.9.4) netrc (0.11.0) oj (2.18.5) @@ -170,19 +170,19 @@ GEM parseconfig (1.1.2) pony (1.13.1) mail (>= 2.0) - pry (0.14.1) + pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - public_suffix (5.0.0) - rack (3.0.0) + public_suffix (5.0.1) + rack (3.0.4.1) rack-test (2.0.2) rack (>= 1.3) rake (13.0.6) rdf (1.0.8) addressable (>= 2.2) - redis (5.0.5) + redis (5.0.6) redis-client (>= 0.9.0) - redis-client (0.11.0) + redis-client (0.12.1) connection_pool representable (3.2.0) declarative (< 0.1.0) @@ -208,13 +208,13 @@ GEM faraday (>= 0.17.5, < 3.a) jwt (>= 1.5, < 3.0) multi_json (~> 1.10) - sys-proctable (1.2.7) + sys-proctable (1.3.0) ffi (~> 1.1) systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) trailblazer-option (0.1.2) - tzinfo (2.0.5) + tzinfo (2.0.6) concurrent-ruby (~> 1.0) uber (0.1.0) unf (0.1.4) @@ -222,7 +222,7 @@ GEM unf_ext (0.0.8.2) uuid (2.3.9) macaddr (~> 1.0) - webrick (1.7.0) + webrick (1.8.1) PLATFORMS ruby From 2dfd9dae49b68c771ccb764a9dabc28355ac71f2 Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 31 Jan 2023 11:58:26 -0800 Subject: [PATCH 06/62] Gemfile.lock update --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index d2603d5b..b190ad81 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 75d1a8f302c8af4060bf62fc619f230084da521e + revision: 0d5a4d823d9c9df11ba98292f06971bb9a127235 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: d7425e581d35b265a0263faba7c32c0fdb51cbba + revision: e6372f193488a69ef0c2015017ace8a48274a326 branch: develop specs: ontologies_linked_data (0.0.1) From 7f0589ea6f20ef798d9228766b5d13ea3b0a75e0 Mon Sep 17 00:00:00 2001 From: 
mdorf Date: Wed, 1 Feb 2023 09:06:34 -0800 Subject: [PATCH 07/62] Gemfile.lock update --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index b190ad81..d0ca376e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 0d5a4d823d9c9df11ba98292f06971bb9a127235 + revision: f5484f7f2003a761812d09ea83c00e248034e0e7 branch: develop specs: ncbo_annotator (0.0.1) From 6db9916caf99cb839fc5996eabae4e48a5ac13f2 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 1 Feb 2023 09:36:36 -0800 Subject: [PATCH 08/62] Gemfile had references to develop branch --- Gemfile | 8 ++++---- Gemfile.lock | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Gemfile b/Gemfile index c552979f..747f3b5d 100644 --- a/Gemfile +++ b/Gemfile @@ -21,10 +21,10 @@ gem 'sys-proctable' gem 'cube-ruby', require: 'cube' # NCBO -gem 'goo', github: 'ncbo/goo', branch: 'develop' -gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'develop' -gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'develop' -gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'develop' +gem 'goo', github: 'ncbo/goo', branch: 'master' +gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'master' +gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'master' +gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'master' group :test do gem 'email_spec' diff --git a/Gemfile.lock b/Gemfile.lock index d0ca376e..ac18938b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 15023141f6051d4fa6cba6081d082c720327b0c9 - branch: develop + revision: 919c20dec58375eb8a4dd1aed47864e2ad7bacfa + branch: master specs: goo (0.0.2) addressable (~> 2.8) @@ -15,8 +15,8 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 
f5484f7f2003a761812d09ea83c00e248034e0e7 - branch: develop + revision: fa8fc96c7f55b1a37f2677bde3d2d16fa93712be + branch: master specs: ncbo_annotator (0.0.1) goo @@ -26,8 +26,8 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e6372f193488a69ef0c2015017ace8a48274a326 - branch: develop + revision: b22237dc0753ac3824ad60780ce386162142ebcd + branch: master specs: ontologies_linked_data (0.0.1) activesupport @@ -47,7 +47,7 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 - branch: develop + branch: master specs: sparql-client (1.0.1) json_pure (>= 1.4) From f47a07b7ce284b3397d6a158d5a08c7fd60ef609 Mon Sep 17 00:00:00 2001 From: mdorf Date: Fri, 10 Feb 2023 15:17:00 -0800 Subject: [PATCH 09/62] implemented #64 - ability to generate labels independently of RDF processing (and vise versa) --- Gemfile.lock | 2 +- bin/ncbo_ontology_process | 6 +++--- lib/ncbo_cron/ontology_submission_parser.rb | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index d0ca376e..632dabeb 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: f5484f7f2003a761812d09ea83c00e248034e0e7 + revision: 2ee3915af8aa96be7f5fce825c25c6291c8952e5 branch: develop specs: ncbo_annotator (0.0.1) diff --git a/bin/ncbo_ontology_process b/bin/ncbo_ontology_process index d96f0d87..c3d5237c 100755 --- a/bin/ncbo_ontology_process +++ b/bin/ncbo_ontology_process @@ -31,9 +31,9 @@ opt_parser = OptionParser.new do |opts| end options[:tasks] = NcboCron::Models::OntologySubmissionParser::ACTIONS - opts.on('-t', '--tasks process_rdf,index_search,run_metrics', "Optional comma-separated list of processing tasks to perform. 
Default: #{NcboCron::Models::OntologySubmissionParser::ACTIONS.keys.join(',')}") do |tasks| - t = tasks.split(",").map {|t| t.strip.sub(/^:/, '').to_sym} - options[:tasks].each {|k, _| options[:tasks][k] = false unless t.include?(k)} + opts.on('-t', '--tasks process_rdf,generate_labels=false,index_search,run_metrics', "Optional comma-separated list of processing tasks to perform (or exclude). Default: #{NcboCron::Models::OntologySubmissionParser::ACTIONS.keys.join(',')}") do |tasks| + work_tasks = tasks.split(',').map { |t| t.gsub(/\s+/, '').gsub(/^:/, '') }.select { |t| k = t.split('='); k.length <= 1 || k[1].downcase === 'true' }.map { |t| t.gsub(/\=true$/, '').to_sym } + options[:tasks].each {|k, _| options[:tasks][k] = false unless work_tasks.include?(k)} end options[:logfile] = STDOUT diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index fe7a3e06..dfa7b320 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -10,6 +10,7 @@ class OntologySubmissionParser ACTIONS = { :process_rdf => true, + :generate_labels => true, :index_search => true, :index_properties => true, :run_metrics => true, From dbd4060ffe7454c325065f433c91c92e24068a0c Mon Sep 17 00:00:00 2001 From: mdorf Date: Fri, 10 Feb 2023 15:17:35 -0800 Subject: [PATCH 10/62] Gemfile.lock update --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 632dabeb..84b4eccf 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -117,7 +117,7 @@ GEM ffi (1.15.5) google-apis-analytics_v3 (0.12.0) google-apis-core (>= 0.9.1, < 2.a) - google-apis-core (0.10.0) + google-apis-core (0.11.0) addressable (~> 2.5, >= 2.5.1) googleauth (>= 0.16.2, < 2.a) httpclient (>= 2.8.1, < 3.a) @@ -142,7 +142,7 @@ GEM concurrent-ruby (~> 1.0) json (2.6.3) json_pure (2.6.3) - jwt (2.6.0) + jwt (2.7.0) launchy (2.5.2) addressable (~> 2.8) libxml-ruby (4.0.0) From 
5c6cf910a95c4c990b9e9135ae0e4db292db1159 Mon Sep 17 00:00:00 2001 From: mdorf Date: Mon, 13 Feb 2023 18:57:56 -0800 Subject: [PATCH 11/62] fixed a bug in #64 --- Gemfile.lock | 2 +- bin/ncbo_ontology_process | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 84b4eccf..f230ace8 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e6372f193488a69ef0c2015017ace8a48274a326 + revision: 297f630ee5a35a78b015adf32fdb1e3af59ca652 branch: develop specs: ontologies_linked_data (0.0.1) diff --git a/bin/ncbo_ontology_process b/bin/ncbo_ontology_process index c3d5237c..879e749d 100755 --- a/bin/ncbo_ontology_process +++ b/bin/ncbo_ontology_process @@ -32,8 +32,13 @@ opt_parser = OptionParser.new do |opts| options[:tasks] = NcboCron::Models::OntologySubmissionParser::ACTIONS opts.on('-t', '--tasks process_rdf,generate_labels=false,index_search,run_metrics', "Optional comma-separated list of processing tasks to perform (or exclude). 
Default: #{NcboCron::Models::OntologySubmissionParser::ACTIONS.keys.join(',')}") do |tasks| - work_tasks = tasks.split(',').map { |t| t.gsub(/\s+/, '').gsub(/^:/, '') }.select { |t| k = t.split('='); k.length <= 1 || k[1].downcase === 'true' }.map { |t| t.gsub(/\=true$/, '').to_sym } - options[:tasks].each {|k, _| options[:tasks][k] = false unless work_tasks.include?(k)} + tasks_obj = {} + tasks.split(',').each { |t| + t_arr = t.gsub(/\s+/, '').gsub(/^:/, '').split('=') + tasks_obj[t_arr[0].to_sym] = (t_arr.length <= 1 || t_arr[1].downcase === 'true') + } + tasks_obj[:generate_labels] = true if tasks_obj[:process_rdf] && !tasks_obj.has_key?(:generate_labels) + options[:tasks].each {|k, _| options[:tasks][k] = false unless tasks_obj[k]} end options[:logfile] = STDOUT From c09d76eeb131caf422c18ba49580db1b47df86dc Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 14 Feb 2023 16:07:43 -0800 Subject: [PATCH 12/62] Relocate docker-compose file and update default configs --- .dockerignore | 8 +- .github/workflows/ruby-unit-tests.yml | 4 +- Dockerfile | 2 + config/config.rb.sample | 75 +++++++++---- config/config.test.rb | 63 +++++++---- dip.yml | 54 +++++++++ test/docker-compose.yml => docker-compose.yml | 103 ++++++++++-------- 7 files changed, 211 insertions(+), 98 deletions(-) create mode 100644 dip.yml rename test/docker-compose.yml => docker-compose.yml (52%) diff --git a/.dockerignore b/.dockerignore index c712142f..119eac79 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ # Git -#.git +.git +.github .gitignore # Logs log/* @@ -8,3 +9,8 @@ tmp/* # Editor temp files *.swp *.swo +coverage +create_permissions.log +# Ignore generated test data +test/data/ontology_files/repo/**/* +test/data/tmp/* diff --git a/.github/workflows/ruby-unit-tests.yml b/.github/workflows/ruby-unit-tests.yml index cde331e3..5f0db7e1 100644 --- a/.github/workflows/ruby-unit-tests.yml +++ b/.github/workflows/ruby-unit-tests.yml @@ -8,17 +8,15 @@ jobs: test: strategy: matrix: 
- backend: ['ruby', 'ruby-agraph'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend + backend: ['ncbo_cron', 'ncbo_cron-agraph'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: copy config.rb file from template run: cp config/config.test.rb config/config.rb - name: Build docker-compose - working-directory: ./test run: docker-compose build - name: Run unit tests - working-directory: ./test run: | ci_env=`bash <(curl -s https://codecov.io/env)` docker-compose run $ci_env -e CI --rm ${{ matrix.backend }} bundle exec rake test TESTOPTS='-v' diff --git a/Dockerfile b/Dockerfile index cd191621..dfc03492 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,4 +20,6 @@ ENV BUNDLE_PATH=/srv/ontoportal/bundle RUN bundle install COPY . /srv/ontoportal/ncbo_cron +RUN cp /srv/ontoportal/ncbo_cron/config/config.rb.sample /srv/ontoportal/ncbo_cron/config/config.rb + CMD ["/bin/bash"] diff --git a/config/config.rb.sample b/config/config.rb.sample index 15125224..8d204311 100644 --- a/config/config.rb.sample +++ b/config/config.rb.sample @@ -1,16 +1,42 @@ -LinkedData.config do |config| - config.enable_monitoring = false - config.cube_host = "localhost" - config.goo_host = "localhost" - config.goo_port = 8080 - config.search_server_url = "http://localhost:8983/solr/term_search_core1" - config.property_search_server_url = "http://localhost:8983/solr/prop_search_core1" - config.repository_folder = "./test/data/ontology_files/repo" - config.http_redis_host = "localhost" - config.http_redis_port = 6379 - config.goo_redis_host = "localhost" - config.goo_redis_port = 6379 +# This file is designed to be used for unit testing with docker-compose + +GOO_BACKEND_NAME = ENV.include?("GOO_BACKEND_NAME") ? ENV["GOO_BACKEND_NAME"] : "4store" +GOO_HOST = ENV.include?("GOO_HOST") ? ENV["GOO_HOST"] : "localhost" +GOO_PATH_DATA = ENV.include?("GOO_PATH_DATA") ? 
ENV["GOO_PATH_DATA"] : "/data/" +GOO_PATH_QUERY = ENV.include?("GOO_PATH_QUERY") ? ENV["GOO_PATH_QUERY"] : "/sparql/" +GOO_PATH_UPDATE = ENV.include?("GOO_PATH_UPDATE") ? ENV["GOO_PATH_UPDATE"] : "/update/" +GOO_PORT = ENV.include?("GOO_PORT") ? ENV["GOO_PORT"] : 9000 +MGREP_HOST = ENV.include?("MGREP_HOST") ? ENV["MGREP_HOST"] : "localhost" +MGREP_PORT = ENV.include?("MGREP_PORT") ? ENV["MGREP_PORT"] : 55555 +MGREP_DICT_PATH = ENV.include?("MGREP_DICT_PATH") ? ENV["MGREP_DICT_PATH"] : "./test/data/dictionary.txt" +REDIS_GOO_CACHE_HOST = ENV.include?("REDIS_GOO_CACHE_HOST") ? ENV["REDIS_GOO_CACHE_HOST"] : "localhost" +REDIS_HTTP_CACHE_HOST = ENV.include?("REDIS_HTTP_CACHE_HOST") ? ENV["REDIS_HTTP_CACHE_HOST"] : "localhost" +REDIS_PERSISTENT_HOST = ENV.include?("REDIS_PERSISTENT_HOST") ? ENV["REDIS_PERSISTENT_HOST"] : "localhost" +REDIS_PORT = ENV.include?("REDIS_PORT") ? ENV["REDIS_PORT"] : 6379 +REPORT_PATH = ENV.include?("REPORT_PATH") ? ENV["REPORT_PATH"] : "./test/tmp/ontologies_report.json" +REPOSITORY_FOLDER = ENV.include?("REPOSITORY_FOLDER") ? ENV["REPOSITORY_FOLDER"] : "./test/data/ontology_files/repo" +REST_URL_PREFIX = ENV.include?("REST_URL_PREFIX") ? ENV["REST_URL_PREFIX"] : "http://localhost:9393" +SOLR_PROP_SEARCH_URL = ENV.include?("SOLR_PROP_SEARCH_URL") ? ENV["SOLR_PROP_SEARCH_URL"] : "http://localhost:8983/solr/prop_search_core1" +SOLR_TERM_SEARCH_URL = ENV.include?("SOLR_TERM_SEARCH_URL") ? 
ENV["SOLR_TERM_SEARCH_URL"] : "http://localhost:8983/solr/term_search_core1" +LinkedData.config do |config| + config.goo_backend_name = GOO_BACKEND_NAME.to_s + config.goo_host = GOO_HOST.to_s + config.goo_port = GOO_PORT.to_i + config.goo_path_query = GOO_PATH_QUERY.to_s + config.goo_path_data = GOO_PATH_DATA.to_s + config.goo_path_update = GOO_PATH_UPDATE.to_s + config.goo_redis_host = REDIS_GOO_CACHE_HOST.to_s + config.goo_redis_port = REDIS_PORT.to_i + config.http_redis_host = REDIS_HTTP_CACHE_HOST.to_s + config.http_redis_port = REDIS_PORT.to_i + config.ontology_analytics_redis_host = REDIS_PERSISTENT_HOST.to_s + config.ontology_analytics_redis_port = REDIS_PORT.to_i + config.repository_folder = REPOSITORY_FOLDER.to_s + config.search_server_url = SOLR_TERM_SEARCH_URL.to_s + config.property_search_server_url = SOLR_PROP_SEARCH_URL.to_s +# config.replace_url_prefix = false +# config.rest_url_prefix = REST_URL_PREFIX.to_s # Email notifications. config.enable_notifications = true config.email_sender = "sender@domain.com" # Default sender for emails @@ -19,27 +45,30 @@ LinkedData.config do |config| config.smtp_user = nil config.smtp_password = nil config.smtp_auth_type = :none - config.smtp_domain = "localhost.localhost" + config.smtp_domain = "localhost.localhost" end Annotator.config do |config| - config.mgrep_dictionary_file ||= "./test/tmp/dict" - config.stop_words_default_file ||= "./config/default_stop_words.txt" config.mgrep_host ||= "localhost" - config.mgrep_port ||= 55555 - config.annotator_redis_host ||= "localhost" - config.annotator_redis_port ||= 6379 + config.annotator_redis_host = REDIS_PERSISTENT_HOST.to_s + config.annotator_redis_port = REDIS_PORT.to_i + config.mgrep_host = MGREP_HOST.to_s + config.mgrep_port = MGREP_PORT.to_i + config.mgrep_dictionary_file = MGREP_DICT_PATH.to_s end NcboCron.config do |config| - config.redis_host ||= "localhost" - config.redis_port ||= 6379 + config.redis_host = REDIS_PERSISTENT_HOST.to_s + config.redis_port = 
REDIS_PORT.to_i + # Ontologies Report config + config.ontology_report_path = REPORT_PATH + + # do not deaemonize in docker + config.daemonize = false + config.search_index_all_url = "http://localhost:8983/solr/term_search_core2" config.property_search_index_all_url = "http://localhost:8983/solr/prop_search_core2" - # Ontologies Report config - config.ontology_report_path = "./test/reports/ontologies_report.json" - # Google Analytics config config.analytics_service_account_email_address = "123456789999-sikipho0wk8q0atflrmw62dj4kpwoj3c@developer.gserviceaccount.com" config.analytics_path_to_key_file = "config/bioportal-analytics.p12" diff --git a/config/config.test.rb b/config/config.test.rb index 0729a4b0..84a621ac 100644 --- a/config/config.test.rb +++ b/config/config.test.rb @@ -1,33 +1,42 @@ # This file is designed to be used for unit testing with docker-compose -# -GOO_PATH_QUERY = ENV.include?("GOO_PATH_QUERY") ? ENV["GOO_PATH_QUERY"] : "/sparql/" -GOO_PATH_DATA = ENV.include?("GOO_PATH_DATA") ? ENV["GOO_PATH_DATA"] : "/data/" -GOO_PATH_UPDATE = ENV.include?("GOO_PATH_UPDATE") ? ENV["GOO_PATH_UPDATE"] : "/update/" -GOO_BACKEND_NAME = ENV.include?("GOO_BACKEND_NAME") ? ENV["GOO_BACKEND_NAME"] : "localhost" -GOO_PORT = ENV.include?("GOO_PORT") ? ENV["GOO_PORT"] : 9000 -GOO_HOST = ENV.include?("GOO_HOST") ? ENV["GOO_HOST"] : "localhost" -REDIS_HOST = ENV.include?("REDIS_HOST") ? ENV["REDIS_HOST"] : "localhost" -REDIS_PORT = ENV.include?("REDIS_PORT") ? ENV["REDIS_PORT"] : 6379 -MGREP_HOST = ENV.include?("MGREP_HOST") ? ENV["MGREP_HOST"] : "localhost" -MGREP_PORT = ENV.include?("MGREP_PORT") ? ENV["MGREP_PORT"] : 55555 -SOLR_TERM_SEARCH_URL = ENV.include?("SOLR_TERM_SEARCH_URL") ? ENV["SOLR_TERM_SEARCH_URL"] : "http://localhost:8983/solr/term_search_core1" -SOLR_PROP_SEARCH_URL = ENV.include?("SOLR_PROP_SEARCH_URL") ? ENV["SOLR_PROP_SEARCH_URL"] : "http://localhost:8983/solr/prop_search_core1" + +GOO_BACKEND_NAME = ENV.include?("GOO_BACKEND_NAME") ? 
ENV["GOO_BACKEND_NAME"] : "4store" +GOO_HOST = ENV.include?("GOO_HOST") ? ENV["GOO_HOST"] : "localhost" +GOO_PATH_DATA = ENV.include?("GOO_PATH_DATA") ? ENV["GOO_PATH_DATA"] : "/data/" +GOO_PATH_QUERY = ENV.include?("GOO_PATH_QUERY") ? ENV["GOO_PATH_QUERY"] : "/sparql/" +GOO_PATH_UPDATE = ENV.include?("GOO_PATH_UPDATE") ? ENV["GOO_PATH_UPDATE"] : "/update/" +GOO_PORT = ENV.include?("GOO_PORT") ? ENV["GOO_PORT"] : 9000 +MGREP_HOST = ENV.include?("MGREP_HOST") ? ENV["MGREP_HOST"] : "localhost" +MGREP_PORT = ENV.include?("MGREP_PORT") ? ENV["MGREP_PORT"] : 55555 +MGREP_DICT_PATH = ENV.include?("MGREP_DICT_PATH") ? ENV["MGREP_DICT_PATH"] : "./test/data/dictionary.txt" +REDIS_GOO_CACHE_HOST = ENV.include?("REDIS_GOO_CACHE_HOST") ? ENV["REDIS_GOO_CACHE_HOST"] : "localhost" +REDIS_HTTP_CACHE_HOST = ENV.include?("REDIS_HTTP_CACHE_HOST") ? ENV["REDIS_HTTP_CACHE_HOST"] : "localhost" +REDIS_PERSISTENT_HOST = ENV.include?("REDIS_PERSISTENT_HOST") ? ENV["REDIS_PERSISTENT_HOST"] : "localhost" +REDIS_PORT = ENV.include?("REDIS_PORT") ? ENV["REDIS_PORT"] : 6379 +REPORT_PATH = ENV.include?("REPORT_PATH") ? ENV["REPORT_PATH"] : "./test/tmp/ontologies_report.json" +REPOSITORY_FOLDER = ENV.include?("REPOSITORY_FOLDER") ? ENV["REPOSITORY_FOLDER"] : "./test/data/ontology_files/repo" +REST_URL_PREFIX = ENV.include?("REST_URL_PREFIX") ? ENV["REST_URL_PREFIX"] : "http://localhost:9393" +SOLR_PROP_SEARCH_URL = ENV.include?("SOLR_PROP_SEARCH_URL") ? ENV["SOLR_PROP_SEARCH_URL"] : "http://localhost:8983/solr/prop_search_core1" +SOLR_TERM_SEARCH_URL = ENV.include?("SOLR_TERM_SEARCH_URL") ? 
ENV["SOLR_TERM_SEARCH_URL"] : "http://localhost:8983/solr/term_search_core1" LinkedData.config do |config| + config.goo_backend_name = GOO_BACKEND_NAME.to_s config.goo_host = GOO_HOST.to_s config.goo_port = GOO_PORT.to_i - config.goo_backend_name = GOO_BACKEND_NAME.to_s config.goo_path_query = GOO_PATH_QUERY.to_s config.goo_path_data = GOO_PATH_DATA.to_s config.goo_path_update = GOO_PATH_UPDATE.to_s - config.goo_redis_host = REDIS_HOST.to_s + config.goo_redis_host = REDIS_GOO_CACHE_HOST.to_s config.goo_redis_port = REDIS_PORT.to_i - config.http_redis_host = REDIS_HOST.to_s + config.http_redis_host = REDIS_HTTP_CACHE_HOST.to_s config.http_redis_port = REDIS_PORT.to_i - config.ontology_analytics_redis_host = REDIS_HOST.to_s + config.ontology_analytics_redis_host = REDIS_PERSISTENT_HOST.to_s config.ontology_analytics_redis_port = REDIS_PORT.to_i + config.repository_folder = REPOSITORY_FOLDER.to_s config.search_server_url = SOLR_TERM_SEARCH_URL.to_s config.property_search_server_url = SOLR_PROP_SEARCH_URL.to_s +# config.replace_url_prefix = false +# config.rest_url_prefix = REST_URL_PREFIX.to_s # Email notifications. 
config.enable_notifications = true config.email_sender = "sender@domain.com" # Default sender for emails @@ -40,15 +49,21 @@ end Annotator.config do |config| - config.annotator_redis_host = REDIS_HOST.to_s - config.annotator_redis_port = REDIS_PORT.to_i - config.mgrep_host = MGREP_HOST.to_s - config.mgrep_port = MGREP_PORT.to_i - config.mgrep_dictionary_file = "./test/data/dictionary.txt" + config.annotator_redis_host = REDIS_PERSISTENT_HOST.to_s + config.annotator_redis_port = REDIS_PORT.to_i + config.mgrep_host = MGREP_HOST.to_s + config.mgrep_port = MGREP_PORT.to_i + config.mgrep_dictionary_file = MGREP_DICT_PATH.to_s end +# LinkedData::OntologiesAPI.config do |config| +# config.http_redis_host = REDIS_HTTP_CACHE_HOST.to_s +# config.http_redis_port = REDIS_PORT.to_i +# end +# NcboCron.config do |config| - config.redis_host = REDIS_HOST.to_s + config.daemonize = false + config.redis_host = REDIS_PERSISTENT_HOST.to_s config.redis_port = REDIS_PORT.to_i - config.ontology_report_path = "./test/ontologies_report.json" + config.ontology_report_path = REPORT_PATH end diff --git a/dip.yml b/dip.yml new file mode 100644 index 00000000..3bbe4444 --- /dev/null +++ b/dip.yml @@ -0,0 +1,54 @@ +version: '7.1' + +# Define default environment variables to pass +# to Docker Compose +#environment: +# RAILS_ENV: development + +compose: + files: + - docker-compose.yml + # project_name: ncbo_cron + +interaction: + # This command spins up a ncbo_cron container with the required dependencies (solr, 4store, etc), + # and opens a terminal within it. 
+ runner: + description: Open a Bash shell within a ncbo_cron container (with dependencies up) + service: ncbo_cron + command: /bin/bash + + # Run a container without any dependent services + bash: + description: Run an arbitrary script within a container (or open a shell without deps) + service: ncbo_cron + command: /bin/bash + compose_run_options: [ no-deps ] + + # A shortcut to run Bundler commands + bundle: + description: Run Bundler commands within ncbo_cron container (with depencendies up) + service: ncbo_cron + command: bundle + + # A shortcut to run unit tests + test: + description: Run unit tests with 4store triplestore + service: ncbo_cron + command: bundle exec rake test TESTOPTS='-v' + + test-ag: + description: Run unit tests with AllegroGraph triplestore + service: ncbo_cron-agraph + command: bundle exec rake test TESTOPTS='-v' + + 'redis-cli': + description: Run Redis console + service: redis-ut + command: redis-cli -h redis-ut + +#provision: + #- dip compose down --volumes + #- dip compose up -d solr 4store + #- dip bundle install + #- dip bash -c bin/setup diff --git a/test/docker-compose.yml b/docker-compose.yml similarity index 52% rename from test/docker-compose.yml rename to docker-compose.yml index db957907..ca54bbce 100644 --- a/test/docker-compose.yml +++ b/docker-compose.yml @@ -1,43 +1,47 @@ x-app: &app - build: - context: ../. 
- args: - RUBY_VERSION: '2.7' - # Increase the version number in the image tag every time Dockerfile or its arguments is changed - image: ncbo_cron-dev:0.0.1 - environment: &env - # default bundle config resolves to /usr/local/bundle/config inside of the container - # we are setting it to local app directory if we need to use 'bundle config local' - BUNDLE_APP_CONFIG: /srv/ontoportal/ncbo_cron/.bundle - BUNDLE_PATH: /srv/ontoportal/bundle - COVERAGE: 'true' # enable simplecov code coverage - REDIS_HOST: redis-ut - REDIS_PORT: 6379 - SOLR_TERM_SEARCH_URL: http://solr-ut:8983/solr/term_search_core1 - SOLR_PROP_SEARCH_URL: http://solr-ut:8983/solr/prop_search_core1 - MGREP_HOST: mgrep-ut - MGREP_PORT: 55555 - stdin_open: true - tty: true - command: /bin/bash - volumes: - # bundle volume for hosting gems installed by bundle; it speeds up gem install in local development - - bundle:/srv/ontoportal/bundle - - ../.:/srv/ontoportal/ncbo_cron - # mount directory containing development version of the gems if you need to use 'bundle config local' - #- /Users/alexskr/ontoportal:/Users/alexskr/ontoportal - depends_on: &depends_on - solr-ut: - condition: service_healthy - redis-ut: - condition: service_healthy - mgrep-ut: - condition: service_healthy - + build: + context: . 
+ args: + RUBY_VERSION: '2.7' + # Increase the version number in the image tag every time Dockerfile or its arguments is changed + image: ncbo_cron:0.0.1 + environment: &env + BUNDLE_PATH: /srv/ontoportal/bundle + # default bundle config resolves to /usr/local/bundle/config inside of the container + # we are setting it to local app directory if we need to use 'bundle config local' + BUNDLE_APP_CONFIG: /srv/ontoportal/ncbo_cron/.bundle + COVERAGE: 'true' + GOO_REDIS_HOST: redis-ut + REDIS_GOO_CACHE_HOST: redis-ut + REDIS_HTTP_CACHE_HOST: redis-ut + REDIS_PERSISTENT_HOST: redis-ut + REDIS_PORT: 6379 + SOLR_TERM_SEARCH_URL: http://solr-ut:8983/solr/term_search_core1 + SOLR_PROP_SEARCH_URL: http://solr-ut:8983/solr/prop_search_core1 + MGREP_HOST: mgrep-ut + MGREP_PORT: 55555 + stdin_open: true + tty: true + command: "bundle exec rackup -o 0.0.0.0 --port 9393" + ports: + - 9393:9393 + volumes: + # bundle volume for hosting gems installed by bundle; it helps in local development with gem udpates + - bundle:/srv/ontoportal/bundle + # api code + - .:/srv/ontoportal/ontologies_api + # mount directory containing development version of the gems if you need to use 'bundle config local' + #- /Users/alexskr/ontoportal:/Users/alexskr/ontoportal + depends_on: &depends_on + solr-ut: + condition: service_healthy + redis-ut: + condition: service_healthy + mgrep-ut: + condition: service_healthy services: - # environment wtih 4store backend - ruby: + ncbo_cron: <<: *app environment: <<: *env @@ -54,8 +58,7 @@ services: 4store-ut: condition: service_started - # environment with AllegroGraph backend - ruby-agraph: + ncbo_cron-agraph: <<: *app environment: <<: *env @@ -65,16 +68,15 @@ services: GOO_PATH_QUERY: /repositories/bioportal_test GOO_PATH_DATA: /repositories/bioportal_test/statements GOO_PATH_UPDATE: /repositories/bioportal_test/statements - # profiles: - #- agraph + profiles: + - agraph depends_on: <<: *depends_on agraph-ut: - condition: service_started + condition: 
service_healthy redis-ut: image: redis - command: ["redis-server", "--save", "", "--appendonly", "no"] healthcheck: test: redis-cli ping interval: 10s @@ -83,6 +85,7 @@ services: 4store-ut: image: bde2020/4store + #volume: fourstore:/var/lib/4store command: > bash -c "4s-backend-setup --segments 4 ontoportal_kb && 4s-backend ontoportal_kb @@ -100,7 +103,7 @@ services: retries: 5 mgrep-ut: - image: ontoportal/mgrep-ncbo:0.1 + image: ontoportal/mgrep:0.0.1 healthcheck: test: ["CMD", "nc", "-z", "-v", "localhost", "55555"] start_period: 3s @@ -109,7 +112,7 @@ services: retries: 5 agraph-ut: - image: franzinc/agraph:v7.3.0 + image: franzinc/agraph:v7.3.1 environment: - AGRAPH_SUPER_USER=test - AGRAPH_SUPER_PASSWORD=xyzzy @@ -122,8 +125,14 @@ services: ; agtool users add anonymous ; agtool users grant anonymous root:bioportal_test:rw ; tail -f /agraph/data/agraph.log" - # profiles: - #- agraph + healthcheck: + test: ["CMD-SHELL", "agtool storage-report bioportal_test || exit 1"] + start_period: 10s + interval: 60s + timeout: 5s + retries: 3 + profiles: + - agraph volumes: bundle: From 6c6a5c83ad1a8dcf35bdbb97d95bbda986a9a758 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 14 Feb 2023 17:06:12 -0800 Subject: [PATCH 13/62] Add GH workflow for publishing docker images --- .github/workflows/docker-image.yml | 35 ++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .github/workflows/docker-image.yml diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 00000000..b40cbea1 --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,35 @@ +name: Docker Image CI + +on: + release: + types: [published] + +jobs: + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + 
password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: bioportal/ncbo_cron + + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . + build-args: | + RUBY_VERSION=2.7 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} From 19886086df10af1ffe10598c9d90c4a9e2a2a686 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 28 Mar 2023 16:56:27 -0700 Subject: [PATCH 14/62] use ruby native method for listing files instead of a git function Resolves warning messages when we exclude .git directory from docker image --- Gemfile | 3 +-- ncbo_cron.gemspec | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile b/Gemfile index c552979f..aae5ab76 100644 --- a/Gemfile +++ b/Gemfile @@ -6,7 +6,6 @@ gem 'faraday', '~> 1.9' gem 'ffi' gem "google-apis-analytics_v3" gem 'mail', '2.6.6' -gem 'minitest', '< 5.0' gem 'multi_json' gem 'oj', '~> 2.0' gem 'parseconfig' @@ -28,8 +27,8 @@ gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'develop' group :test do gem 'email_spec' + gem 'minitest', '< 5.0' gem 'simplecov' gem 'simplecov-cobertura' # for codecov.io gem 'test-unit-minitest' end - diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec index 821881d1..b57c1138 100644 --- a/ncbo_cron.gemspec +++ b/ncbo_cron.gemspec @@ -8,7 +8,8 @@ Gem::Specification.new do |gem| gem.summary = %q{} gem.homepage = "https://github.com/ncbo/ncbo_cron" - gem.files = `git ls-files`.split($\) + #gem.files = `git ls-files`.split($\) + gem.files = Dir['**/*'] gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) gem.name = "ncbo_cron" From ed261233f0c9fbfb2d3cb1d24a09345192e395c0 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 28 Mar 2023 17:26:01 -0700 Subject: [PATCH 15/62] remove comment --- 
ncbo_cron.gemspec | 1 - 1 file changed, 1 deletion(-) diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec index b57c1138..ef21761f 100644 --- a/ncbo_cron.gemspec +++ b/ncbo_cron.gemspec @@ -8,7 +8,6 @@ Gem::Specification.new do |gem| gem.summary = %q{} gem.homepage = "https://github.com/ncbo/ncbo_cron" - #gem.files = `git ls-files`.split($\) gem.files = Dir['**/*'] gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) From a3668053f23eb57f44737a44eb7e7f2a936bb2cd Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 28 Mar 2023 17:40:38 -0700 Subject: [PATCH 16/62] capitalize argument in order to be consistent with other scripts --- bin/ncbo_ontology_import | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bin/ncbo_ontology_import b/bin/ncbo_ontology_import index 09b5a0a7..57d63aa1 100755 --- a/bin/ncbo_ontology_import +++ b/bin/ncbo_ontology_import @@ -28,28 +28,24 @@ opt_parser = OptionParser.new do |opts| ontologies_acronyms = acronym end - opts.on('--from url', 'The ontoportal api url source of the ontology') do |url| + opts.on('--from URL', 'The ontoportal api url source of the ontology') do |url| ontology_source = url.to_s end - opts.on('--from-apikey apikey', 'An apikey to acces the ontoportal api') do |apikey| + opts.on('--from-apikey APIKEY', 'An apikey to acces the ontoportal api') do |apikey| source_apikey = apikey.to_s end - opts.on('--admin-user username', 'The target admin user that will submit the ontology') do |user| + opts.on('--admin-user USERNAME', 'The target admin user that will submit the ontology') do |user| username = user.to_s end # Display the help screen, all programs are assumed to have this option. - opts.on( '-h', '--help', 'Display this screen') do + opts.on('-h', '--help', 'Display this screen') do puts opts exit end end opt_parser.parse! -if ontologies_acronyms.empty? 
- puts opts - exit(1) -end # URL of the API and APIKEY of the Ontoportal we want to import data FROM SOURCE_API = ontology_source @@ -132,7 +128,6 @@ def upload_submission(sub_info, ontology) new_submission end - USER = get_user username # get apikey for admin user TARGET_APIKEY = USER.apikey From 6ec117409016b1f417354e6cf8c91efc9a5f7dec Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 28 Mar 2023 17:49:58 -0700 Subject: [PATCH 17/62] add arm/64 platform --- .github/workflows/docker-image.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index b40cbea1..6105c1d8 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -12,6 +12,12 @@ jobs: - name: Check out the repo uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Log in to Docker Hub uses: docker/login-action@v2 with: @@ -28,6 +34,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
+ platforms: linux/amd64,linux/arm64 build-args: | RUBY_VERSION=2.7 push: true From a461322bcc5909fbe35eeb5307d2ff9c0fcc20dc Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 6 Apr 2023 13:41:13 -0700 Subject: [PATCH 18/62] additional error handling for SPAM deletion script, #60 --- lib/ncbo_cron/spam_deletion.rb | 40 +++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/lib/ncbo_cron/spam_deletion.rb b/lib/ncbo_cron/spam_deletion.rb index 8db5568b..819ffa9d 100644 --- a/lib/ncbo_cron/spam_deletion.rb +++ b/lib/ncbo_cron/spam_deletion.rb @@ -27,6 +27,16 @@ def initialize(logger=nil) def run auth_token = Base64.decode64(NcboCron.settings.git_repo_access_token) res = `curl --header 'Authorization: token #{auth_token}' --header 'Accept: application/vnd.github.v3.raw' --location #{FULL_FILE_PATH}` + + begin + error_json = JSON.parse(res) + msg = "\nError while fetching the SPAM user list from #{FULL_FILE_PATH}: #{error_json}" + @logger.error(msg) + puts msg + exit + rescue JSON::ParserError + @logger.info("Successfully downloaded the SPAM user list from #{FULL_FILE_PATH}") + end usernames = res.split(",").map(&:strip) delete_spam(usernames) end @@ -99,25 +109,25 @@ def delete_spam(usernames) @logger.info("Deleting #{delete_prov_classes.length} provisional classes...") @logger.info("Deleting #{delete_users.length} users...") - delete_projects.each {|p| p.delete} - delete_notes.each {|n| n.delete} - delete_reviews.each {|r| r.delete} - delete_ontologies.each {|o| o.delete} - delete_prov_classes.each {|pc| pc.delete} - delete_users.each {|u| u.delete} + # delete_projects.each {|p| p.delete} + # delete_notes.each {|n| n.delete} + # delete_reviews.each {|r| r.delete} + # delete_ontologies.each {|o| o.delete} + # delete_prov_classes.each {|pc| pc.delete} + # delete_users.each {|u| u.delete} end end end end end -# require 'ontologies_linked_data' -# require 'goo' -# require 'ncbo_annotator' -# require 'ncbo_cron/config' -# 
require_relative '../../config/config' -# -# spam_deletion_path = File.join("logs", "spam-deletion.log") -# spam_deletion_logger = Logger.new(spam_deletion_path) -# NcboCron::Models::SpamDeletion.new(spam_deletion_logger).run +require 'ontologies_linked_data' +require 'goo' +require 'ncbo_annotator' +require 'ncbo_cron/config' +require_relative '../../config/config' + +spam_deletion_path = File.join("logs", "spam-deletion.log") +spam_deletion_logger = Logger.new(spam_deletion_path) +NcboCron::Models::SpamDeletion.new(spam_deletion_logger).run # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontology-analytics true --disable-mapping-counts true --disable-ontologies-report true --spam-deletion '14 * * * *' \ No newline at end of file From 3b9fdb0704b19e474ad0642da25b33ef5ea84f40 Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 6 Apr 2023 13:57:03 -0700 Subject: [PATCH 19/62] additional error handling for SPAM deletion script, #60 --- lib/ncbo_cron/spam_deletion.rb | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/ncbo_cron/spam_deletion.rb b/lib/ncbo_cron/spam_deletion.rb index 819ffa9d..e2ec64f8 100644 --- a/lib/ncbo_cron/spam_deletion.rb +++ b/lib/ncbo_cron/spam_deletion.rb @@ -25,7 +25,7 @@ def initialize(logger=nil) end def run - auth_token = Base64.decode64(NcboCron.settings.git_repo_access_token) + auth_token = NcboCron.settings.git_repo_access_token res = `curl --header 'Authorization: token #{auth_token}' --header 'Accept: application/vnd.github.v3.raw' --location #{FULL_FILE_PATH}` begin @@ -109,25 +109,25 @@ def delete_spam(usernames) @logger.info("Deleting #{delete_prov_classes.length} provisional classes...") @logger.info("Deleting #{delete_users.length} users...") - # delete_projects.each {|p| p.delete} - # delete_notes.each {|n| n.delete} - # delete_reviews.each {|r| r.delete} - # delete_ontologies.each {|o| o.delete} - # 
delete_prov_classes.each {|pc| pc.delete} - # delete_users.each {|u| u.delete} + delete_projects.each {|p| p.delete} + delete_notes.each {|n| n.delete} + delete_reviews.each {|r| r.delete} + delete_ontologies.each {|o| o.delete} + delete_prov_classes.each {|pc| pc.delete} + delete_users.each {|u| u.delete} end end end end end -require 'ontologies_linked_data' -require 'goo' -require 'ncbo_annotator' -require 'ncbo_cron/config' -require_relative '../../config/config' - -spam_deletion_path = File.join("logs", "spam-deletion.log") -spam_deletion_logger = Logger.new(spam_deletion_path) -NcboCron::Models::SpamDeletion.new(spam_deletion_logger).run +# require 'ontologies_linked_data' +# require 'goo' +# require 'ncbo_annotator' +# require 'ncbo_cron/config' +# require_relative '../../config/config' +# +# spam_deletion_path = File.join("logs", "spam-deletion.log") +# spam_deletion_logger = Logger.new(spam_deletion_path) +# NcboCron::Models::SpamDeletion.new(spam_deletion_logger).run # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontology-analytics true --disable-mapping-counts true --disable-ontologies-report true --spam-deletion '14 * * * *' \ No newline at end of file From 030930c658c86af0154bb16b3c13a9b585e6ec91 Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 11 Apr 2023 11:14:57 -0700 Subject: [PATCH 20/62] implemented #67 - improved corrupt data and error handling --- Gemfile.lock | 8 +- bin/ncbo_ontology_archive_old_submissions | 108 ++++++++++++++++++-- lib/ncbo_cron/ontology_submission_parser.rb | 2 +- 3 files changed, 103 insertions(+), 15 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index f230ace8..599d878b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 15023141f6051d4fa6cba6081d082c720327b0c9 + revision: b8c7867450ec6ea2d3167eb9d9b1aed5614a1ce3 branch: develop specs: goo (0.0.2) @@ -26,7 +26,7 @@ GIT GIT remote: 
https://github.com/ncbo/ontologies_linked_data.git - revision: 297f630ee5a35a78b015adf32fdb1e3af59ca652 + revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0 branch: develop specs: ontologies_linked_data (0.0.1) @@ -46,7 +46,7 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git - revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 + revision: 55e7dbf858eb571c767bc67868f9af61663859cb branch: develop specs: sparql-client (1.0.1) @@ -235,9 +235,7 @@ GEM webrick (1.8.1) PLATFORMS - ruby x86_64-darwin-18 - x86_64-darwin-21 DEPENDENCIES cube-ruby diff --git a/bin/ncbo_ontology_archive_old_submissions b/bin/ncbo_ontology_archive_old_submissions index 3dc5c87c..535c129e 100755 --- a/bin/ncbo_ontology_archive_old_submissions +++ b/bin/ncbo_ontology_archive_old_submissions @@ -11,31 +11,121 @@ require_relative '../lib/ncbo_cron' config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__)) abort("Please create a config/config.rb file using the config/config.rb.sample as a template") unless config_exists require_relative '../config/config' +require 'optparse' -logfile = 'archive_old_submissions.log' +options = { delete: false } +opt_parser = OptionParser.new do |opts| + # Set a banner, displayed at the top of the help screen. + opts.banner = "Usage: #{File.basename(__FILE__)} [options]" + + options[:logfile] = STDOUT + opts.on( '-l', '--logfile FILE', "Write log to FILE (default is STDOUT)" ) do |filename| + options[:logfile] = filename + end + + # Delete submission if it contains bad data + opts.on( '-d', '--delete', "Delete submissions that contain bad data" ) do + options[:delete] = true + end + + # Display the help screen, all programs are assumed to have this option. + opts.on( '-h', '--help', 'Display this screen' ) do + puts opts + exit + end +end + +opt_parser.parse! 
+logfile = options[:logfile] if File.file?(logfile); File.delete(logfile); end logger = Logger.new(logfile) -options = { process_rdf: false, index_search: false, index_commit: false, - run_metrics: false, reasoning: false, archive: true } +process_actions = { process_rdf: false, generate_labels: false, index_search: false, index_commit: false, + process_annotator: false, diff: false, run_metrics: false, archive: true } onts = LinkedData::Models::Ontology.all onts.each { |ont| ont.bring(:acronym, :submissions) } -onts.sort! { |a,b| a.acronym <=> b.acronym } +onts.sort! { |a, b| a.acronym <=> b.acronym } +bad_submissions = {} onts.each do |ont| latest_sub = ont.latest_submission - if not latest_sub.nil? + + unless latest_sub.nil? id = latest_sub.submissionId subs = ont.submissions - old_subs = subs.reject { |sub| sub.submissionId >= id } - old_subs.sort! { |a,b| a.submissionId <=> b.submissionId } + + old_subs = subs.reject { |sub| + begin + sub.submissionId >= id + rescue => e + msg = "Invalid submission ID detected (String instead of Integer): #{ont.acronym}/#{sub.submissionId} - #{e.class}:\n#{e.backtrace.join("\n")}" + puts msg + logger.error(msg) + + if options[:delete] + sub.delete if options[:delete] + msg = "Deleted submission #{ont.acronym}/#{sub.submissionId} due to invalid Submission ID" + puts msg + logger.error(msg) + end + bad_submissions["#{ont.acronym}/#{sub.submissionId}"] = "Invalid Submission ID" + true + end + } + old_subs.sort! { |a, b| a.submissionId <=> b.submissionId } old_subs.each do |sub| - if not sub.archived? + unless sub.archived? msg = "#{ont.acronym}: found un-archived old submission with ID #{sub.submissionId}." puts msg logger.info msg - NcboCron::Models::OntologySubmissionParser.new.process_submission(logger, sub.id.to_s, options) + + begin + NcboCron::Models::OntologySubmissionParser.new.process_submission(logger, sub.id.to_s, process_actions) + rescue => e + if e.class == Goo::Base::NotValidException + if sub.valid? 
+ msg = "Error archiving submission #{ont.acronym}/#{sub.submissionId} - #{e.class}:\n#{e.backtrace.join("\n")}" + puts msg + logger.error(msg) + bad_submissions["#{ont.acronym}/#{sub.submissionId}"] = "Submission passes valid check but cannot be saved" + else + msg = "Error archiving submission #{ont.acronym}/#{sub.submissionId}:\n#{JSON.pretty_generate(sub.errors)}" + puts msg + logger.error(msg) + + if options[:delete] + sub.delete if options[:delete] + msg = "Deleted submission #{ont.acronym}/#{sub.submissionId} due to invalid data" + puts msg + logger.error(msg) + end + bad_submissions["#{ont.acronym}/#{sub.submissionId}"] = "Submission is not valid to be saved" + end + else + msg = "Error archiving submission #{ont.acronym}/#{sub.submissionId} - #{e.class}:\n#{e.backtrace.join("\n")}" + puts msg + logger.error(msg) + + if options[:delete] && (e.class == Net::HTTPBadResponse || e.class == Errno::ECONNREFUSED) + sub.delete + msg = "Deleted submission #{ont.acronym}/#{sub.submissionId} due to a non-working pull URL" + puts msg + logger.error(msg) + end + bad_submissions["#{ont.acronym}/#{sub.submissionId}"] = "#{e.class} - Runtime error" + end + end end end end end +msg = JSON.pretty_generate(bad_submissions) +puts +puts msg +logger.error(msg) + +msg = "Number of errored submissions: #{bad_submissions.length}" +puts msg +logger.error(msg) + + diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index dfa7b320..a6512312 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -166,7 +166,7 @@ def process_submission(logger, submission_id, actions=ACTIONS) # Check to make sure the file has been downloaded if sub.pullLocation && (!sub.uploadFilePath || !File.exist?(sub.uploadFilePath)) - multi_logger.debug "Pull location found, but no file in the upload file path. Retrying download." 
+ multi_logger.debug "Pull location found (#{sub.pullLocation}, but no file in the upload file path (#{sub.uploadFilePath}. Retrying download." file, filename = sub.download_ontology_file file_location = sub.class.copy_file_repository(sub.ontology.acronym, sub.submissionId, file, filename) file_location = "../" + file_location if file_location.start_with?(".") # relative path fix From 3308da017161b13d9c27c82654cb8aab7455268e Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 11 Apr 2023 11:15:12 -0700 Subject: [PATCH 21/62] Gemfile.lock update --- Gemfile.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 599d878b..3e181fd5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -74,13 +74,13 @@ GEM activesupport (3.2.22.5) i18n (~> 0.6, >= 0.6.4) multi_json (~> 1.0) - addressable (2.8.1) + addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) bcrypt (3.1.18) builder (3.2.4) coderay (1.1.3) - concurrent-ruby (1.2.0) - connection_pool (2.3.0) + concurrent-ruby (1.2.2) + connection_pool (2.4.0) cube-ruby (0.0.3) dante (0.2.0) declarative (0.0.20) @@ -115,8 +115,8 @@ GEM faraday-rack (1.0.0) faraday-retry (1.0.3) ffi (1.15.5) - google-apis-analytics_v3 (0.12.0) - google-apis-core (>= 0.9.1, < 2.a) + google-apis-analytics_v3 (0.13.0) + google-apis-core (>= 0.11.0, < 2.a) google-apis-core (0.11.0) addressable (~> 2.5, >= 2.5.1) googleauth (>= 0.16.2, < 2.a) @@ -126,7 +126,7 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - googleauth (1.3.0) + googleauth (1.5.1) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) @@ -155,7 +155,7 @@ GEM method_source (1.0.0) mime-types (3.4.1) mime-types-data (~> 3.2015) - mime-types-data (3.2022.0105) + mime-types-data (3.2023.0218.1) mini_mime (1.1.2) minitest (4.7.5) mlanett-redis-lock (0.2.7) @@ -175,15 +175,15 @@ GEM coderay (~> 1.1) method_source (~> 1.0) public_suffix (5.0.1) - rack (3.0.4.1) - rack-test (2.0.2) + rack (3.0.7) + rack-test (2.1.0) rack (>= 
1.3) rake (13.0.6) rdf (1.0.8) addressable (>= 2.2) redis (5.0.6) redis-client (>= 0.9.0) - redis-client (0.12.1) + redis-client (0.14.1) connection_pool representable (3.2.0) declarative (< 0.1.0) From 873d8292309c0f74f0737934e5ac972e12d2793d Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 5 May 2023 13:37:48 -0700 Subject: [PATCH 22/62] exclude test/data/dictionary.txt from git commits --- .dockerignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockerignore b/.dockerignore index 119eac79..96c8053c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -12,5 +12,6 @@ tmp/* coverage create_permissions.log # Ignore generated test data +test/data/dictionary.txt test/data/ontology_files/repo/**/* test/data/tmp/* From 419c518adef378b335a595bb4ec762cf06063d7b Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 5 May 2023 13:39:14 -0700 Subject: [PATCH 23/62] update version of solr-ut --- docker-compose.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index ca54bbce..009e33c2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -28,8 +28,8 @@ x-app: &app volumes: # bundle volume for hosting gems installed by bundle; it helps in local development with gem udpates - bundle:/srv/ontoportal/bundle - # api code - - .:/srv/ontoportal/ontologies_api + # ncbo_cron code + - .:/srv/ontoportal/ncbo_cron # mount directory containing development version of the gems if you need to use 'bundle config local' #- /Users/alexskr/ontoportal:/Users/alexskr/ontoportal depends_on: &depends_on @@ -85,6 +85,7 @@ services: 4store-ut: image: bde2020/4store + platform: linux/amd64 #volume: fourstore:/var/lib/4store command: > bash -c "4s-backend-setup --segments 4 ontoportal_kb @@ -94,7 +95,7 @@ services: - 4store solr-ut: - image: ontoportal/solr-ut:0.1 + image: ontoportal/solr-ut:0.0.2 healthcheck: test: ["CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq 
'\"status\":\"OK\"}' || exit 1"] start_period: 3s @@ -104,6 +105,7 @@ services: mgrep-ut: image: ontoportal/mgrep:0.0.1 + platform: linux/amd64 healthcheck: test: ["CMD", "nc", "-z", "-v", "localhost", "55555"] start_period: 3s @@ -113,6 +115,7 @@ services: agraph-ut: image: franzinc/agraph:v7.3.1 + platform: linux/amd64 environment: - AGRAPH_SUPER_USER=test - AGRAPH_SUPER_PASSWORD=xyzzy From 94eed8de9a004a19fe178ef91ede50ec5f40fb8b Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 5 May 2023 13:40:58 -0700 Subject: [PATCH 24/62] Gemfile.lock update --- Gemfile.lock | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 72849d08..e17ffeaa 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: b8c7867450ec6ea2d3167eb9d9b1aed5614a1ce3 + revision: fb203b0396d03c1df61abfcdbc4070787010f052 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 2ee3915af8aa96be7f5fce825c25c6291c8952e5 + revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0 + revision: 1f535313248f27170f7ecdb3450ae8247be735eb branch: develop specs: ontologies_linked_data (0.0.1) @@ -126,7 +126,7 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - googleauth (1.5.1) + googleauth (1.5.2) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) @@ -145,7 +145,7 @@ GEM jwt (2.7.0) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.0.0) + libxml-ruby (4.1.1) logger (1.5.3) macaddr (1.7.2) systemu (~> 2.6.5) @@ -235,6 +235,7 @@ GEM webrick (1.8.1) PLATFORMS + ruby x86_64-darwin-18 x86_64-darwin-21 x86_64-linux From 68dea929b9de90855b360f41b8d3a6d8c550c9c3 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 5 May 2023 14:08:03 
-0700 Subject: [PATCH 25/62] Restore branch specifier to master --- Gemfile | 8 ++++---- Gemfile.lock | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Gemfile b/Gemfile index aae5ab76..e2696023 100644 --- a/Gemfile +++ b/Gemfile @@ -20,10 +20,10 @@ gem 'sys-proctable' gem 'cube-ruby', require: 'cube' # NCBO -gem 'goo', github: 'ncbo/goo', branch: 'develop' -gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'develop' -gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'develop' -gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'develop' +gem 'goo', github: 'ncbo/goo', branch: 'master' +gem 'ncbo_annotator', github: 'ncbo/ncbo_annotator', branch: 'master' +gem 'ontologies_linked_data', github: 'ncbo/ontologies_linked_data', branch: 'master' +gem 'sparql-client', github: 'ncbo/sparql-client', branch: 'master' group :test do gem 'email_spec' diff --git a/Gemfile.lock b/Gemfile.lock index e17ffeaa..324c5a71 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ncbo/goo.git - revision: fb203b0396d03c1df61abfcdbc4070787010f052 - branch: develop + revision: e8816b06244c26088ef0596ddc0623562a75a284 + branch: master specs: goo (0.0.2) addressable (~> 2.8) @@ -15,8 +15,8 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 - branch: develop + revision: 964f0680799421ab24eddc974d9f2995c6c88734 + branch: master specs: ncbo_annotator (0.0.1) goo @@ -26,8 +26,8 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 1f535313248f27170f7ecdb3450ae8247be735eb - branch: develop + revision: d858e2e29e5da84c9741b7508efa235919642e46 + branch: master specs: ontologies_linked_data (0.0.1) activesupport @@ -46,8 +46,8 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git - revision: 55e7dbf858eb571c767bc67868f9af61663859cb - branch: develop + revision: 
fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 + branch: master specs: sparql-client (1.0.1) json_pure (>= 1.4) @@ -267,4 +267,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.9 From 07a1d6540474e6d52d077173c7fcb71748da5aa9 Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 9 May 2023 09:20:56 -0700 Subject: [PATCH 26/62] fixed configuration for the analytics module --- Gemfile.lock | 2 +- bin/ncbo_ontology_annotate_generate_cache | 2 +- lib/ncbo_cron/ontology_analytics.rb | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 72849d08..761ee725 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 2ee3915af8aa96be7f5fce825c25c6291c8952e5 + revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 branch: develop specs: ncbo_annotator (0.0.1) diff --git a/bin/ncbo_ontology_annotate_generate_cache b/bin/ncbo_ontology_annotate_generate_cache index 07286e7c..18399bea 100755 --- a/bin/ncbo_ontology_annotate_generate_cache +++ b/bin/ncbo_ontology_annotate_generate_cache @@ -49,7 +49,7 @@ opt_parser = OptionParser.new do |opts| options[:generate_dictionary] = true end - options[:logfile] = "logs/annotator_cache.log" + options[:logfile] = STDOUT opts.on('-l', '--logfile FILE', "Write log to FILE (default is 'logs/annotator_cache.log').") do |filename| options[:logfile] = filename end diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb index e06fcd77..3e4076b4 100644 --- a/lib/ncbo_cron/ontology_analytics.rb +++ b/lib/ncbo_cron/ontology_analytics.rb @@ -13,7 +13,7 @@ def initialize(logger) end def run - redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) + redis = Redis.new(:host => NcboCron.settings.ontology_analytics_redis_host, :port => NcboCron.settings.ontology_analytics_redis_port) ontology_analytics = fetch_ontology_analytics 
redis.set(ONTOLOGY_ANALYTICS_REDIS_FIELD, Marshal.dump(ontology_analytics)) end @@ -120,7 +120,8 @@ def authenticate_google # require 'ncbo_annotator' # require 'ncbo_cron/config' # require_relative '../../config/config' -# ontology_analytics_log_path = File.join("logs", "ontology-analytics.log") -# ontology_analytics_logger = Logger.new(ontology_analytics_log_path) +# # ontology_analytics_log_path = File.join("logs", "ontology-analytics.log") +# # ontology_analytics_logger = Logger.new(ontology_analytics_log_path) +# ontology_analytics_logger = Logger.new(STDOUT) # NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' From 8e11202c948fd413b4227e993ab838008ab49a33 Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 9 May 2023 09:21:45 -0700 Subject: [PATCH 27/62] Gemfile.lock update --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e17ffeaa..c0ec0478 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: fb203b0396d03c1df61abfcdbc4070787010f052 + revision: b8c7867450ec6ea2d3167eb9d9b1aed5614a1ce3 branch: develop specs: goo (0.0.2) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 1f535313248f27170f7ecdb3450ae8247be735eb + revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0 branch: develop specs: ontologies_linked_data (0.0.1) From 0b9b785320242f727e7222b0632145fcc4584ddf Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 22 Jun 2023 15:44:08 -0700 Subject: [PATCH 28/62] implemented #69 - scheduled annotator dictionary file generation should be a configurable option instead of the default --- Gemfile.lock | 2 +- bin/ncbo_cron | 71 ++++----------------- 
lib/ncbo_cron/config.rb | 20 +----- lib/ncbo_cron/ontology_submission_parser.rb | 7 +- 4 files changed, 19 insertions(+), 81 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index c0ec0478..fad566d7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: b8c7867450ec6ea2d3167eb9d9b1aed5614a1ce3 + revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa branch: develop specs: goo (0.0.2) diff --git a/bin/ncbo_cron b/bin/ncbo_cron index 8d212382..3b7aa063 100755 --- a/bin/ncbo_cron +++ b/bin/ncbo_cron @@ -111,19 +111,9 @@ opt_parser = OptionParser.new do |opts| opts.on("--disable-update-check", "disable check for updated version of Ontoportal (for VMs)", "(default: #{options[:enable_update_check]})") do |v| options[:enable_update_check] = false end - - - - - opts.on("--disable-dictionary-generation", "disable mgrep dictionary generation job", "(default: #{options[:enable_dictionary_generation]})") do |v| - options[:enable_dictionary_generation] = false + opts.on("--enable-dictionary-generation-cron-job", "ENABLE mgrep dictionary generation JOB and DISABLE dictionary generation during ontology processing. 
If this is not passed in, dictionary is generated every time an ontology is processed.", "(default: Dictionary is generated on every ontology processing, CRON job is DISABLED)") do |v| + options[:enable_dictionary_generation_cron_job] = true end - - - - - - opts.on("--disable-obofoundry_sync", "disable OBO Foundry synchronization report", "(default: #{options[:enable_obofoundry_sync]})") do |v| options[:enable_obofoundry_sync] = false end @@ -160,18 +150,10 @@ opt_parser = OptionParser.new do |opts| opts.on("--obofoundry_sync SCHED", String, "cron schedule to run OBO Foundry synchronization report", "(default: #{options[:cron_obofoundry_sync]})") do |c| options[:cron_obofoundry_sync] = c end - - - - - opts.on("--dictionary-generation SCHED", String, "cron schedule to run mgrep dictionary generation job", "(default: #{options[:cron_dictionary_generation]})") do |c| - options[:cron_dictionary_generation] = c + opts.on("--dictionary-generation-cron-job SCHED", String, "cron schedule to run mgrep dictionary generation job (if enabled)", "(default: #{options[:cron_dictionary_generation_cron_job]})") do |c| + options[:cron_dictionary_generation_cron_job] = c end - - - - # Display the help screen, all programs are assumed to have this option. 
opts.on_tail('--help', 'Display this screen') do puts opts @@ -484,49 +466,27 @@ runner.execute do |opts| end end - - - - - - - - # temporary job to generate mgrep dictionary file + # optional job to generate mgrep dictionary file # separate from ontology processing due to # https://github.com/ncbo/ncbo_cron/issues/45 - - if options[:enable_dictionary_generation] + if options[:enable_dictionary_generation_cron_job] dictionary_generation_thread = Thread.new do dictionary_generation_options = options.dup - dictionary_generation_options[:job_name] = "ncbo_cron_dictionary_generation" + dictionary_generation_options[:job_name] = "ncbo_cron_dictionary_generation_cron_job" dictionary_generation_options[:scheduler_type] = :cron - dictionary_generation_options[:cron_schedule] = dictionary_generation_options[:cron_dictionary_generation] - logger.info "Setting up mgrep dictionary generation job with #{dictionary_generation_options[:cron_dictionary_generation]}"; logger.flush + dictionary_generation_options[:cron_schedule] = dictionary_generation_options[:cron_dictionary_generation_cron_job] + logger.info "Setting up mgrep dictionary generation job with #{dictionary_generation_options[:cron_dictionary_generation_cron_job]}"; logger.flush NcboCron::Scheduler.scheduled_locking_job(dictionary_generation_options) do - logger.info "Starting mgrep dictionary generation..."; logger.flush + logger.info "Starting mgrep dictionary generation CRON job..."; logger.flush t0 = Time.now annotator = Annotator::Models::NcboAnnotator.new annotator.generate_dictionary_file() - logger.info "mgrep dictionary generation job completed in #{Time.now - t0} sec."; logger.flush - logger.info "Finished mgrep dictionary generation"; logger.flush + logger.info "mgrep dictionary generation CRON job completed in #{Time.now - t0} sec."; logger.flush + logger.info "Finished mgrep dictionary generation CRON job"; logger.flush end end end - - - - - - - - - - - - - - # Print running child processes require 
'sys/proctable' at_exit do @@ -549,12 +509,5 @@ runner.execute do |opts| mapping_counts_thread.join if mapping_counts_thread update_check_thread.join if update_check_thread obofoundry_sync_thread.join if obofoundry_sync_thread - - - - dictionary_generation_thread.join if dictionary_generation_thread - - - end diff --git a/lib/ncbo_cron/config.rb b/lib/ncbo_cron/config.rb index 49db0fb4..6d3db51e 100644 --- a/lib/ncbo_cron/config.rb +++ b/lib/ncbo_cron/config.rb @@ -40,16 +40,8 @@ def config(&block) @settings.enable_spam_deletion ||= true # enable update check (vor VMs) @settings.enable_update_check ||= true - - - - # enable mgrep dictionary generation job - @settings.enable_dictionary_generation ||= true - - - - + @settings.enable_dictionary_generation_cron_job ||= false # UMLS auto-pull @settings.pull_umls_url ||= "" @@ -85,17 +77,9 @@ def config(&block) @settings.cron_obofoundry_sync ||= "0 8 * * 1,2,3,4,5" # 00 3 * * * - run daily at 3:00AM @settings.cron_update_check ||= "00 3 * * *" - - - - # mgrep dictionary generation schedule # 30 3 * * * - run daily at 3:30AM - @settings.cron_dictionary_generation ||= "30 3 * * *" - - - - + @settings.cron_dictionary_generation_cron_job ||= "30 3 * * *" @settings.log_level ||= :info unless (@settings.log_path && File.exists?(@settings.log_path)) diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index a6512312..34c53930 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -220,10 +220,11 @@ def process_annotator(logger, sub) begin annotator = Annotator::Models::NcboAnnotator.new annotator.create_term_cache_for_submission(logger, sub) - # commenting this action out for now due to a problem with hgetall in redis + # this action only occurs if the CRON dictionary generation job is disabled + # if the CRON dictionary generation job is running, + # the dictionary will NOT be generated on each ontology parsing # see 
https://github.com/ncbo/ncbo_cron/issues/45 for details - # mgrep dictionary generation will occur as a separate CRON task - # annotator.generate_dictionary_file() + annotator.generate_dictionary_file() unless NcboCron.settings.enable_dictionary_generation_cron_job rescue Exception => e logger.error(e.message + "\n" + e.backtrace.join("\n\t")) logger.flush() From 139494e7d02e3d9d808a7f9f302b5499f8a65c60 Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 22 Jun 2023 15:44:53 -0700 Subject: [PATCH 29/62] Gemfile.lock update --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index fad566d7..e8cc5d80 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -76,11 +76,11 @@ GEM multi_json (~> 1.0) addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) - bcrypt (3.1.18) + bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) concurrent-ruby (1.2.2) - connection_pool (2.4.0) + connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) declarative (0.0.20) @@ -142,7 +142,7 @@ GEM concurrent-ruby (~> 1.0) json (2.6.3) json_pure (2.6.3) - jwt (2.7.0) + jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) libxml-ruby (4.1.1) @@ -175,7 +175,7 @@ GEM coderay (~> 1.1) method_source (~> 1.0) public_suffix (5.0.1) - rack (3.0.7) + rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) rake (13.0.6) From 23794ff32f7e36aa7be09707e5d2031a9c2082af Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Fri, 23 Jun 2023 13:46:23 -0700 Subject: [PATCH 30/62] gem update --- Gemfile.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e8cc5d80..44c33f1b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa + revision: fb203b0396d03c1df61abfcdbc4070787010f052 branch: develop specs: goo (0.0.2) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 
79527b94fbb59081ba58281a5cd51ec3448fadb0 + revision: 1f535313248f27170f7ecdb3450ae8247be735eb branch: develop specs: ontologies_linked_data (0.0.1) @@ -126,7 +126,7 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - googleauth (1.5.2) + googleauth (1.6.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) From 44dec8c31a66ac360e02f295dc6eccd68e3a112a Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Mon, 10 Jul 2023 11:23:35 -0700 Subject: [PATCH 31/62] create new rake task for updating purls for all ontologies moved from ontologies_api/fix_purls.rb --- rakelib/purl_management.rake | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 rakelib/purl_management.rake diff --git a/rakelib/purl_management.rake b/rakelib/purl_management.rake new file mode 100644 index 00000000..58cfadd7 --- /dev/null +++ b/rakelib/purl_management.rake @@ -0,0 +1,28 @@ +# Task for updating and adding missing purl for all ontologies +# +desc 'Purl Utilities' +namespace :purl do + require 'bundler/setup' + # Configure the process for the current cron configuration. 
+ require_relative '../lib/ncbo_cron' + config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__)) + abort('Please create a config/config.rb file using the config/config.rb.sample as a template') unless config_exists + require_relative '../config/config' + + desc 'update purl for all ontologies' + task :update_all do + purl_client = LinkedData::Purl::Client.new + LinkedData::Models::Ontology.all.each do |ont| + ont.bring(:acronym) + acronym = ont.acronym + + if purl_client.purl_exists(acronym) + puts "#{acronym} exists" + purl_client.fix_purl(acronym) + else + puts "#{acronym} DOES NOT exist" + purl_client.create_purl(acronym) + end + end + end +end From f5399c5b4e4df4ff6033460ebea231cbb1ed23ee Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 12 Jul 2023 17:32:08 -0700 Subject: [PATCH 32/62] initial implementation of #70 - Google Analytics v4 Update Compatibility Issue --- .gitignore | 1 + Gemfile | 2 +- Gemfile.lock | 58 ++++--- config/config.rb.sample | 16 +- lib/ncbo_cron/ontology_analytics.rb | 259 +++++++++++++++++++--------- ncbo_cron.gemspec | 2 +- 6 files changed, 222 insertions(+), 116 deletions(-) diff --git a/.gitignore b/.gitignore index 9170162c..f9401dc1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ config/config.rb config/config_*.rb config/*.p12 +config/*.json projectFilesBackup/ .ruby-version repo* diff --git a/Gemfile b/Gemfile index aae5ab76..d680bef0 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ gemspec gem 'faraday', '~> 1.9' gem 'ffi' -gem "google-apis-analytics_v3" +gem 'google-analytics-data' gem 'mail', '2.6.6' gem 'multi_json' gem 'oj', '~> 2.0' diff --git a/Gemfile.lock b/Gemfile.lock index e8cc5d80..514e3b0c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -60,7 +60,7 @@ PATH ncbo_cron (0.0.1) dante goo - google-apis-analytics_v3 + google-analytics-data mlanett-redis-lock multi_json ncbo_annotator @@ -83,7 +83,6 @@ GEM connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) - declarative (0.0.20) 
docile (1.4.0) domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) @@ -115,29 +114,47 @@ GEM faraday-rack (1.0.0) faraday-retry (1.0.3) ffi (1.15.5) - google-apis-analytics_v3 (0.13.0) - google-apis-core (>= 0.11.0, < 2.a) - google-apis-core (0.11.0) - addressable (~> 2.5, >= 2.5.1) - googleauth (>= 0.16.2, < 2.a) - httpclient (>= 2.8.1, < 3.a) - mini_mime (~> 1.0) - representable (~> 3.0) - retriable (>= 2.0, < 4.a) - rexml - webrick - googleauth (1.5.2) + gapic-common (0.19.1) + faraday (>= 1.9, < 3.a) + faraday-retry (>= 1.0, < 3.a) + google-protobuf (~> 3.14) + googleapis-common-protos (>= 1.3.12, < 2.a) + googleapis-common-protos-types (>= 1.3.1, < 2.a) + googleauth (~> 1.0) + grpc (~> 1.36) + google-analytics-data (0.4.0) + google-analytics-data-v1beta (>= 0.7, < 2.a) + google-cloud-core (~> 1.6) + google-analytics-data-v1beta (0.8.0) + gapic-common (>= 0.19.1, < 2.a) + google-cloud-errors (~> 1.0) + google-cloud-core (1.6.0) + google-cloud-env (~> 1.0) + google-cloud-errors (~> 1.0) + google-cloud-env (1.6.0) + faraday (>= 0.17.3, < 3.0) + google-cloud-errors (1.3.1) + google-protobuf (3.23.4-x86_64-darwin) + googleapis-common-protos (1.4.0) + google-protobuf (~> 3.14) + googleapis-common-protos-types (~> 1.2) + grpc (~> 1.27) + googleapis-common-protos-types (1.6.0) + google-protobuf (~> 3.14) + googleauth (1.6.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) + grpc (1.56.0-x86_64-darwin) + google-protobuf (~> 3.23) + googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) http-cookie (1.0.5) domain_name (~> 0.5) - httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) json (2.6.3) @@ -156,7 +173,6 @@ GEM mime-types (3.4.1) mime-types-data (~> 3.2015) mime-types-data (3.2023.0218.1) - mini_mime (1.1.2) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -185,16 +201,11 @@ GEM redis-client (>= 0.9.0) redis-client (0.14.1) connection_pool - 
representable (3.2.0) - declarative (< 0.1.0) - trailblazer-option (>= 0.1.1, < 0.2.0) - uber (< 0.2.0) rest-client (2.1.0) http-accept (>= 1.7.0, < 2.0) http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) - retriable (3.1.2) rexml (3.2.5) rsolr (2.5.0) builder (>= 2.1.2) @@ -223,16 +234,13 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) - trailblazer-option (0.1.2) tzinfo (2.0.6) concurrent-ruby (~> 1.0) - uber (0.1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.2) uuid (2.3.9) macaddr (~> 1.0) - webrick (1.8.1) PLATFORMS ruby @@ -246,7 +254,7 @@ DEPENDENCIES faraday (~> 1.9) ffi goo! - google-apis-analytics_v3 + google-analytics-data mail (= 2.6.6) minitest (< 5.0) multi_json diff --git a/config/config.rb.sample b/config/config.rb.sample index 8d204311..668c7a0c 100644 --- a/config/config.rb.sample +++ b/config/config.rb.sample @@ -69,14 +69,14 @@ NcboCron.config do |config| config.search_index_all_url = "http://localhost:8983/solr/term_search_core2" config.property_search_index_all_url = "http://localhost:8983/solr/prop_search_core2" - # Google Analytics config - config.analytics_service_account_email_address = "123456789999-sikipho0wk8q0atflrmw62dj4kpwoj3c@developer.gserviceaccount.com" - config.analytics_path_to_key_file = "config/bioportal-analytics.p12" - config.analytics_profile_id = "ga:1234567" - config.analytics_app_name = "BioPortal" - config.analytics_app_version = "1.0.0" - config.analytics_start_date = "2013-10-01" - config.analytics_filter_str = "ga:networkLocation!@stanford;ga:networkLocation!@amazon" + # Google Analytics GA4 config + config.analytics_path_to_key_file = "config/your_analytics_key.json" + config.analytics_property_id = "123456789" + # path to the Universal Analytics data, which stopped collecting on June 1st, 2023 + config.analytics_path_to_ua_data_file = "data/your_ua_data.json" + # path to the file that will hold your Google Analytics data + # this is in addition to storing it in Redis + 
config.analytics_path_to_ga_data_file = "data/your_ga_data.json" # this is a Base64.encode64 encoded personal access token # you need to run Base64.decode64 on it before using it in your code diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb index 3e4076b4..334da43e 100644 --- a/lib/ncbo_cron/ontology_analytics.rb +++ b/lib/ncbo_cron/ontology_analytics.rb @@ -1,12 +1,16 @@ require 'logger' -require 'google/apis/analytics_v3' -require 'google/api_client/auth/key_utils' +require 'json' +require 'benchmark' +require 'google/analytics/data' + module NcboCron module Models class OntologyAnalytics - ONTOLOGY_ANALYTICS_REDIS_FIELD = "ontology_analytics" + ONTOLOGY_ANALYTICS_REDIS_FIELD = 'ontology_analytics' + UA_START_DATE = '2013-10-01' + GA4_START_DATE = '2023-06-01' def initialize(logger) @logger = logger @@ -15,103 +19,196 @@ def initialize(logger) def run redis = Redis.new(:host => NcboCron.settings.ontology_analytics_redis_host, :port => NcboCron.settings.ontology_analytics_redis_port) ontology_analytics = fetch_ontology_analytics + File.open(NcboCron.settings.analytics_path_to_ga_data_file, 'w') do |f| + f.write(ontology_analytics.to_json) + end redis.set(ONTOLOGY_ANALYTICS_REDIS_FIELD, Marshal.dump(ontology_analytics)) end def fetch_ontology_analytics - google_client = authenticate_google - aggregated_results = Hash.new - start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013 - ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} - # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"] - filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? "" : ";#{NcboCron.settings.analytics_filter_str}" - - ont_acronyms.each do |acronym| + @logger.info "Starting Google Analytics refresh..." 
+ @logger.flush + full_data = nil + + time = Benchmark.realtime do max_results = 10000 - num_results = 10000 - start_index = 1 - results = nil - - loop do - results = google_client.get_ga_data( - ids = NcboCron.settings.analytics_profile_id, - start_date = NcboCron.settings.analytics_start_date, - end_date = Date.today.to_s, - metrics = 'ga:pageviews', - { - dimensions: 'ga:pagePath,ga:year,ga:month', - filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}", - start_index: start_index, - max_results: max_results - } - ) - results.rows ||= [] - start_index += max_results - num_results = results.rows.length - @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" - @logger.flush - - results.rows.each do |row| - if aggregated_results.has_key?(acronym) - # year - if aggregated_results[acronym].has_key?(row[1].to_i) - # month - if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i) - aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i + aggregated_results = Hash.new + + @logger.info "Fetching all ontology acronyms from backend..." + @logger.flush + ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} + # ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"] + + @logger.info "Authenticating with the Google Analytics Endpoint..." 
+ @logger.flush + google_client = authenticate_google + + date_range = Google::Analytics::Data::V1beta::DateRange.new( + start_date: GA4_START_DATE, + end_date: Date.today.to_s + ) + metrics_page_views = Google::Analytics::Data::V1beta::Metric.new( + name: "screenPageViews" + ) + dimension_path = Google::Analytics::Data::V1beta::Dimension.new( + name: "pagePath" + ) + dimension_year = Google::Analytics::Data::V1beta::Dimension.new( + name: "year" + ) + dimension_month = Google::Analytics::Data::V1beta::Dimension.new( + name: "month" + ) + string_filter = Google::Analytics::Data::V1beta::Filter::StringFilter.new( + match_type: Google::Analytics::Data::V1beta::Filter::StringFilter::MatchType::FULL_REGEXP + ) + filter = Google::Analytics::Data::V1beta::Filter.new( + field_name: "pagePath", + string_filter: string_filter + ) + filter_expression = Google::Analytics::Data::V1beta::FilterExpression.new( + filter: filter + ) + order_year = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new( + dimension_name: "year" + ) + orderby_year = Google::Analytics::Data::V1beta::OrderBy.new( + desc: false, + dimension: order_year + ) + order_month = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new( + dimension_name: "month" + ) + orderby_month = Google::Analytics::Data::V1beta::OrderBy.new( + desc: false, + dimension: order_month + ) + @logger.info "Fetching GA4 analytics for all ontologies..." 
+ @logger.flush + + ont_acronyms.each do |acronym| + start_index = 0 + string_filter.value = "^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$" + + loop do + request = Google::Analytics::Data::V1beta::RunReportRequest.new( + property: "properties/#{NcboCron.settings.analytics_property_id}", + metrics: [metrics_page_views], + dimension_filter: filter_expression, + dimensions: [dimension_path, dimension_year, dimension_month], + date_ranges: [date_range], + order_bys: [orderby_year, orderby_month], + offset: start_index, + limit: max_results + ) + response = google_client.run_report request + + response.rows ||= [] + start_index += max_results + num_results = response.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + + response.rows.each do |row| + row_h = row.to_h + year_month_hits = row_h[:dimension_values].map.with_index { + |v, i| i > 0 ? v[:value].to_i.to_s : row_h[:metric_values][0][:value].to_i + }.rotate(1) + + if aggregated_results.has_key?(acronym) + # year + if aggregated_results[acronym].has_key?(year_month_hits[0]) + # month + if aggregated_results[acronym][year_month_hits[0]].has_key?(year_month_hits[1]) + aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] += year_month_hits[2] + else + aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] + end else - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + aggregated_results[acronym][year_month_hits[0]] = Hash.new + aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] end else - aggregated_results[acronym][row[1].to_i] = Hash.new - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + aggregated_results[acronym] = Hash.new + aggregated_results[acronym][year_month_hits[0]] = Hash.new + aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] end - else - 
aggregated_results[acronym] = Hash.new - aggregated_results[acronym][row[1].to_i] = Hash.new - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i end - end + break if num_results < max_results + end # loop + end # ont_acronyms + @logger.info "Refresh complete, merging GA4 and UA data..." + @logger.flush + full_data = merge_ga4_ua_data(aggregated_results) + @logger.info "Merged" + @logger.flush + end # Benchmark.realtime + @logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes." + @logger.flush + full_data + end - if num_results < max_results - # fill up non existent years - (start_year..Date.today.year).each do |y| - aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? - aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y) - end - # fill up non existent months with zeros - (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } } - break + def merge_ga4_ua_data(ga4_data) + ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) + ua_data = JSON.parse(ua_data_file) + ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s + ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s + + # add up hits for June of 2023 (the only intersecting month between UA and GA4) + ua_data.each do |acronym, _| + if ga4_data.has_key?(acronym) + if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) + ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += + ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] + # delete data for June of 2023 from ga4_data to avoid overwriting when merging + ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) end end end + # merge ua and ga4 data + merged_data = ua_data.deep_merge(ga4_data) + # fill missing years and months + fill_missing_data(merged_data) + # sort acronyms, years and 
months + sort_ga_data(merged_data) + end - @logger.info "Completed ontology analytics refresh..." - @logger.flush + def fill_missing_data(ga_data) + # fill up non existent years + start_year = Date.parse(UA_START_DATE).year + + ga_data.each do |acronym, _| + (start_year..Date.today.year).each do |y| + ga_data[acronym] = Hash.new if ga_data[acronym].nil? + ga_data[acronym][y.to_s] = Hash.new unless ga_data[acronym].has_key?(y.to_s) + end + # fill up non existent months with zeros + (1..12).each { |n| ga_data[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } + end + end - aggregated_results + def sort_ga_data(ga_data) + ga_data.transform_values { |value| + value.transform_values { |val| + val.sort_by { |key, _| key.to_i }.to_h + }.sort_by { |k, _| k.to_i }.to_h + }.sort.to_h end def authenticate_google - Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name - Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version - # enable google api call retries in order to - # minigate analytics processing failure due to ocasional google api timeouts and other outages - Google::Apis::RequestOptions.default.retries = 5 - # uncoment to enable logging for debugging purposes - # Google::Apis.logger.level = Logger::DEBUG - # Google::Apis.logger = @logger - client = Google::Apis::AnalyticsV3::AnalyticsService.new - key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_key_file, 'notasecret') - client.authorization = Signet::OAuth2::Client.new( - :token_credential_uri => 'https://accounts.google.com/o/oauth2/token', - :audience => 'https://accounts.google.com/o/oauth2/token', - :scope => 'https://www.googleapis.com/auth/analytics.readonly', - :issuer => NcboCron.settings.analytics_service_account_email_address, - :signing_key => key - ).tap { |auth| auth.fetch_access_token! 
} - client + Google::Analytics::Data.analytics_data do |config| + config.credentials = NcboCron.settings.analytics_path_to_key_file + end end - end + end # class + + end +end + +class ::Hash + def deep_merge(second) + merger = proc { |key, v1, v2| Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : v2 } + self.merge(second, &merger) end end @@ -124,4 +221,4 @@ def authenticate_google # # ontology_analytics_logger = Logger.new(ontology_analytics_log_path) # ontology_analytics_logger = Logger.new(STDOUT) # NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run -# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' +# # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec index ef21761f..c8faa03d 100644 --- a/ncbo_cron.gemspec +++ b/ncbo_cron.gemspec @@ -16,7 +16,7 @@ Gem::Specification.new do |gem| gem.add_dependency("dante") gem.add_dependency("goo") - gem.add_dependency("google-apis-analytics_v3") + gem.add_dependency("google-analytics-data") gem.add_dependency("mlanett-redis-lock") gem.add_dependency("multi_json") gem.add_dependency("ncbo_annotator") From 6bb0c23ed9b116c458fdfda6017011698c66d587 Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 13 Jul 2023 08:59:11 -0700 Subject: [PATCH 33/62] added the /data folder to ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f9401dc1..ccf97ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ config/config.rb config/config_*.rb config/*.p12 config/*.json +data/ projectFilesBackup/ .ruby-version repo* From 8c8ddb6c7f9f1d2584a5fb3220f1d2aa37b66cb2 Mon 
Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Thu, 20 Jul 2023 14:16:42 -0700 Subject: [PATCH 34/62] update gems --- Gemfile.lock | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index b00ea5d6..b495b471 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa + revision: fb203b0396d03c1df61abfcdbc4070787010f052 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 + revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0 + revision: 13bb138aa40ca72bfafc91a1a7416c162ba8d325 branch: develop specs: ontologies_linked_data (0.0.1) @@ -141,22 +141,22 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.6.0) + googleapis-common-protos-types (1.7.0) google-protobuf (~> 3.14) - googleauth (1.6.0) + googleauth (1.7.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.56.0) + grpc (1.56.2) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.0-x86_64-darwin) + grpc (1.56.2-x86_64-darwin) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.0-x86_64-linux) + grpc (1.56.2-x86_64-linux) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) @@ -283,4 +283,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.17 From dad8832e5f650bb06ae9dd0ff7ce1cc285d34cfa Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 26 Jul 2023 15:20:43 -0700 Subject: [PATCH 35/62] Gemfile.lock update --- 
Gemfile.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index b00ea5d6..6922cbd9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 79527b94fbb59081ba58281a5cd51ec3448fadb0 + revision: 13bb138aa40ca72bfafc91a1a7416c162ba8d325 branch: develop specs: ontologies_linked_data (0.0.1) @@ -141,22 +141,22 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.6.0) + googleapis-common-protos-types (1.7.0) google-protobuf (~> 3.14) - googleauth (1.6.0) + googleauth (1.7.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) memoist (~> 0.16) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.56.0) + grpc (1.56.2) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.0-x86_64-darwin) + grpc (1.56.2-x86_64-darwin) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.0-x86_64-linux) + grpc (1.56.2-x86_64-linux) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) From 29dc7618a672c42f1476304a3cde8115acc83236 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 26 Jul 2023 15:21:31 -0700 Subject: [PATCH 36/62] Gemfile.lock update --- Gemfile.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index b495b471..6922cbd9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: fb203b0396d03c1df61abfcdbc4070787010f052 + revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 + revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 branch: develop specs: ncbo_annotator (0.0.1) @@ -283,4 +283,4 @@ 
DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.4.17 + 2.3.15 From 758f199d1536b40c0725bf44f2db0151b14305c7 Mon Sep 17 00:00:00 2001 From: Jennifer Vendetti Date: Mon, 31 Jul 2023 15:08:06 -0700 Subject: [PATCH 37/62] Gemfile.lock update --- Gemfile.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6922cbd9..e5e8fcab 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa + revision: 8e80a4f842331e874055aec6a507ca5e3be4e3a4 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 9fd2649b66d3bd6ef7666729d9f91087aa406266 + revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 13bb138aa40ca72bfafc91a1a7416c162ba8d325 + revision: afda3aacfdd0e5e33e5e5beae1dffd79d4980a30 branch: develop specs: ontologies_linked_data (0.0.1) @@ -214,7 +214,7 @@ GEM http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) - rexml (3.2.5) + rexml (3.2.6) rsolr (2.5.0) builder (>= 2.1.2) faraday (>= 0.9, < 3, != 2.0.0) @@ -283,4 +283,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.3.26 From 5e6a8f4c69470fc0155a8149c3dbf281b1e679d2 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Mon, 31 Jul 2023 21:59:17 -0700 Subject: [PATCH 38/62] use patched version of agraph v7.3.1 --- .github/workflows/ruby-unit-tests.yml | 1 + Gemfile.lock | 4 ++-- docker-compose.yml | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ruby-unit-tests.yml b/.github/workflows/ruby-unit-tests.yml index 5f0db7e1..b61ce745 100644 --- a/.github/workflows/ruby-unit-tests.yml +++ b/.github/workflows/ruby-unit-tests.yml @@ -7,6 +7,7 @@ on: jobs: test: strategy: + fail-fast: false matrix: 
backend: ['ncbo_cron', 'ncbo_cron-agraph'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend runs-on: ubuntu-latest diff --git a/Gemfile.lock b/Gemfile.lock index e5e8fcab..04ed607b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: afda3aacfdd0e5e33e5e5beae1dffd79d4980a30 + revision: df675c44c81332a0f894b86338561b51c7f721b7 branch: develop specs: ontologies_linked_data (0.0.1) @@ -283,4 +283,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.26 + 2.4.17 diff --git a/docker-compose.yml b/docker-compose.yml index 009e33c2..0045ce12 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -114,7 +114,8 @@ services: retries: 5 agraph-ut: - image: franzinc/agraph:v7.3.1 + #image: franzinc/agraph:v7.3.1 + image: ontoportal/agraph:v7.3.1-patch1 platform: linux/amd64 environment: - AGRAPH_SUPER_USER=test From 1f81bdba54351e990047cbd6d1294c96ff0a8c6c Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Mon, 31 Jul 2023 22:11:47 -0700 Subject: [PATCH 39/62] unpin faraday gem --- Gemfile | 3 +-- Gemfile.lock | 33 +++++++-------------------------- 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/Gemfile b/Gemfile index d680bef0..a2c93e43 100644 --- a/Gemfile +++ b/Gemfile @@ -2,12 +2,11 @@ source 'https://rubygems.org' gemspec -gem 'faraday', '~> 1.9' gem 'ffi' gem 'google-analytics-data' gem 'mail', '2.6.6' gem 'multi_json' -gem 'oj', '~> 2.0' +gem 'oj', '~> 3.0' gem 'parseconfig' gem 'pony' gem 'pry' diff --git a/Gemfile.lock b/Gemfile.lock index 04ed607b..b1921b85 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -90,29 +90,12 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (1.10.3) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron 
(~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) + faraday (2.7.10) + faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.4) - multipart-post (~> 2) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) + faraday-net_http (3.0.2) + faraday-retry (2.2.0) + faraday (~> 2.0) ffi (1.15.5) gapic-common (0.19.1) faraday (>= 1.9, < 3.a) @@ -185,10 +168,9 @@ GEM mlanett-redis-lock (0.2.7) redis multi_json (1.15.0) - multipart-post (2.3.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (2.18.5) + oj (3.15.1) omni_logger (0.1.4) logger os (1.1.4) @@ -259,7 +241,6 @@ PLATFORMS DEPENDENCIES cube-ruby email_spec - faraday (~> 1.9) ffi goo! google-analytics-data @@ -268,7 +249,7 @@ DEPENDENCIES multi_json ncbo_annotator! ncbo_cron! - oj (~> 2.0) + oj (~> 3.0) ontologies_linked_data! 
parseconfig pony From 79104ab2d0de31a8b2612e0eec09749cb9335a26 Mon Sep 17 00:00:00 2001 From: mdorf Date: Mon, 7 Aug 2023 13:34:35 -0700 Subject: [PATCH 40/62] A chnage to reference Analytics Redis from LinkedData block --- Gemfile.lock | 2 +- lib/ncbo_cron/ontology_analytics.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6922cbd9..dbd979f0 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 8a0c14a6e6942b20749894806f1f1f512f9afcfa + revision: a3a5f20a75482b4d9e5ffe31d66831855422867a branch: develop specs: goo (0.0.2) diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb index 334da43e..3a91b813 100644 --- a/lib/ncbo_cron/ontology_analytics.rb +++ b/lib/ncbo_cron/ontology_analytics.rb @@ -17,7 +17,7 @@ def initialize(logger) end def run - redis = Redis.new(:host => NcboCron.settings.ontology_analytics_redis_host, :port => NcboCron.settings.ontology_analytics_redis_port) + redis = Redis.new(:host => LinkedData.settings.ontology_analytics_redis_host, :port => LinkedData.settings.ontology_analytics_redis_port) ontology_analytics = fetch_ontology_analytics File.open(NcboCron.settings.analytics_path_to_ga_data_file, 'w') do |f| f.write(ontology_analytics.to_json) From 656b3ac6ae66c5bc60cb9d97d5ed0009b1b703e0 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 8 Aug 2023 10:49:09 -0700 Subject: [PATCH 41/62] Gemfile.lock update --- Gemfile.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 3a21d8ce..d8205040 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: a3a5f20a75482b4d9e5ffe31d66831855422867a + revision: 20801d66769854e254ec85ef5c14b5d87833e210 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 
9fd2649b66d3bd6ef7666729d9f91087aa406266 + revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 13bb138aa40ca72bfafc91a1a7416c162ba8d325 + revision: 98bb4f71a0714a31c13de99646145bb6af5d898c branch: develop specs: ontologies_linked_data (0.0.1) @@ -124,8 +124,8 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.7.0) - google-protobuf (~> 3.14) + googleapis-common-protos-types (1.8.0) + google-protobuf (~> 3.18) googleauth (1.7.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) @@ -163,7 +163,7 @@ GEM method_source (1.0.0) mime-types (3.5.0) mime-types-data (~> 3.2015) - mime-types-data (3.2023.0218.1) + mime-types-data (3.2023.0808) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -264,4 +264,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.17 From ed14911ccb28375298c63e7ca1b388ed0c638abb Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 7 Sep 2023 16:46:06 -0700 Subject: [PATCH 42/62] Gemfile.lock update --- Gemfile.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index d8205040..92164456 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 20801d66769854e254ec85ef5c14b5d87833e210 + revision: 911d71aefe433314d11398445e3856fca503b9c1 branch: develop specs: goo (0.0.2) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 98bb4f71a0714a31c13de99646145bb6af5d898c + revision: e33a0e451f8a8226d98291168e45b46d7065e670 branch: develop specs: ontologies_linked_data (0.0.1) @@ -97,7 +97,7 @@ GEM faraday-retry (2.2.0) faraday (~> 2.0) ffi (1.15.5) - gapic-common (0.19.1) + gapic-common (0.20.0) faraday (>= 1.9, < 3.a) faraday-retry (>= 1.0, < 3.a) 
google-protobuf (~> 3.14) @@ -108,7 +108,7 @@ GEM google-analytics-data (0.4.0) google-analytics-data-v1beta (>= 0.7, < 2.a) google-cloud-core (~> 1.6) - google-analytics-data-v1beta (0.8.0) + google-analytics-data-v1beta (0.8.1) gapic-common (>= 0.19.1, < 2.a) google-cloud-errors (~> 1.0) google-cloud-core (1.6.0) @@ -117,9 +117,9 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.23.4) - google-protobuf (3.23.4-x86_64-darwin) - google-protobuf (3.23.4-x86_64-linux) + google-protobuf (3.24.3) + google-protobuf (3.24.3-x86_64-darwin) + google-protobuf (3.24.3-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) @@ -133,13 +133,13 @@ GEM multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.56.2) + grpc (1.57.0) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.2-x86_64-darwin) + grpc (1.57.0-x86_64-darwin) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.56.2-x86_64-linux) + grpc (1.57.0-x86_64-linux) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) @@ -161,7 +161,7 @@ GEM mime-types (>= 1.16, < 4) memoist (0.16.2) method_source (1.0.0) - mime-types (3.5.0) + mime-types (3.5.1) mime-types-data (~> 3.2015) mime-types-data (3.2023.0808) minitest (4.7.5) @@ -170,7 +170,7 @@ GEM multi_json (1.15.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (3.15.1) + oj (3.16.1) omni_logger (0.1.4) logger os (1.1.4) @@ -187,9 +187,9 @@ GEM rake (13.0.6) rdf (1.0.8) addressable (>= 2.2) - redis (5.0.6) + redis (5.0.7) redis-client (>= 0.9.0) - redis-client (0.15.0) + redis-client (0.17.0) connection_pool rest-client (2.1.0) http-accept (>= 1.7.0, < 2.0) @@ -205,7 +205,7 @@ GEM rubyzip (2.3.2) rufus-scheduler (2.0.24) tzinfo (>= 0.3.22) - signet (0.17.0) + signet (0.18.0) addressable (~> 2.8) faraday (>= 0.17.5, < 3.a) jwt (>= 1.5, < 3.0) @@ -264,4 +264,4 
@@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.4.17 + 2.3.15 From 55ac1cad6a14dd7cb5264d4f53801a05537d9cb8 Mon Sep 17 00:00:00 2001 From: mdorf Date: Thu, 7 Sep 2023 17:00:35 -0700 Subject: [PATCH 43/62] Gemfile.lock update --- Gemfile.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index c849962b..776ef5d1 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 911d71aefe433314d11398445e3856fca503b9c1 - branch: develop + revision: cd477a1c71d8c2b2c26c3ea92c9457643a9cc70a + branch: master specs: goo (0.0.2) addressable (~> 2.8) @@ -15,8 +15,8 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 - branch: develop + revision: ac11b22596b87a6eecbcd70787b2370c18ff4770 + branch: master specs: ncbo_annotator (0.0.1) goo @@ -26,8 +26,8 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 - branch: develop + revision: 89474face62004ab70430ef718556fe50720e038 + branch: master specs: ontologies_linked_data (0.0.1) activesupport @@ -46,7 +46,7 @@ GIT GIT remote: https://github.com/ncbo/sparql-client.git - revision: fb4a89b420f8eb6dda5190a126b6c62e32c4c0c9 + revision: d418d56a6c9ff5692f925b45739a2a1c66bca851 branch: master specs: sparql-client (1.0.1) @@ -264,4 +264,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.4.17 + 2.3.15 From 020ca5d47d76c286da6c24a3fd7a8532dd757e78 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Wed, 13 Sep 2023 09:09:01 -0700 Subject: [PATCH 44/62] Gemfile.lock update --- Gemfile.lock | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 92164456..1413c3bf 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 
911d71aefe433314d11398445e3856fca503b9c1 + revision: 83425ba6c05d051d86c6f5775540727ce4238443 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 + revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 + revision: 711ebf27722355a25a0e53a8204b4ae472902bd0 branch: develop specs: ontologies_linked_data (0.0.1) @@ -76,6 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) + base64 (0.1.1) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -90,7 +91,8 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.10) + faraday (2.7.11) + base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) faraday-net_http (3.0.2) @@ -108,8 +110,8 @@ GEM google-analytics-data (0.4.0) google-analytics-data-v1beta (>= 0.7, < 2.a) google-cloud-core (~> 1.6) - google-analytics-data-v1beta (0.8.1) - gapic-common (>= 0.19.1, < 2.a) + google-analytics-data-v1beta (0.9.0) + gapic-common (>= 0.20.0, < 2.a) google-cloud-errors (~> 1.0) google-cloud-core (1.6.0) google-cloud-env (~> 1.0) @@ -126,20 +128,19 @@ GEM grpc (~> 1.27) googleapis-common-protos-types (1.8.0) google-protobuf (~> 3.18) - googleauth (1.7.0) + googleauth (1.8.0) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) - memoist (~> 0.16) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.57.0) + grpc (1.58.0) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.57.0-x86_64-darwin) + grpc (1.58.0-x86_64-darwin) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) - grpc (1.57.0-x86_64-linux) + grpc (1.58.0-x86_64-linux) google-protobuf (~> 3.23) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) @@ -159,7 +160,6 @@ GEM 
systemu (~> 2.6.5) mail (2.6.6) mime-types (>= 1.16, < 4) - memoist (0.16.2) method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) From ac798aaf30ccac41860edf0df45e9caedaabbdcb Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Wed, 13 Sep 2023 11:18:55 -0700 Subject: [PATCH 45/62] use assert_operator instead of assert minitest style guide adherence. encountered an intermittent unit test failure so assert_operator will provide better failure feedback than assert --- test/test_scheduler.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_scheduler.rb b/test/test_scheduler.rb index bac2f842..58808ea5 100644 --- a/test/test_scheduler.rb +++ b/test/test_scheduler.rb @@ -39,7 +39,7 @@ def test_scheduler sleep(5) finished_array = listen_string.split("\n") - assert finished_array.length >= 4 + assert_operator 4, :<=, finished_array.length assert job1_thread.alive? job1_thread.kill From c626d5c6e6d74da9a2c243ea8c94b51e34d60579 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Tue, 10 Oct 2023 20:09:53 +0200 Subject: [PATCH 46/62] use local solr to pass the tests --- .github/workflows/ruby-unit-tests.yml | 2 +- config/solr/property_search/enumsconfig.xml | 12 + .../mapping-ISOLatin1Accent.txt | 246 +++ config/solr/property_search/schema.xml | 1179 ++++++++++++ config/solr/property_search/solrconfig.xml | 1299 +++++++++++++ config/solr/solr.xml | 60 + config/solr/term_search/enumsconfig.xml | 12 + .../term_search/mapping-ISOLatin1Accent.txt | 246 +++ config/solr/term_search/schema.xml | 1222 +++++++++++++ config/solr/term_search/solrconfig.xml | 1299 +++++++++++++ docker-compose.yml | 30 +- test/solr/README | 5 + .../_default/conf/lang/contractions_ca.txt | 8 + .../_default/conf/lang/contractions_fr.txt | 15 + .../_default/conf/lang/contractions_ga.txt | 5 + .../_default/conf/lang/contractions_it.txt | 23 + .../_default/conf/lang/hyphenations_ga.txt | 5 + .../_default/conf/lang/stemdict_nl.txt | 6 + 
.../_default/conf/lang/stoptags_ja.txt | 420 +++++ .../_default/conf/lang/stopwords_ar.txt | 125 ++ .../_default/conf/lang/stopwords_bg.txt | 193 ++ .../_default/conf/lang/stopwords_ca.txt | 220 +++ .../_default/conf/lang/stopwords_cz.txt | 172 ++ .../_default/conf/lang/stopwords_da.txt | 110 ++ .../_default/conf/lang/stopwords_de.txt | 294 +++ .../_default/conf/lang/stopwords_el.txt | 78 + .../_default/conf/lang/stopwords_en.txt | 54 + .../_default/conf/lang/stopwords_es.txt | 356 ++++ .../_default/conf/lang/stopwords_et.txt | 1603 +++++++++++++++++ .../_default/conf/lang/stopwords_eu.txt | 99 + .../_default/conf/lang/stopwords_fa.txt | 313 ++++ .../_default/conf/lang/stopwords_fi.txt | 97 + .../_default/conf/lang/stopwords_fr.txt | 186 ++ .../_default/conf/lang/stopwords_ga.txt | 110 ++ .../_default/conf/lang/stopwords_gl.txt | 161 ++ .../_default/conf/lang/stopwords_hi.txt | 235 +++ .../_default/conf/lang/stopwords_hu.txt | 211 +++ .../_default/conf/lang/stopwords_hy.txt | 46 + .../_default/conf/lang/stopwords_id.txt | 359 ++++ .../_default/conf/lang/stopwords_it.txt | 303 ++++ .../_default/conf/lang/stopwords_ja.txt | 127 ++ .../_default/conf/lang/stopwords_lv.txt | 172 ++ .../_default/conf/lang/stopwords_nl.txt | 119 ++ .../_default/conf/lang/stopwords_no.txt | 194 ++ .../_default/conf/lang/stopwords_pt.txt | 253 +++ .../_default/conf/lang/stopwords_ro.txt | 233 +++ .../_default/conf/lang/stopwords_ru.txt | 243 +++ .../_default/conf/lang/stopwords_sv.txt | 133 ++ .../_default/conf/lang/stopwords_th.txt | 119 ++ .../_default/conf/lang/stopwords_tr.txt | 212 +++ .../_default/conf/lang/userdict_ja.txt | 29 + .../configsets/_default/conf/managed-schema | 1031 +++++++++++ .../configsets/_default/conf/protwords.txt | 21 + .../configsets/_default/conf/solrconfig.xml | 1295 +++++++++++++ .../configsets/_default/conf/stopwords.txt | 14 + .../configsets/_default/conf/synonyms.txt | 29 + .../property_search/conf/enumsconfig.xml | 12 + .../conf/lang/contractions_ca.txt | 
8 + .../conf/lang/contractions_fr.txt | 15 + .../conf/lang/contractions_ga.txt | 5 + .../conf/lang/contractions_it.txt | 23 + .../conf/lang/hyphenations_ga.txt | 5 + .../property_search/conf/lang/stemdict_nl.txt | 6 + .../property_search/conf/lang/stoptags_ja.txt | 420 +++++ .../conf/lang/stopwords_ar.txt | 125 ++ .../conf/lang/stopwords_bg.txt | 193 ++ .../conf/lang/stopwords_ca.txt | 220 +++ .../conf/lang/stopwords_cz.txt | 172 ++ .../conf/lang/stopwords_da.txt | 110 ++ .../conf/lang/stopwords_de.txt | 294 +++ .../conf/lang/stopwords_el.txt | 78 + .../conf/lang/stopwords_en.txt | 54 + .../conf/lang/stopwords_es.txt | 356 ++++ .../conf/lang/stopwords_et.txt | 1603 +++++++++++++++++ .../conf/lang/stopwords_eu.txt | 99 + .../conf/lang/stopwords_fa.txt | 313 ++++ .../conf/lang/stopwords_fi.txt | 97 + .../conf/lang/stopwords_fr.txt | 186 ++ .../conf/lang/stopwords_ga.txt | 110 ++ .../conf/lang/stopwords_gl.txt | 161 ++ .../conf/lang/stopwords_hi.txt | 235 +++ .../conf/lang/stopwords_hu.txt | 211 +++ .../conf/lang/stopwords_hy.txt | 46 + .../conf/lang/stopwords_id.txt | 359 ++++ .../conf/lang/stopwords_it.txt | 303 ++++ .../conf/lang/stopwords_ja.txt | 127 ++ .../conf/lang/stopwords_lv.txt | 172 ++ .../conf/lang/stopwords_nl.txt | 119 ++ .../conf/lang/stopwords_no.txt | 194 ++ .../conf/lang/stopwords_pt.txt | 253 +++ .../conf/lang/stopwords_ro.txt | 233 +++ .../conf/lang/stopwords_ru.txt | 243 +++ .../conf/lang/stopwords_sv.txt | 133 ++ .../conf/lang/stopwords_th.txt | 119 ++ .../conf/lang/stopwords_tr.txt | 212 +++ .../property_search/conf/lang/userdict_ja.txt | 29 + .../property_search/conf/managed-schema | 1031 +++++++++++ .../conf/mapping-ISOLatin1Accent.txt | 246 +++ .../property_search/conf/protwords.txt | 21 + .../property_search/conf/schema.xml | 1179 ++++++++++++ .../property_search/conf/solrconfig.xml | 1299 +++++++++++++ .../property_search/conf/stopwords.txt | 14 + .../property_search/conf/synonyms.txt | 29 + .../term_search/conf/enumsconfig.xml | 12 + 
.../term_search/conf/lang/contractions_ca.txt | 8 + .../term_search/conf/lang/contractions_fr.txt | 15 + .../term_search/conf/lang/contractions_ga.txt | 5 + .../term_search/conf/lang/contractions_it.txt | 23 + .../term_search/conf/lang/hyphenations_ga.txt | 5 + .../term_search/conf/lang/stemdict_nl.txt | 6 + .../term_search/conf/lang/stoptags_ja.txt | 420 +++++ .../term_search/conf/lang/stopwords_ar.txt | 125 ++ .../term_search/conf/lang/stopwords_bg.txt | 193 ++ .../term_search/conf/lang/stopwords_ca.txt | 220 +++ .../term_search/conf/lang/stopwords_cz.txt | 172 ++ .../term_search/conf/lang/stopwords_da.txt | 110 ++ .../term_search/conf/lang/stopwords_de.txt | 294 +++ .../term_search/conf/lang/stopwords_el.txt | 78 + .../term_search/conf/lang/stopwords_en.txt | 54 + .../term_search/conf/lang/stopwords_es.txt | 356 ++++ .../term_search/conf/lang/stopwords_et.txt | 1603 +++++++++++++++++ .../term_search/conf/lang/stopwords_eu.txt | 99 + .../term_search/conf/lang/stopwords_fa.txt | 313 ++++ .../term_search/conf/lang/stopwords_fi.txt | 97 + .../term_search/conf/lang/stopwords_fr.txt | 186 ++ .../term_search/conf/lang/stopwords_ga.txt | 110 ++ .../term_search/conf/lang/stopwords_gl.txt | 161 ++ .../term_search/conf/lang/stopwords_hi.txt | 235 +++ .../term_search/conf/lang/stopwords_hu.txt | 211 +++ .../term_search/conf/lang/stopwords_hy.txt | 46 + .../term_search/conf/lang/stopwords_id.txt | 359 ++++ .../term_search/conf/lang/stopwords_it.txt | 303 ++++ .../term_search/conf/lang/stopwords_ja.txt | 127 ++ .../term_search/conf/lang/stopwords_lv.txt | 172 ++ .../term_search/conf/lang/stopwords_nl.txt | 119 ++ .../term_search/conf/lang/stopwords_no.txt | 194 ++ .../term_search/conf/lang/stopwords_pt.txt | 253 +++ .../term_search/conf/lang/stopwords_ro.txt | 233 +++ .../term_search/conf/lang/stopwords_ru.txt | 243 +++ .../term_search/conf/lang/stopwords_sv.txt | 133 ++ .../term_search/conf/lang/stopwords_th.txt | 119 ++ .../term_search/conf/lang/stopwords_tr.txt | 212 +++ 
.../term_search/conf/lang/userdict_ja.txt | 29 + .../term_search/conf/managed-schema | 1031 +++++++++++ .../conf/mapping-ISOLatin1Accent.txt | 246 +++ .../configsets/term_search/conf/protwords.txt | 21 + .../configsets/term_search/conf/schema.xml | 1222 +++++++++++++ .../term_search/conf/solrconfig.xml | 1299 +++++++++++++ .../configsets/term_search/conf/stopwords.txt | 14 + .../configsets/term_search/conf/synonyms.txt | 29 + test/solr/docker-compose.yml | 13 + test/solr/generate_ncbo_configsets.sh | 24 + 152 files changed, 38657 insertions(+), 10 deletions(-) create mode 100644 config/solr/property_search/enumsconfig.xml create mode 100644 config/solr/property_search/mapping-ISOLatin1Accent.txt create mode 100644 config/solr/property_search/schema.xml create mode 100644 config/solr/property_search/solrconfig.xml create mode 100644 config/solr/solr.xml create mode 100644 config/solr/term_search/enumsconfig.xml create mode 100644 config/solr/term_search/mapping-ISOLatin1Accent.txt create mode 100644 config/solr/term_search/schema.xml create mode 100644 config/solr/term_search/solrconfig.xml create mode 100644 test/solr/README create mode 100644 test/solr/configsets/_default/conf/lang/contractions_ca.txt create mode 100644 test/solr/configsets/_default/conf/lang/contractions_fr.txt create mode 100644 test/solr/configsets/_default/conf/lang/contractions_ga.txt create mode 100644 test/solr/configsets/_default/conf/lang/contractions_it.txt create mode 100644 test/solr/configsets/_default/conf/lang/hyphenations_ga.txt create mode 100644 test/solr/configsets/_default/conf/lang/stemdict_nl.txt create mode 100644 test/solr/configsets/_default/conf/lang/stoptags_ja.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ar.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_bg.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ca.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_cz.txt create 
mode 100644 test/solr/configsets/_default/conf/lang/stopwords_da.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_de.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_el.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_en.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_es.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_et.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_eu.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_fa.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_fi.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_fr.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ga.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_gl.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_hi.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_hu.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_hy.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_id.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_it.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ja.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_lv.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_nl.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_no.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_pt.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ro.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_ru.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_sv.txt create mode 100644 test/solr/configsets/_default/conf/lang/stopwords_th.txt create mode 100644 
test/solr/configsets/_default/conf/lang/stopwords_tr.txt create mode 100644 test/solr/configsets/_default/conf/lang/userdict_ja.txt create mode 100644 test/solr/configsets/_default/conf/managed-schema create mode 100644 test/solr/configsets/_default/conf/protwords.txt create mode 100644 test/solr/configsets/_default/conf/solrconfig.xml create mode 100644 test/solr/configsets/_default/conf/stopwords.txt create mode 100644 test/solr/configsets/_default/conf/synonyms.txt create mode 100644 test/solr/configsets/property_search/conf/enumsconfig.xml create mode 100644 test/solr/configsets/property_search/conf/lang/contractions_ca.txt create mode 100644 test/solr/configsets/property_search/conf/lang/contractions_fr.txt create mode 100644 test/solr/configsets/property_search/conf/lang/contractions_ga.txt create mode 100644 test/solr/configsets/property_search/conf/lang/contractions_it.txt create mode 100644 test/solr/configsets/property_search/conf/lang/hyphenations_ga.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stemdict_nl.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stoptags_ja.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ar.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_bg.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ca.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_cz.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_da.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_de.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_el.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_en.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_es.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_et.txt create mode 100644 
test/solr/configsets/property_search/conf/lang/stopwords_eu.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_fa.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_fi.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_fr.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ga.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_gl.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_hi.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_hu.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_hy.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_id.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_it.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ja.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_lv.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_nl.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_no.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_pt.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ro.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_ru.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_sv.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_th.txt create mode 100644 test/solr/configsets/property_search/conf/lang/stopwords_tr.txt create mode 100644 test/solr/configsets/property_search/conf/lang/userdict_ja.txt create mode 100644 test/solr/configsets/property_search/conf/managed-schema create mode 100644 test/solr/configsets/property_search/conf/mapping-ISOLatin1Accent.txt create mode 100644 
test/solr/configsets/property_search/conf/protwords.txt create mode 100644 test/solr/configsets/property_search/conf/schema.xml create mode 100644 test/solr/configsets/property_search/conf/solrconfig.xml create mode 100644 test/solr/configsets/property_search/conf/stopwords.txt create mode 100644 test/solr/configsets/property_search/conf/synonyms.txt create mode 100644 test/solr/configsets/term_search/conf/enumsconfig.xml create mode 100644 test/solr/configsets/term_search/conf/lang/contractions_ca.txt create mode 100644 test/solr/configsets/term_search/conf/lang/contractions_fr.txt create mode 100644 test/solr/configsets/term_search/conf/lang/contractions_ga.txt create mode 100644 test/solr/configsets/term_search/conf/lang/contractions_it.txt create mode 100644 test/solr/configsets/term_search/conf/lang/hyphenations_ga.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stemdict_nl.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stoptags_ja.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ar.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_bg.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ca.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_cz.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_da.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_de.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_el.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_en.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_es.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_et.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_eu.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_fa.txt create mode 100644 
test/solr/configsets/term_search/conf/lang/stopwords_fi.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_fr.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ga.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_gl.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_hi.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_hu.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_hy.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_id.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_it.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ja.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_lv.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_nl.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_no.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_pt.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ro.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_ru.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_sv.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_th.txt create mode 100644 test/solr/configsets/term_search/conf/lang/stopwords_tr.txt create mode 100644 test/solr/configsets/term_search/conf/lang/userdict_ja.txt create mode 100644 test/solr/configsets/term_search/conf/managed-schema create mode 100644 test/solr/configsets/term_search/conf/mapping-ISOLatin1Accent.txt create mode 100644 test/solr/configsets/term_search/conf/protwords.txt create mode 100644 test/solr/configsets/term_search/conf/schema.xml create mode 100644 test/solr/configsets/term_search/conf/solrconfig.xml create mode 100644 test/solr/configsets/term_search/conf/stopwords.txt 
create mode 100644 test/solr/configsets/term_search/conf/synonyms.txt create mode 100644 test/solr/docker-compose.yml create mode 100755 test/solr/generate_ncbo_configsets.sh diff --git a/.github/workflows/ruby-unit-tests.yml b/.github/workflows/ruby-unit-tests.yml index b61ce745..ce431e48 100644 --- a/.github/workflows/ruby-unit-tests.yml +++ b/.github/workflows/ruby-unit-tests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - backend: ['ncbo_cron', 'ncbo_cron-agraph'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend + backend: ['ncbo_cron'] # ruby runs tests with 4store backend and ruby-agraph runs with AllegroGraph backend runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 diff --git a/config/solr/property_search/enumsconfig.xml b/config/solr/property_search/enumsconfig.xml new file mode 100644 index 00000000..72e7b7d3 --- /dev/null +++ b/config/solr/property_search/enumsconfig.xml @@ -0,0 +1,12 @@ + + + + ONTOLOGY + VALUE_SET_COLLECTION + + + ANNOTATION + DATATYPE + OBJECT + + \ No newline at end of file diff --git a/config/solr/property_search/mapping-ISOLatin1Accent.txt b/config/solr/property_search/mapping-ISOLatin1Accent.txt new file mode 100644 index 00000000..ede77425 --- /dev/null +++ b/config/solr/property_search/mapping-ISOLatin1Accent.txt @@ -0,0 +1,246 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) + +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# À => A +"\u00C0" => "A" + +# Á => A +"\u00C1" => "A" + +#  => A +"\u00C2" => "A" + +# à => A +"\u00C3" => "A" + +# Ä => A +"\u00C4" => "A" + +# Å => A +"\u00C5" => "A" + +# Æ => AE +"\u00C6" => "AE" + +# Ç => C +"\u00C7" => "C" + +# È => E +"\u00C8" => "E" + +# É => E +"\u00C9" => "E" + +# Ê => E +"\u00CA" => "E" + +# Ë => E +"\u00CB" => "E" + +# Ì => I +"\u00CC" => "I" + +# Í => I +"\u00CD" => "I" + +# Î => I +"\u00CE" => "I" + +# Ï => I +"\u00CF" => "I" + +# IJ => IJ +"\u0132" => "IJ" + +# Ð => D +"\u00D0" => "D" + +# Ñ => N +"\u00D1" => "N" + +# Ò => O +"\u00D2" => "O" + +# Ó => O +"\u00D3" => "O" + +# Ô => O +"\u00D4" => "O" + +# Õ => O +"\u00D5" => "O" + +# Ö => O +"\u00D6" => "O" + +# Ø => O +"\u00D8" => "O" + +# Œ => OE +"\u0152" => "OE" + +# Þ +"\u00DE" => "TH" + +# Ù => U +"\u00D9" => "U" + +# Ú => U +"\u00DA" => "U" + +# Û => U +"\u00DB" => "U" + +# Ü => U +"\u00DC" => "U" + +# Ý => Y +"\u00DD" => "Y" + +# Ÿ => Y +"\u0178" => "Y" + +# à => a +"\u00E0" => "a" + +# á => a +"\u00E1" => "a" + +# â => a +"\u00E2" => "a" + +# ã => a +"\u00E3" => "a" + +# ä => a +"\u00E4" => "a" + +# å => a +"\u00E5" => "a" + +# æ => ae +"\u00E6" => "ae" + +# ç => c +"\u00E7" => "c" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e" + +# ê => e +"\u00EA" => "e" + +# ë => e +"\u00EB" => "e" + +# ì => i +"\u00EC" => "i" + +# í => i +"\u00ED" => "i" + +# î => i +"\u00EE" => "i" + +# ï => i +"\u00EF" => "i" + +# ij => ij +"\u0133" => "ij" + +# ð => d +"\u00F0" => "d" + +# ñ => n +"\u00F1" => "n" + +# ò => o +"\u00F2" => "o" + +# ó => o +"\u00F3" => "o" + +# ô => o +"\u00F4" => "o" + +# õ => o +"\u00F5" => "o" + +# ö => o +"\u00F6" => "o" + +# ø => o +"\u00F8" => "o" + +# œ => oe +"\u0153" => "oe" + +# ß => ss +"\u00DF" => 
"ss" + +# þ => th +"\u00FE" => "th" + +# ù => u +"\u00F9" => "u" + +# ú => u +"\u00FA" => "u" + +# û => u +"\u00FB" => "u" + +# ü => u +"\u00FC" => "u" + +# ý => y +"\u00FD" => "y" + +# ÿ => y +"\u00FF" => "y" + +# ff => ff +"\uFB00" => "ff" + +# fi => fi +"\uFB01" => "fi" + +# fl => fl +"\uFB02" => "fl" + +# ffi => ffi +"\uFB03" => "ffi" + +# ffl => ffl +"\uFB04" => "ffl" + +# ſt => ft +"\uFB05" => "ft" + +# st => st +"\uFB06" => "st" diff --git a/config/solr/property_search/schema.xml b/config/solr/property_search/schema.xml new file mode 100644 index 00000000..20824ea6 --- /dev/null +++ b/config/solr/property_search/schema.xml @@ -0,0 +1,1179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/config/solr/property_search/solrconfig.xml b/config/solr/property_search/solrconfig.xml new file mode 100644 index 00000000..771a0f32 --- /dev/null +++ b/config/solr/property_search/solrconfig.xml @@ -0,0 +1,1299 @@ + + + + + + + + + 8.8.2 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:500000} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + + + + + + + + + + explicit + json + true + + + + + + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + true + false + + + terms + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] 
+ _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + + + + + + diff --git a/config/solr/solr.xml b/config/solr/solr.xml new file mode 100644 index 00000000..d9d089e4 --- /dev/null +++ b/config/solr/solr.xml @@ -0,0 +1,60 @@ + + + + + + + + ${solr.max.booleanClauses:500000} + ${solr.sharedLib:} + ${solr.allowPaths:} + + + + ${host:} + ${solr.port.advertise:0} + ${hostContext:solr} + + ${genericCoreNodeNames:true} + + ${zkClientTimeout:30000} + ${distribUpdateSoTimeout:600000} + ${distribUpdateConnTimeout:60000} + ${zkCredentialsProvider:org.apache.solr.common.cloud.DefaultZkCredentialsProvider} + ${zkACLProvider:org.apache.solr.common.cloud.DefaultZkACLProvider} + + + + + ${socketTimeout:600000} + ${connTimeout:60000} + ${solr.shardsWhitelist:} + + + + + diff --git a/config/solr/term_search/enumsconfig.xml b/config/solr/term_search/enumsconfig.xml new file mode 100644 index 00000000..72e7b7d3 --- /dev/null +++ b/config/solr/term_search/enumsconfig.xml @@ -0,0 +1,12 @@ + + + + ONTOLOGY + VALUE_SET_COLLECTION + + + ANNOTATION + DATATYPE + OBJECT + + \ No newline at end of file diff --git a/config/solr/term_search/mapping-ISOLatin1Accent.txt b/config/solr/term_search/mapping-ISOLatin1Accent.txt new file mode 100644 index 00000000..ede77425 --- /dev/null +++ b/config/solr/term_search/mapping-ISOLatin1Accent.txt @@ -0,0 +1,246 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the 
License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) + +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# À => A +"\u00C0" => "A" + +# Á => A +"\u00C1" => "A" + +#  => A +"\u00C2" => "A" + +# à => A +"\u00C3" => "A" + +# Ä => A +"\u00C4" => "A" + +# Å => A +"\u00C5" => "A" + +# Æ => AE +"\u00C6" => "AE" + +# Ç => C +"\u00C7" => "C" + +# È => E +"\u00C8" => "E" + +# É => E +"\u00C9" => "E" + +# Ê => E +"\u00CA" => "E" + +# Ë => E +"\u00CB" => "E" + +# Ì => I +"\u00CC" => "I" + +# Í => I +"\u00CD" => "I" + +# Î => I +"\u00CE" => "I" + +# Ï => I +"\u00CF" => "I" + +# IJ => IJ +"\u0132" => "IJ" + +# Ð => D +"\u00D0" => "D" + +# Ñ => N +"\u00D1" => "N" + +# Ò => O +"\u00D2" => "O" + +# Ó => O +"\u00D3" => "O" + +# Ô => O +"\u00D4" => "O" + +# Õ => O +"\u00D5" => "O" + +# Ö => O +"\u00D6" => "O" + +# Ø => O +"\u00D8" => "O" + +# Œ => OE +"\u0152" => "OE" + +# Þ +"\u00DE" => "TH" + +# Ù => U +"\u00D9" => "U" + +# Ú => U +"\u00DA" => "U" + +# Û => U +"\u00DB" => "U" + +# Ü => U +"\u00DC" => "U" + +# Ý => Y +"\u00DD" => "Y" + +# Ÿ => Y +"\u0178" => "Y" + +# à => a +"\u00E0" => "a" + +# á => a +"\u00E1" => "a" + +# â => a +"\u00E2" => "a" + +# ã => a +"\u00E3" => "a" + +# ä => a +"\u00E4" => "a" + +# å => a +"\u00E5" => "a" + +# æ => ae +"\u00E6" => "ae" + +# ç => c +"\u00E7" => "c" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e" + +# ê => e +"\u00EA" => "e" + +# ë => e +"\u00EB" => "e" + +# ì => i 
+"\u00EC" => "i" + +# í => i +"\u00ED" => "i" + +# î => i +"\u00EE" => "i" + +# ï => i +"\u00EF" => "i" + +# ij => ij +"\u0133" => "ij" + +# ð => d +"\u00F0" => "d" + +# ñ => n +"\u00F1" => "n" + +# ò => o +"\u00F2" => "o" + +# ó => o +"\u00F3" => "o" + +# ô => o +"\u00F4" => "o" + +# õ => o +"\u00F5" => "o" + +# ö => o +"\u00F6" => "o" + +# ø => o +"\u00F8" => "o" + +# œ => oe +"\u0153" => "oe" + +# ß => ss +"\u00DF" => "ss" + +# þ => th +"\u00FE" => "th" + +# ù => u +"\u00F9" => "u" + +# ú => u +"\u00FA" => "u" + +# û => u +"\u00FB" => "u" + +# ü => u +"\u00FC" => "u" + +# ý => y +"\u00FD" => "y" + +# ÿ => y +"\u00FF" => "y" + +# ff => ff +"\uFB00" => "ff" + +# fi => fi +"\uFB01" => "fi" + +# fl => fl +"\uFB02" => "fl" + +# ffi => ffi +"\uFB03" => "ffi" + +# ffl => ffl +"\uFB04" => "ffl" + +# ſt => ft +"\uFB05" => "ft" + +# st => st +"\uFB06" => "st" diff --git a/config/solr/term_search/schema.xml b/config/solr/term_search/schema.xml new file mode 100644 index 00000000..fa95e127 --- /dev/null +++ b/config/solr/term_search/schema.xml @@ -0,0 +1,1222 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/config/solr/term_search/solrconfig.xml b/config/solr/term_search/solrconfig.xml new file mode 100644 index 00000000..771a0f32 --- /dev/null +++ b/config/solr/term_search/solrconfig.xml @@ -0,0 +1,1299 @@ + + + + + + + + + 8.8.2 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:500000} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + + + + + + + + + + explicit + json + true + + + + + + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + 
spellcheck + + + + + + + + + + true + false + + + terms + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] + _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + + + + + + diff --git a/docker-compose.yml b/docker-compose.yml index 0045ce12..7d4a4a56 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -58,6 +58,7 @@ services: 4store-ut: condition: service_started + ncbo_cron-agraph: <<: *app environment: @@ -75,8 +76,11 @@ services: agraph-ut: condition: service_healthy + redis-ut: image: redis + ports: + - 6379:6379 healthcheck: test: redis-cli ping interval: 10s @@ -95,9 +99,17 @@ services: - 4store solr-ut: - image: ontoportal/solr-ut:0.0.2 + image: solr:8 + volumes: + - ./test/solr/configsets:/configsets:ro + ports: + - "8983:8983" + command: > + bash -c "precreate-core term_search_core1 /configsets/term_search + && precreate-core prop_search_core1 /configsets/property_search + && solr-foreground" healthcheck: - test: ["CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq '\"status\":\"OK\"}' || exit 1"] + test: [ "CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq '\"status\":\"OK\"}' || exit 1" ] start_period: 3s interval: 10s timeout: 5s @@ -107,7 +119,7 @@ 
services: image: ontoportal/mgrep:0.0.1 platform: linux/amd64 healthcheck: - test: ["CMD", "nc", "-z", "-v", "localhost", "55555"] + test: [ "CMD", "nc", "-z", "-v", "localhost", "55555" ] start_period: 3s interval: 10s timeout: 5s @@ -124,13 +136,13 @@ services: # ports: # - 10035:10035 command: > - bash -c "/agraph/bin/agraph-control --config /agraph/etc/agraph.cfg start - ; agtool repos create bioportal_test - ; agtool users add anonymous - ; agtool users grant anonymous root:bioportal_test:rw - ; tail -f /agraph/data/agraph.log" + bash -c "/agraph/bin/agraph-control --config /agraph/etc/agraph.cfg start + ; agtool repos create bioportal_test + ; agtool users add anonymous + ; agtool users grant anonymous root:bioportal_test:rw + ; tail -f /agraph/data/agraph.log" healthcheck: - test: ["CMD-SHELL", "agtool storage-report bioportal_test || exit 1"] + test: [ "CMD-SHELL", "agtool storage-report bioportal_test || exit 1" ] start_period: 10s interval: 60s timeout: 5s diff --git a/test/solr/README b/test/solr/README new file mode 100644 index 00000000..b73bea53 --- /dev/null +++ b/test/solr/README @@ -0,0 +1,5 @@ +Solr configsets consist of solr config files from config/solr merged with the _default configsets +provided by the solr distribution. +These solr configsets are used for starting up docker containers for unit testing and are in place +until the solr Schema API and Config API are adopted for configuring solr.
+ diff --git a/test/solr/configsets/_default/conf/lang/contractions_ca.txt b/test/solr/configsets/_default/conf/lang/contractions_ca.txt new file mode 100644 index 00000000..307a85f9 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/test/solr/configsets/_default/conf/lang/contractions_fr.txt b/test/solr/configsets/_default/conf/lang/contractions_fr.txt new file mode 100644 index 00000000..f1bba51b --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/test/solr/configsets/_default/conf/lang/contractions_ga.txt b/test/solr/configsets/_default/conf/lang/contractions_ga.txt new file mode 100644 index 00000000..9ebe7fa3 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/test/solr/configsets/_default/conf/lang/contractions_it.txt b/test/solr/configsets/_default/conf/lang/contractions_it.txt new file mode 100644 index 00000000..cac04095 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/test/solr/configsets/_default/conf/lang/hyphenations_ga.txt b/test/solr/configsets/_default/conf/lang/hyphenations_ga.txt new file mode 100644 index 00000000..4d2642cc --- /dev/null +++ 
b/test/solr/configsets/_default/conf/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git a/test/solr/configsets/_default/conf/lang/stemdict_nl.txt b/test/solr/configsets/_default/conf/lang/stemdict_nl.txt new file mode 100644 index 00000000..44107297 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/test/solr/configsets/_default/conf/lang/stoptags_ja.txt b/test/solr/configsets/_default/conf/lang/stoptags_ja.txt new file mode 100644 index 00000000..71b75084 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. 
お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. 
あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 
町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ar.txt b/test/solr/configsets/_default/conf/lang/stopwords_ar.txt new file mode 100644 index 00000000..046829db --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/test/solr/configsets/_default/conf/lang/stopwords_bg.txt b/test/solr/configsets/_default/conf/lang/stopwords_bg.txt new file mode 100644 index 00000000..1ae4ba2a --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ca.txt b/test/solr/configsets/_default/conf/lang/stopwords_ca.txt new file mode 100644 index 00000000..3da65dea --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren +éreu +es 
+és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/test/solr/configsets/_default/conf/lang/stopwords_cz.txt b/test/solr/configsets/_default/conf/lang/stopwords_cz.txt new file mode 100644 index 00000000..53c6097d --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze +do +pro +je +na 
+atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož +jíž +jelikož +jež +jakož +načež diff --git a/test/solr/configsets/_default/conf/lang/stopwords_da.txt b/test/solr/configsets/_default/conf/lang/stopwords_da.txt new file mode 100644 index 00000000..42e6145b --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/test/solr/configsets/_default/conf/lang/stopwords_de.txt b/test/solr/configsets/_default/conf/lang/stopwords_de.txt new file mode 100644 index 00000000..86525e7a --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_el.txt b/test/solr/configsets/_default/conf/lang/stopwords_el.txt new file mode 100644 index 00000000..232681f5 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek Stopwords 
list +# Note: by default this file is used after GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/test/solr/configsets/_default/conf/lang/stopwords_en.txt b/test/solr/configsets/_default/conf/lang/stopwords_en.txt new file mode 100644 index 00000000..2c164c0b --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/test/solr/configsets/_default/conf/lang/stopwords_es.txt b/test/solr/configsets/_default/conf/lang/stopwords_es.txt new file mode 100644 index 00000000..487d78c8 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_et.txt b/test/solr/configsets/_default/conf/lang/stopwords_et.txt new file mode 100644 index 00000000..1b06a134 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_et.txt @@ -0,0 +1,1603 @@ +# Estonian stopwords list +all +alla +allapoole +allpool +alt +altpoolt +eel +eespool +enne +hommikupoole +hoolimata +ilma +kaudu +keset +kesk +kohe +koos +kuhupoole +kuni +kuspool +kustpoolt +kõige +käsikäes +lappi +ligi +läbi +mööda +paitsi +peale +pealepoole +pealpool +pealt +pealtpoolt +piki +pikku +piku +pikuti +põiki +pärast +päri +risti +sealpool +sealtpoolt +seespool +seltsis +siiapoole +siinpool +siitpoolt +sinnapoole +sissepoole +taga +tagantpoolt +tagapidi +tagapool +taha +tahapoole +teispool +teispoole +tänu +tükkis +vaatamata +vastu +väljapoole +väljaspool +väljastpoolt +õhtupoole +ühes +ühestükis +ühestükkis +ülalpool +ülaltpoolt +üle +ülespoole +ülevalpool +ülevaltpoolt +ümber +ümbert +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde 
+juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +lool +läbi +lähedal +lähedale +lähedalt +man +mant +manu +meelest +mööda +nahas +nahka +nahkas +najal +najale +najalt +nõjal +nõjale +otsa +otsas +otsast +paigale +paigu +paiku +peal +peale +pealt +perra +perrä +pidi +pihta +piki +pikku +pool +poole +poolest +poolt +puhul +puksiiris +pähe +päralt +päras +pärast +päri +ringi +ringis +risust +saadetusel +saadik +saatel +saati +seas +seast +sees +seest +sekka +seljataga +seltsi +seltsis +seltsist +sisse +slepis +suhtes +šlepis +taga +tagant +tagantotsast +tagaotsas +tagaselja +tagasi +tagast +tagutsi +taha +tahaotsa +takka +tarvis +tasa +tuuri +tuuris +tõttu +tükkis +uhal +vaatamata +vahel +vahele +vahelt +vahepeal 
+vahepeale +vahepealt +vahetsi +varal +varale +varul +vastas +vastast +vastu +veerde +veeres +viisi +võidu +võrd +võrdki +võrra +võrragi +väel +väele +vältel +väärt +väärtki +äärde +ääre +ääres +äärest +ühes +üle +ümber +ümbert +a +abil +aina +ainult +alalt +alates +alati +alles +b +c +d +e +eales +ealeski +edasi +edaspidi +eelkõige +eemal +ei +eks +end +enda +enese +ennem +esialgu +f +g +h +hoopis +i +iganes +igatahes +igati +iial +iialgi +ikka +ikkagi +ilmaski +iseenda +iseenese +iseenesest +isegi +j +jah +ju +juba +juhul +just +järelikult +k +ka +kah +kas +kasvõi +keda +kestahes +kogu +koguni +kohati +kokku +kuhu +kuhugi +kuidagi +kuidas +kunagi +kus +kusagil +kusjuures +kuskil +kust +kõigepealt +küll +l +liiga +lisaks +m +miks +mil +millal +millalgi +mispärast +mistahes +mistõttu +mitte +muide +muidu +muidugi +muist +mujal +mujale +mujalt +mõlemad +mõnda +mõne +mõnikord +n +nii +niikaua +niimoodi +niipaljuke +niisama +niisiis +niivõrd +nõnda +nüüd +o +omaette +omakorda +omavahel +ometi +p +palju +paljuke +palju-palju +peaaegu +peagi +peamiselt +pigem +pisut +praegu +päris +r +rohkem +s +samas +samuti +seal +sealt +sedakorda +sedapuhku +seega +seejuures +seejärel +seekord +seepärast +seetõttu +sellepärast +seni +sestap +siia +siiani +siin +siinkohal +siis +siiski +siit +sinna +suht +š +z +ž +t +teel +teineteise +tõesti +täiesti +u +umbes +v +w +veel +veelgi +vist +võibolla +võib-olla +väga +vähemalt +välja +väljas +väljast +õ +ä +ära +ö +ü +ühtlasi +üksi +ükskõik +ülal +ülale +ülalt +üles +ülesse +üleval +ülevalt +ülimalt +üsna +x +y +aga +ega +ehk +ehkki +elik +ellik +enge +ennegu +ent +et +ja +justkui +kui +kuid +kuigi +kuivõrd +kuna +kuni +kut +mistab +muudkui +nagu +nigu +ning +olgugi +otsekui +otsenagu +selmet +sest +sestab +vaid +või +aa +adaa +adjöö +ae +ah +ahaa +ahah +ah-ah-ah +ah-haa +ahoi +ai +aidaa +aidu-raidu +aih +aijeh +aituma +aitäh +aitüma +ammuu +amps +ampsti +aptsih +ass +at +ata +at-at-at +atsih +atsihh +auh +bai-bai +bingo +braavo +brr +ee 
+eeh +eh +ehee +eheh +eh-eh-hee +eh-eh-ee +ehei +ehh +ehhee +einoh +ena +ennäe +ennäh +fuh +fui +fuih +haa +hah +hahaa +hah-hah-hah +halleluuja +hallo +halloo +hass +hee +heh +he-he-hee +hei +heldeke(ne) +heureka +hihii +hip-hip-hurraa +hmh +hmjah +hoh-hoh-hoo +hohoo +hoi +hollallaa +hoo +hoplaa +hopp +hops +hopsassaa +hopsti +hosianna +huh +huidii +huist +hurjah +hurjeh +hurjoh +hurjuh +hurraa +huu +hõhõh +hõi +hõissa +hõissassa +hõk +hõkk +häh +hä-hä-hää +hüvasti +ih-ah-haa +ih-ih-hii +ii-ha-ha +issake +issakene +isver +jaa-ah +ja-ah +jaah +janäe +jeeh +jeerum +jeever +jessas +jestas +juhhei +jumalaga +jumalime +jumaluke +jumalukene +jutas +kaaps +kaapsti +kaasike +kae +kalps +kalpsti +kannäe +kanäe +kappadi +kaps +kapsti +karkõmm +karkäuh +karkääks +karkääksti +karmauh +karmauhti +karnaps +karnapsti +karniuhti +karpartsaki +karpauh +karpauhti +karplauh +karplauhti +karprauh +karprauhti +karsumdi +karsumm +kartsumdi +kartsumm +karviuh +karviuhti +kaske +kassa +kauh +kauhti +keh +keksti +kepsti +khe +khm +kih +kiiks +kiiksti +kiis +kiiss +kikerii +kikerikii +kili +kilk +kilk-kõlk +kilks +kilks-kolks +kilks-kõlks +kill +killadi +killadi|-kolladi +killadi-kõlladi +killa-kolla +killa-kõlla +kill-kõll +kimps-komps +kipp +kips-kõps +kiriküüt +kirra-kõrra +kirr-kõrr +kirts +klaps +klapsti +klirdi +klirr +klonks +klops +klopsti +kluk +klu-kluu +klõks +klõksti +klõmdi +klõmm +klõmpsti +klõnks +klõnksti +klõps +klõpsti +kläu +kohva-kohva +kok +koks +koksti +kolaki +kolk +kolks +kolksti +koll +kolladi +komp +komps +kompsti +kop +kopp +koppadi +kops +kopsti +kossu +kotsu +kraa +kraak +kraaks +kraaps +kraapsti +krahh +kraks +kraksti +kraps +krapsti +krauh +krauhti +kriiks +kriiksti +kriips +kriips-kraaps +kripa-krõpa +krips-kraps +kriuh +kriuks +kriuksti +kromps +kronk +kronks +krooks +kruu +krõks +krõksti +krõpa +krõps +krõpsti +krõuh +kräu +kräuh +kräuhti +kräuks +kss +kukeleegu +kukku +kuku +kulu +kurluu +kurnäu +kuss +kussu +kõks +kõksti +kõldi +kõlks +kõlksti +kõll 
+kõmaki +kõmdi +kõmm +kõmps +kõpp +kõps +kõpsadi +kõpsat +kõpsti +kõrr +kõrra-kõrra +kõss +kõtt +kõõksti +kärr +kärts +kärtsti +käuks +käuksti +kääga +kääks +kääksti +köh +köki-möki +köksti +laks +laksti +lampsti +larts +lartsti +lats +latsti +leelo +legoo +lehva +liiri-lõõri +lika-lõka +likat-lõkat +limpsti +lips +lipsti +lirts +lirtsaki +lirtsti +lonksti +lops +lopsti +lorts +lortsti +luks +lups +lupsti +lurts +lurtsti +lõks +lõksti +lõmps +lõmpsti +lõnks +lõnksti +lärts +lärtsti +läts +lätsti +lörts +lörtsti +lötsti +lööps +lööpsti +marss +mats +matsti +mauh +mauhti +mh +mhh +mhmh +miau +mjaa +mkm +m-mh +mnjaa +mnjah +moens +mulks +mulksti +mull-mull +mull-mull-mull +muu +muuh +mõh +mõmm +mäh +mäts +mäu +mää +möh +möh-öh-ää +möö +müh-müh +mühüh +müks +müksti +müraki +mürr +mürts +mürtsaki +mürtsti +mütaku +müta-mäta +müta-müta +müt-müt +müt-müt-müt +müts +mütsti +mütt +naa +naah +nah +naks +naksti +nanuu +naps +napsti +nilpsti +nipsti +nirr +niuh +niuh-näuh +niuhti +noh +noksti +nolpsti +nonoh +nonoo +nonäh +noo +nooh +nooks +norr +nurr +nuuts +nõh +nõhh +nõka-nõka +nõks +nõksat-nõksat +nõks-nõks +nõksti +nõõ +nõõh +näeh +näh +nälpsti +nämm-nämm +näpsti +näts +nätsti +näu +näuh +näuhti +näuks +näuksti +nääh +nääks +nühkat-nühkat +oeh +oh +ohh +ohhh +oh-hoi +oh-hoo +ohoh +oh-oh-oo +oh-oh-hoo +ohoi +ohoo +oi +oih +oijee +oijeh +oo +ooh +oo-oh +oo-ohh +oot +ossa +ot +paa +pah +pahh +pakaa +pamm +pantsti +pardon +pardonks +parlartsti +parts +partsti +partsumdi +partsumm +pastoi +pats +patst +patsti +pau +pauh +pauhti +pele +pfui +phuh +phuuh +phäh +phähh +piiks +piip +piiri-pääri +pimm +pimm-pamm +pimm-pomm +pimm-põmm +piraki +piuks +piu-pau +plaks +plaksti +plarts +plartsti +plats +platsti +plauh +plauhh +plauhti +pliks +pliks-plaks +plinn +pliraki +plirts +plirtsti +pliu +pliuh +ploks +plotsti +plumps +plumpsti +plõks +plõksti +plõmdi +plõmm +plõnn +plärr +plärts +plärtsat +plärtsti +pläu +pläuh +plää +plörtsat +pomm +popp +pops +popsti +ports +pot +pots +potsti 
+pott +praks +praksti +prants +prantsaki +prantsti +prassai +prauh +prauhh +prauhti +priks +priuh +priuhh +priuh-prauh +proosit +proost +prr +prrr +prõks +prõksti +prõmdi +prõmm +prõntsti +prääk +prääks +pst +psst +ptrr +ptruu +ptüi +puh +puhh +puksti +pumm +pumps +pup-pup-pup +purts +puuh +põks +põksti +põmdi +põmm +põmmadi +põnks +põnn +põnnadi +põnt +põnts +põntsti +põraki +põrr +põrra-põrra +päh +pähh +päntsti +pää +pöörd +püh +raks +raksti +raps +rapsti +ratataa +rauh +riips +riipsti +riks +riks-raks +rips-raps +rivitult +robaki +rops +ropsaki +ropsti +ruik +räntsti +räts +röh +röhh +sah +sahh +sahkat +saps +sapsti +sauh +sauhti +servus +sihkadi-sahkadi +sihka-sahka +sihkat-sahkat +silks +silk-solk +sips +sipsti +sirr +sirr-sorr +sirts +sirtsti +siu +siuh +siuh-sauh +siuh-säuh +siuhti +siuks +siuts +skool +so +soh +solks +solksti +solpsti +soo +sooh +so-oh +soo-oh +sopp +sops +sopsti +sorr +sorts +sortsti +so-soo +soss +soss-soss +ss +sss +sst +stopp +suhkat-sahkat +sulk +sulks +sulksti +sull +sulla-sulla +sulpa-sulpa +sulps +sulpsti +sumaki +sumdi +summ +summat-summat +sups +supsaku +supsti +surts +surtsti +suss +susti +suts +sutsti +säh +sähke +särts +särtsti +säu +säuh +säuhti +taevake +taevakene +takk +tere +terekest +tibi-tibi +tikk-takk +tiks +tilk +tilks +till +tilla-talla +till-tall +tilulii +tinn +tip +tip-tap +tirr +tirtsti +tiu +tjaa +tjah +tohhoh +tohhoo +tohoh +tohoo +tok +tokk +toks +toksti +tonks +tonksti +tota +totsti +tot-tot +tprr +tpruu +trah +trahh +trallallaa +trill +trillallaa +trr +trrr +tsah +tsahh +tsilk +tsilk-tsolk +tsirr +tsiuh +tskae +tsolk +tss +tst +tsst +tsuhh +tsuk +tsumm +tsurr +tsäuh +tšao +tšš +tššš +tuk +tuks +turts +turtsti +tutki +tutkit +tutu-lutu +tutulutu +tuut +tuutu-luutu +tõks +tötsti +tümps +uh +uhh +uh-huu +uhtsa +uhtsaa +uhuh +uhuu +ui +uih +uih-aih +uijah +uijeh +uist +uit +uka +upsti +uraa +urjah +urjeh +urjoh +urjuh +urr +urraa +ust +utu +uu +uuh +vaak +vaat +vae +vaeh +vai +vat +vau +vhüüt +vidiit +viiks 
+vilks +vilksti +vinki-vinki +virdi +virr +viu +viudi +viuh +viuhti +voeh +voh +vohh +volks +volksti +vooh +vops +vopsti +vot +vuh +vuhti +vuih +vulks +vulksti +vull +vulpsti +vups +vupsaki +vupsaku +vupsti +vurdi +vurr +vurra-vurra +vurts +vurtsti +vutt +võe +võeh +või +võih +võrr +võts +võtt +vääks +õe +õits +õk +õkk +õrr +õss +õuh +äh +ähh +ähhähhää +äh-hää +äh-äh-hää +äiu +äiu-ää +äss +ää +ääh +äähh +öh +öhh +ök +üh +eelmine +eikeegi +eimiski +emb-kumb +enam +enim +iga +igasugune +igaüks +ise +isesugune +järgmine +keegi +kes +kumb +kumbki +kõik +meiesugune +meietaoline +midagi +mihuke +mihukene +milletaoline +milline +mina +minake +mingi +mingisugune +minusugune +minutaoline +mis +miski +miskisugune +missugune +misuke +mitmes +mitmesugune +mitu +mitu-mitu +mitu-setu +muu +mõlema +mõnesugune +mõni +mõningane +mõningas +mäherdune +määrane +naasugune +need +nemad +nendesugune +nendetaoline +nihuke +nihukene +niimitu +niisamasugune +niisugune +nisuke +nisukene +oma +omaenese +omasugune +omataoline +pool +praegune +sama +samasugune +samataoline +see +seesama +seesamane +seesamune +seesinane +seesugune +selline +sihuke +sihukene +sina +sinusugune +sinutaoline +siuke +siukene +säherdune +säärane +taoline +teiesugune +teine +teistsugune +tema +temake +temakene +temasugune +temataoline +too +toosama +toosamane +üks +üksteise +hakkama +minema +olema +pidama +saama +tegema +tulema +võima diff --git a/test/solr/configsets/_default/conf/lang/stopwords_eu.txt b/test/solr/configsets/_default/conf/lang/stopwords_eu.txt new file mode 100644 index 00000000..25f1db93 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek +haietan +hainbeste 
+hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/test/solr/configsets/_default/conf/lang/stopwords_fa.txt b/test/solr/configsets/_default/conf/lang/stopwords_fa.txt new file mode 100644 index 00000000..723641c6 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده 
+رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/test/solr/configsets/_default/conf/lang/stopwords_fi.txt b/test/solr/configsets/_default/conf/lang/stopwords_fi.txt new file mode 100644 index 00000000..4372c9a0 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about 
+poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_fr.txt b/test/solr/configsets/_default/conf/lang/stopwords_fr.txt new file mode 100644 index 00000000..749abae6 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes 
+furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ga.txt b/test/solr/configsets/_default/conf/lang/stopwords_ga.txt new file mode 100644 index 00000000..9ff88d74 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/test/solr/configsets/_default/conf/lang/stopwords_gl.txt b/test/solr/configsets/_default/conf/lang/stopwords_gl.txt new file mode 100644 index 00000000..d8760b12 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa 
+comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/test/solr/configsets/_default/conf/lang/stopwords_hi.txt b/test/solr/configsets/_default/conf/lang/stopwords_hi.txt new file mode 100644 index 00000000..86286bb0 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. 
+अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/test/solr/configsets/_default/conf/lang/stopwords_hu.txt b/test/solr/configsets/_default/conf/lang/stopwords_hu.txt new file mode 100644 index 00000000..37526da8 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_hu.txt @@ -0,0 +1,211 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/test/solr/configsets/_default/conf/lang/stopwords_hy.txt b/test/solr/configsets/_default/conf/lang/stopwords_hy.txt new file mode 100644 index 00000000..60c1c50f --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/test/solr/configsets/_default/conf/lang/stopwords_id.txt b/test/solr/configsets/_default/conf/lang/stopwords_id.txt new file mode 100644 index 00000000..4617f83a --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke +kecil 
+kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/test/solr/configsets/_default/conf/lang/stopwords_it.txt b/test/solr/configsets/_default/conf/lang/stopwords_it.txt new file mode 100644 index 00000000..1219cc77 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_it.txt @@ -0,0 +1,303 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This 
file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | 
+quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ja.txt b/test/solr/configsets/_default/conf/lang/stopwords_ja.txt new file mode 100644 index 00000000..d4321be6 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. 
+# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/test/solr/configsets/_default/conf/lang/stopwords_lv.txt b/test/solr/configsets/_default/conf/lang/stopwords_lv.txt new file mode 100644 index 00000000..e21a23c0 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa 
+par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/test/solr/configsets/_default/conf/lang/stopwords_nl.txt b/test/solr/configsets/_default/conf/lang/stopwords_nl.txt new file mode 100644 index 00000000..47a2aeac --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. 
+ +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. 
of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/test/solr/configsets/_default/conf/lang/stopwords_no.txt b/test/solr/configsets/_default/conf/lang/stopwords_no.txt new file mode 100644 index 00000000..a7a2c28b --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. 
Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_pt.txt b/test/solr/configsets/_default/conf/lang/stopwords_pt.txt new file mode 100644 index 00000000..acfeb01a --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ro.txt b/test/solr/configsets/_default/conf/lang/stopwords_ro.txt new file mode 100644 index 00000000..4fdee90a --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/test/solr/configsets/_default/conf/lang/stopwords_ru.txt b/test/solr/configsets/_default/conf/lang/stopwords_ru.txt new file mode 100644 index 00000000..55271400 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_sv.txt b/test/solr/configsets/_default/conf/lang/stopwords_sv.txt new file mode 100644 index 00000000..096f87f6 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_sv.txt @@ -0,0 +1,133 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our 
+blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/test/solr/configsets/_default/conf/lang/stopwords_th.txt b/test/solr/configsets/_default/conf/lang/stopwords_th.txt new file mode 100644 index 00000000..07f0fabe --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/test/solr/configsets/_default/conf/lang/stopwords_tr.txt b/test/solr/configsets/_default/conf/lang/stopwords_tr.txt new file mode 100644 index 00000000..84d9408d --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok 
+biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/test/solr/configsets/_default/conf/lang/userdict_ja.txt b/test/solr/configsets/_default/conf/lang/userdict_ja.txt new file mode 100644 index 00000000..6f0368e4 --- /dev/null +++ b/test/solr/configsets/_default/conf/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. This is by-design +# in order to maximize ease-of-use. 
+# +# Entries are defined using the following CSV format: +# , ... , ... , +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same is undefined. +# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. +# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/test/solr/configsets/_default/conf/managed-schema b/test/solr/configsets/_default/conf/managed-schema new file mode 100644 index 00000000..e99e27e9 --- /dev/null +++ b/test/solr/configsets/_default/conf/managed-schema @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/solr/configsets/_default/conf/protwords.txt b/test/solr/configsets/_default/conf/protwords.txt new file mode 100644 index 00000000..1dfc0abe --- /dev/null +++ b/test/solr/configsets/_default/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. 
+dontstems +zwhacky + diff --git a/test/solr/configsets/_default/conf/solrconfig.xml b/test/solr/configsets/_default/conf/solrconfig.xml new file mode 100644 index 00000000..165544a2 --- /dev/null +++ b/test/solr/configsets/_default/conf/solrconfig.xml @@ -0,0 +1,1295 @@ + + + + + + + + + 8.10.1 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:1024} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + + + + + + + + + + explicit + json + true + + + + + + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + true + false + + + terms + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] 
+ _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + + + + + + diff --git a/test/solr/configsets/_default/conf/stopwords.txt b/test/solr/configsets/_default/conf/stopwords.txt new file mode 100644 index 00000000..ae1e83ee --- /dev/null +++ b/test/solr/configsets/_default/conf/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/test/solr/configsets/_default/conf/synonyms.txt b/test/solr/configsets/_default/conf/synonyms.txt new file mode 100644 index 00000000..eab4ee87 --- /dev/null +++ b/test/solr/configsets/_default/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/test/solr/configsets/property_search/conf/enumsconfig.xml b/test/solr/configsets/property_search/conf/enumsconfig.xml new file mode 100644 index 00000000..72e7b7d3 --- /dev/null +++ b/test/solr/configsets/property_search/conf/enumsconfig.xml @@ -0,0 +1,12 @@ + + + + ONTOLOGY + VALUE_SET_COLLECTION + + + ANNOTATION + DATATYPE + OBJECT + + \ No newline at end of file diff --git a/test/solr/configsets/property_search/conf/lang/contractions_ca.txt b/test/solr/configsets/property_search/conf/lang/contractions_ca.txt new file mode 100644 index 00000000..307a85f9 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/test/solr/configsets/property_search/conf/lang/contractions_fr.txt b/test/solr/configsets/property_search/conf/lang/contractions_fr.txt new file mode 100644 index 
00000000..f1bba51b --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/test/solr/configsets/property_search/conf/lang/contractions_ga.txt b/test/solr/configsets/property_search/conf/lang/contractions_ga.txt new file mode 100644 index 00000000..9ebe7fa3 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/test/solr/configsets/property_search/conf/lang/contractions_it.txt b/test/solr/configsets/property_search/conf/lang/contractions_it.txt new file mode 100644 index 00000000..cac04095 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/test/solr/configsets/property_search/conf/lang/hyphenations_ga.txt b/test/solr/configsets/property_search/conf/lang/hyphenations_ga.txt new file mode 100644 index 00000000..4d2642cc --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git a/test/solr/configsets/property_search/conf/lang/stemdict_nl.txt b/test/solr/configsets/property_search/conf/lang/stemdict_nl.txt new file mode 100644 index 00000000..44107297 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the 
dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/test/solr/configsets/property_search/conf/lang/stoptags_ja.txt b/test/solr/configsets/property_search/conf/lang/stoptags_ja.txt new file mode 100644 index 00000000..71b75084 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. 
アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. 
あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 
化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. 
お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. 
という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. 
+助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ar.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ar.txt new file mode 100644 index 00000000..046829db --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_bg.txt b/test/solr/configsets/property_search/conf/lang/stopwords_bg.txt new file mode 100644 index 00000000..1ae4ba2a --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ca.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ca.txt new file mode 100644 index 00000000..3da65dea --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre 
+érem +eren +éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_cz.txt b/test/solr/configsets/property_search/conf/lang/stopwords_cz.txt new file mode 100644 index 00000000..53c6097d --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve 
+to +jako +za +zpět +ze +do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož +jíž +jelikož +jež +jakož +načež diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_da.txt b/test/solr/configsets/property_search/conf/lang/stopwords_da.txt new file mode 100644 index 00000000..42e6145b --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/yourself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc.
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_de.txt b/test/solr/configsets/property_search/conf/lang/stopwords_de.txt new file mode 100644 index 00000000..86525e7a --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_el.txt b/test/solr/configsets/property_search/conf/lang/stopwords_el.txt new file mode 100644 index 00000000..232681f5 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# 
Lucene Greek Stopwords list +# Note: by default this file is used after GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_en.txt b/test/solr/configsets/property_search/conf/lang/stopwords_en.txt new file mode 100644 index 00000000..2c164c0b --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_es.txt b/test/solr/configsets/property_search/conf/lang/stopwords_es.txt new file mode 100644 index 00000000..487d78c8 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_et.txt b/test/solr/configsets/property_search/conf/lang/stopwords_et.txt new file mode 100644 index 00000000..1b06a134 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_et.txt @@ -0,0 +1,1603 @@ +# Estonian stopwords list +all +alla +allapoole +allpool +alt +altpoolt +eel +eespool +enne +hommikupoole +hoolimata +ilma +kaudu +keset +kesk +kohe +koos +kuhupoole +kuni +kuspool +kustpoolt +kõige +käsikäes +lappi +ligi +läbi +mööda +paitsi +peale +pealepoole +pealpool +pealt +pealtpoolt +piki +pikku +piku +pikuti +põiki +pärast +päri +risti +sealpool +sealtpoolt +seespool +seltsis +siiapoole +siinpool +siitpoolt +sinnapoole +sissepoole +taga +tagantpoolt +tagapidi +tagapool +taha +tahapoole +teispool +teispoole +tänu +tükkis +vaatamata +vastu +väljapoole +väljaspool +väljastpoolt +õhtupoole +ühes +ühestükis +ühestükkis +ülalpool +ülaltpoolt +üle +ülespoole +ülevalpool +ülevaltpoolt +ümber +ümbert +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks 
+jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +lool +läbi +lähedal +lähedale +lähedalt +man +mant +manu +meelest +mööda +nahas +nahka +nahkas +najal +najale +najalt +nõjal +nõjale +otsa +otsas +otsast +paigale +paigu +paiku +peal +peale +pealt +perra +perrä +pidi +pihta +piki +pikku +pool +poole +poolest +poolt +puhul +puksiiris +pähe +päralt +päras +pärast +päri +ringi +ringis +risust +saadetusel +saadik +saatel +saati +seas +seast +sees +seest +sekka +seljataga +seltsi +seltsis +seltsist +sisse +slepis +suhtes +šlepis +taga +tagant +tagantotsast +tagaotsas +tagaselja +tagasi +tagast +tagutsi +taha +tahaotsa +takka +tarvis +tasa +tuuri +tuuris +tõttu +tükkis +uhal +vaatamata +vahel +vahele 
+vahelt +vahepeal +vahepeale +vahepealt +vahetsi +varal +varale +varul +vastas +vastast +vastu +veerde +veeres +viisi +võidu +võrd +võrdki +võrra +võrragi +väel +väele +vältel +väärt +väärtki +äärde +ääre +ääres +äärest +ühes +üle +ümber +ümbert +a +abil +aina +ainult +alalt +alates +alati +alles +b +c +d +e +eales +ealeski +edasi +edaspidi +eelkõige +eemal +ei +eks +end +enda +enese +ennem +esialgu +f +g +h +hoopis +i +iganes +igatahes +igati +iial +iialgi +ikka +ikkagi +ilmaski +iseenda +iseenese +iseenesest +isegi +j +jah +ju +juba +juhul +just +järelikult +k +ka +kah +kas +kasvõi +keda +kestahes +kogu +koguni +kohati +kokku +kuhu +kuhugi +kuidagi +kuidas +kunagi +kus +kusagil +kusjuures +kuskil +kust +kõigepealt +küll +l +liiga +lisaks +m +miks +mil +millal +millalgi +mispärast +mistahes +mistõttu +mitte +muide +muidu +muidugi +muist +mujal +mujale +mujalt +mõlemad +mõnda +mõne +mõnikord +n +nii +niikaua +niimoodi +niipaljuke +niisama +niisiis +niivõrd +nõnda +nüüd +o +omaette +omakorda +omavahel +ometi +p +palju +paljuke +palju-palju +peaaegu +peagi +peamiselt +pigem +pisut +praegu +päris +r +rohkem +s +samas +samuti +seal +sealt +sedakorda +sedapuhku +seega +seejuures +seejärel +seekord +seepärast +seetõttu +sellepärast +seni +sestap +siia +siiani +siin +siinkohal +siis +siiski +siit +sinna +suht +š +z +ž +t +teel +teineteise +tõesti +täiesti +u +umbes +v +w +veel +veelgi +vist +võibolla +võib-olla +väga +vähemalt +välja +väljas +väljast +õ +ä +ära +ö +ü +ühtlasi +üksi +ükskõik +ülal +ülale +ülalt +üles +ülesse +üleval +ülevalt +ülimalt +üsna +x +y +aga +ega +ehk +ehkki +elik +ellik +enge +ennegu +ent +et +ja +justkui +kui +kuid +kuigi +kuivõrd +kuna +kuni +kut +mistab +muudkui +nagu +nigu +ning +olgugi +otsekui +otsenagu +selmet +sest +sestab +vaid +või +aa +adaa +adjöö +ae +ah +ahaa +ahah +ah-ah-ah +ah-haa +ahoi +ai +aidaa +aidu-raidu +aih +aijeh +aituma +aitäh +aitüma +ammuu +amps +ampsti +aptsih +ass +at +ata +at-at-at +atsih +atsihh +auh +bai-bai +bingo 
+braavo +brr +ee +eeh +eh +ehee +eheh +eh-eh-hee +eh-eh-ee +ehei +ehh +ehhee +einoh +ena +ennäe +ennäh +fuh +fui +fuih +haa +hah +hahaa +hah-hah-hah +halleluuja +hallo +halloo +hass +hee +heh +he-he-hee +hei +heldeke(ne) +heureka +hihii +hip-hip-hurraa +hmh +hmjah +hoh-hoh-hoo +hohoo +hoi +hollallaa +hoo +hoplaa +hopp +hops +hopsassaa +hopsti +hosianna +huh +huidii +huist +hurjah +hurjeh +hurjoh +hurjuh +hurraa +huu +hõhõh +hõi +hõissa +hõissassa +hõk +hõkk +häh +hä-hä-hää +hüvasti +ih-ah-haa +ih-ih-hii +ii-ha-ha +issake +issakene +isver +jaa-ah +ja-ah +jaah +janäe +jeeh +jeerum +jeever +jessas +jestas +juhhei +jumalaga +jumalime +jumaluke +jumalukene +jutas +kaaps +kaapsti +kaasike +kae +kalps +kalpsti +kannäe +kanäe +kappadi +kaps +kapsti +karkõmm +karkäuh +karkääks +karkääksti +karmauh +karmauhti +karnaps +karnapsti +karniuhti +karpartsaki +karpauh +karpauhti +karplauh +karplauhti +karprauh +karprauhti +karsumdi +karsumm +kartsumdi +kartsumm +karviuh +karviuhti +kaske +kassa +kauh +kauhti +keh +keksti +kepsti +khe +khm +kih +kiiks +kiiksti +kiis +kiiss +kikerii +kikerikii +kili +kilk +kilk-kõlk +kilks +kilks-kolks +kilks-kõlks +kill +killadi +killadi|-kolladi +killadi-kõlladi +killa-kolla +killa-kõlla +kill-kõll +kimps-komps +kipp +kips-kõps +kiriküüt +kirra-kõrra +kirr-kõrr +kirts +klaps +klapsti +klirdi +klirr +klonks +klops +klopsti +kluk +klu-kluu +klõks +klõksti +klõmdi +klõmm +klõmpsti +klõnks +klõnksti +klõps +klõpsti +kläu +kohva-kohva +kok +koks +koksti +kolaki +kolk +kolks +kolksti +koll +kolladi +komp +komps +kompsti +kop +kopp +koppadi +kops +kopsti +kossu +kotsu +kraa +kraak +kraaks +kraaps +kraapsti +krahh +kraks +kraksti +kraps +krapsti +krauh +krauhti +kriiks +kriiksti +kriips +kriips-kraaps +kripa-krõpa +krips-kraps +kriuh +kriuks +kriuksti +kromps +kronk +kronks +krooks +kruu +krõks +krõksti +krõpa +krõps +krõpsti +krõuh +kräu +kräuh +kräuhti +kräuks +kss +kukeleegu +kukku +kuku +kulu +kurluu +kurnäu +kuss +kussu +kõks +kõksti +kõldi +kõlks 
+kõlksti +kõll +kõmaki +kõmdi +kõmm +kõmps +kõpp +kõps +kõpsadi +kõpsat +kõpsti +kõrr +kõrra-kõrra +kõss +kõtt +kõõksti +kärr +kärts +kärtsti +käuks +käuksti +kääga +kääks +kääksti +köh +köki-möki +köksti +laks +laksti +lampsti +larts +lartsti +lats +latsti +leelo +legoo +lehva +liiri-lõõri +lika-lõka +likat-lõkat +limpsti +lips +lipsti +lirts +lirtsaki +lirtsti +lonksti +lops +lopsti +lorts +lortsti +luks +lups +lupsti +lurts +lurtsti +lõks +lõksti +lõmps +lõmpsti +lõnks +lõnksti +lärts +lärtsti +läts +lätsti +lörts +lörtsti +lötsti +lööps +lööpsti +marss +mats +matsti +mauh +mauhti +mh +mhh +mhmh +miau +mjaa +mkm +m-mh +mnjaa +mnjah +moens +mulks +mulksti +mull-mull +mull-mull-mull +muu +muuh +mõh +mõmm +mäh +mäts +mäu +mää +möh +möh-öh-ää +möö +müh-müh +mühüh +müks +müksti +müraki +mürr +mürts +mürtsaki +mürtsti +mütaku +müta-mäta +müta-müta +müt-müt +müt-müt-müt +müts +mütsti +mütt +naa +naah +nah +naks +naksti +nanuu +naps +napsti +nilpsti +nipsti +nirr +niuh +niuh-näuh +niuhti +noh +noksti +nolpsti +nonoh +nonoo +nonäh +noo +nooh +nooks +norr +nurr +nuuts +nõh +nõhh +nõka-nõka +nõks +nõksat-nõksat +nõks-nõks +nõksti +nõõ +nõõh +näeh +näh +nälpsti +nämm-nämm +näpsti +näts +nätsti +näu +näuh +näuhti +näuks +näuksti +nääh +nääks +nühkat-nühkat +oeh +oh +ohh +ohhh +oh-hoi +oh-hoo +ohoh +oh-oh-oo +oh-oh-hoo +ohoi +ohoo +oi +oih +oijee +oijeh +oo +ooh +oo-oh +oo-ohh +oot +ossa +ot +paa +pah +pahh +pakaa +pamm +pantsti +pardon +pardonks +parlartsti +parts +partsti +partsumdi +partsumm +pastoi +pats +patst +patsti +pau +pauh +pauhti +pele +pfui +phuh +phuuh +phäh +phähh +piiks +piip +piiri-pääri +pimm +pimm-pamm +pimm-pomm +pimm-põmm +piraki +piuks +piu-pau +plaks +plaksti +plarts +plartsti +plats +platsti +plauh +plauhh +plauhti +pliks +pliks-plaks +plinn +pliraki +plirts +plirtsti +pliu +pliuh +ploks +plotsti +plumps +plumpsti +plõks +plõksti +plõmdi +plõmm +plõnn +plärr +plärts +plärtsat +plärtsti +pläu +pläuh +plää +plörtsat +pomm +popp +pops +popsti +ports +pot 
+pots +potsti +pott +praks +praksti +prants +prantsaki +prantsti +prassai +prauh +prauhh +prauhti +priks +priuh +priuhh +priuh-prauh +proosit +proost +prr +prrr +prõks +prõksti +prõmdi +prõmm +prõntsti +prääk +prääks +pst +psst +ptrr +ptruu +ptüi +puh +puhh +puksti +pumm +pumps +pup-pup-pup +purts +puuh +põks +põksti +põmdi +põmm +põmmadi +põnks +põnn +põnnadi +põnt +põnts +põntsti +põraki +põrr +põrra-põrra +päh +pähh +päntsti +pää +pöörd +püh +raks +raksti +raps +rapsti +ratataa +rauh +riips +riipsti +riks +riks-raks +rips-raps +rivitult +robaki +rops +ropsaki +ropsti +ruik +räntsti +räts +röh +röhh +sah +sahh +sahkat +saps +sapsti +sauh +sauhti +servus +sihkadi-sahkadi +sihka-sahka +sihkat-sahkat +silks +silk-solk +sips +sipsti +sirr +sirr-sorr +sirts +sirtsti +siu +siuh +siuh-sauh +siuh-säuh +siuhti +siuks +siuts +skool +so +soh +solks +solksti +solpsti +soo +sooh +so-oh +soo-oh +sopp +sops +sopsti +sorr +sorts +sortsti +so-soo +soss +soss-soss +ss +sss +sst +stopp +suhkat-sahkat +sulk +sulks +sulksti +sull +sulla-sulla +sulpa-sulpa +sulps +sulpsti +sumaki +sumdi +summ +summat-summat +sups +supsaku +supsti +surts +surtsti +suss +susti +suts +sutsti +säh +sähke +särts +särtsti +säu +säuh +säuhti +taevake +taevakene +takk +tere +terekest +tibi-tibi +tikk-takk +tiks +tilk +tilks +till +tilla-talla +till-tall +tilulii +tinn +tip +tip-tap +tirr +tirtsti +tiu +tjaa +tjah +tohhoh +tohhoo +tohoh +tohoo +tok +tokk +toks +toksti +tonks +tonksti +tota +totsti +tot-tot +tprr +tpruu +trah +trahh +trallallaa +trill +trillallaa +trr +trrr +tsah +tsahh +tsilk +tsilk-tsolk +tsirr +tsiuh +tskae +tsolk +tss +tst +tsst +tsuhh +tsuk +tsumm +tsurr +tsäuh +tšao +tšš +tššš +tuk +tuks +turts +turtsti +tutki +tutkit +tutu-lutu +tutulutu +tuut +tuutu-luutu +tõks +tötsti +tümps +uh +uhh +uh-huu +uhtsa +uhtsaa +uhuh +uhuu +ui +uih +uih-aih +uijah +uijeh +uist +uit +uka +upsti +uraa +urjah +urjeh +urjoh +urjuh +urr +urraa +ust +utu +uu +uuh +vaak +vaat +vae +vaeh +vai +vat +vau +vhüüt 
+vidiit +viiks +vilks +vilksti +vinki-vinki +virdi +virr +viu +viudi +viuh +viuhti +voeh +voh +vohh +volks +volksti +vooh +vops +vopsti +vot +vuh +vuhti +vuih +vulks +vulksti +vull +vulpsti +vups +vupsaki +vupsaku +vupsti +vurdi +vurr +vurra-vurra +vurts +vurtsti +vutt +võe +võeh +või +võih +võrr +võts +võtt +vääks +õe +õits +õk +õkk +õrr +õss +õuh +äh +ähh +ähhähhää +äh-hää +äh-äh-hää +äiu +äiu-ää +äss +ää +ääh +äähh +öh +öhh +ök +üh +eelmine +eikeegi +eimiski +emb-kumb +enam +enim +iga +igasugune +igaüks +ise +isesugune +järgmine +keegi +kes +kumb +kumbki +kõik +meiesugune +meietaoline +midagi +mihuke +mihukene +milletaoline +milline +mina +minake +mingi +mingisugune +minusugune +minutaoline +mis +miski +miskisugune +missugune +misuke +mitmes +mitmesugune +mitu +mitu-mitu +mitu-setu +muu +mõlema +mõnesugune +mõni +mõningane +mõningas +mäherdune +määrane +naasugune +need +nemad +nendesugune +nendetaoline +nihuke +nihukene +niimitu +niisamasugune +niisugune +nisuke +nisukene +oma +omaenese +omasugune +omataoline +pool +praegune +sama +samasugune +samataoline +see +seesama +seesamane +seesamune +seesinane +seesugune +selline +sihuke +sihukene +sina +sinusugune +sinutaoline +siuke +siukene +säherdune +säärane +taoline +teiesugune +teine +teistsugune +tema +temake +temakene +temasugune +temataoline +too +toosama +toosamane +üks +üksteise +hakkama +minema +olema +pidama +saama +tegema +tulema +võima diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_eu.txt b/test/solr/configsets/property_search/conf/lang/stopwords_eu.txt new file mode 100644 index 00000000..25f1db93 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi 
+guzti +haiei +haiek +haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_fa.txt b/test/solr/configsets/property_search/conf/lang/stopwords_fa.txt new file mode 100644 index 00000000..723641c6 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم 
+ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_fi.txt b/test/solr/configsets/property_search/conf/lang/stopwords_fi.txt new file mode 100644 index 00000000..4372c9a0 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about 
+poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_fr.txt b/test/solr/configsets/property_search/conf/lang/stopwords_fr.txt new file mode 100644 index 00000000..749abae6 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus 
+fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ga.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ga.txt new file mode 100644 index 00000000..9ff88d74 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_gl.txt b/test/solr/configsets/property_search/conf/lang/stopwords_gl.txt new file mode 100644 index 00000000..d8760b12 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles 
+aquilo +aquí +ao +aos +as +así +á +ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_hi.txt b/test/solr/configsets/property_search/conf/lang/stopwords_hi.txt new file mode 100644 index 00000000..86286bb0 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. 
+अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_hu.txt b/test/solr/configsets/property_search/conf/lang/stopwords_hu.txt new file mode 100644 index 00000000..37526da8 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_hu.txt @@ -0,0 +1,211 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_hy.txt b/test/solr/configsets/property_search/conf/lang/stopwords_hy.txt new file mode 100644 index 00000000..60c1c50f --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_id.txt b/test/solr/configsets/property_search/conf/lang/stopwords_id.txt new file mode 100644 index 00000000..4617f83a --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena 
+karenanya +ke +kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_it.txt b/test/solr/configsets/property_search/conf/lang/stopwords_it.txt new file mode 100644 index 00000000..1219cc77 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_it.txt @@ -0,0 +1,303 @@ + | From 
svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that 
+quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ja.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ja.txt new file mode 100644 index 00000000..d4321be6 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ja.txt 
@@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. 
+# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_lv.txt b/test/solr/configsets/property_search/conf/lang/stopwords_lv.txt new file mode 100644 index 00000000..e21a23c0 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju 
+varējām +varēšu +varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_nl.txt b/test/solr/configsets/property_search/conf/lang/stopwords_nl.txt new file mode 100644 index 00000000..47a2aeac --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. 
of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. 
of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_no.txt b/test/solr/configsets/property_search/conf/lang/stopwords_no.txt new file mode 100644 index 00000000..a7a2c28b --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) 
+en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_pt.txt b/test/solr/configsets/property_search/conf/lang/stopwords_pt.txt new file mode 100644 index 00000000..acfeb01a --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ro.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ro.txt new file mode 100644 index 00000000..4fdee90a --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_ru.txt b/test/solr/configsets/property_search/conf/lang/stopwords_ru.txt new file mode 100644 index 00000000..55271400 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_sv.txt b/test/solr/configsets/property_search/conf/lang/stopwords_sv.txt new file mode 100644 index 00000000..096f87f6 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_sv.txt @@ -0,0 +1,133 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our 
+blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_th.txt b/test/solr/configsets/property_search/conf/lang/stopwords_th.txt new file mode 100644 index 00000000..07f0fabe --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/test/solr/configsets/property_search/conf/lang/stopwords_tr.txt b/test/solr/configsets/property_search/conf/lang/stopwords_tr.txt new file mode 100644 index 00000000..84d9408d --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni 
+benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/test/solr/configsets/property_search/conf/lang/userdict_ja.txt b/test/solr/configsets/property_search/conf/lang/userdict_ja.txt new file mode 100644 index 00000000..6f0368e4 --- /dev/null +++ b/test/solr/configsets/property_search/conf/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. 
This is by-design +# in order to maximize ease-of-use. +# +# Entries are defined using the following CSV format: +# , ... , ... , +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same is undefined. +# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. +# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/test/solr/configsets/property_search/conf/managed-schema b/test/solr/configsets/property_search/conf/managed-schema new file mode 100644 index 00000000..e99e27e9 --- /dev/null +++ b/test/solr/configsets/property_search/conf/managed-schema @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/solr/configsets/property_search/conf/mapping-ISOLatin1Accent.txt b/test/solr/configsets/property_search/conf/mapping-ISOLatin1Accent.txt new file mode 100644 index 00000000..ede77425 --- /dev/null +++ b/test/solr/configsets/property_search/conf/mapping-ISOLatin1Accent.txt @@ -0,0 +1,246 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) 
+ +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# À => A +"\u00C0" => "A" + +# Á => A +"\u00C1" => "A" + +#  => A +"\u00C2" => "A" + +# à => A +"\u00C3" => "A" + +# Ä => A +"\u00C4" => "A" + +# Å => A +"\u00C5" => "A" + +# Æ => AE +"\u00C6" => "AE" + +# Ç => C +"\u00C7" => "C" + +# È => E +"\u00C8" => "E" + +# É => E +"\u00C9" => "E" + +# Ê => E +"\u00CA" => "E" + +# Ë => E +"\u00CB" => "E" + +# Ì => I +"\u00CC" => "I" + +# Í => I +"\u00CD" => "I" + +# Î => I +"\u00CE" => "I" + +# Ï => I +"\u00CF" => "I" + +# IJ => IJ +"\u0132" => "IJ" + +# Ð => D +"\u00D0" => "D" + +# Ñ => N +"\u00D1" => "N" + +# Ò => O +"\u00D2" => "O" + +# Ó => O +"\u00D3" => "O" + +# Ô => O +"\u00D4" => "O" + +# Õ => O +"\u00D5" => "O" + +# Ö => O +"\u00D6" => "O" + +# Ø => O +"\u00D8" => "O" + +# Œ => OE +"\u0152" => "OE" + +# Þ +"\u00DE" => "TH" + +# Ù => U +"\u00D9" => "U" + +# Ú => U +"\u00DA" => "U" + +# Û => U +"\u00DB" => "U" + +# Ü => U +"\u00DC" => "U" + +# Ý => Y +"\u00DD" => "Y" + +# Ÿ => Y +"\u0178" => "Y" + +# à => a +"\u00E0" => "a" + +# á => a +"\u00E1" => "a" + +# â => a +"\u00E2" => "a" + +# ã => a +"\u00E3" => "a" + +# ä => a +"\u00E4" => "a" + +# å => a +"\u00E5" => "a" + +# æ => ae +"\u00E6" => "ae" + +# ç => c +"\u00E7" => "c" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e" + +# ê => e +"\u00EA" => "e" + +# ë => e +"\u00EB" => "e" + +# ì => i +"\u00EC" => "i" + +# í => i +"\u00ED" => "i" + +# î => i +"\u00EE" => "i" + +# ï => i +"\u00EF" => "i" + +# ij => ij +"\u0133" => "ij" + +# ð => d +"\u00F0" => "d" + +# ñ => n +"\u00F1" => "n" + +# ò => o +"\u00F2" => "o" + +# ó => o +"\u00F3" => "o" + +# ô => o +"\u00F4" => "o" + +# õ => o +"\u00F5" => "o" + +# ö => o +"\u00F6" => "o" + +# ø => o +"\u00F8" => "o" + +# œ => oe +"\u0153" => "oe" + +# ß => ss +"\u00DF" => "ss" + +# þ => th +"\u00FE" => "th" + +# ù => u +"\u00F9" => "u" + +# ú => u +"\u00FA" => "u" + +# û => u +"\u00FB" => "u" + +# ü => u 
+"\u00FC" => "u" + +# ý => y +"\u00FD" => "y" + +# ÿ => y +"\u00FF" => "y" + +# ff => ff +"\uFB00" => "ff" + +# fi => fi +"\uFB01" => "fi" + +# fl => fl +"\uFB02" => "fl" + +# ffi => ffi +"\uFB03" => "ffi" + +# ffl => ffl +"\uFB04" => "ffl" + +# ſt => ft +"\uFB05" => "ft" + +# st => st +"\uFB06" => "st" diff --git a/test/solr/configsets/property_search/conf/protwords.txt b/test/solr/configsets/property_search/conf/protwords.txt new file mode 100644 index 00000000..1dfc0abe --- /dev/null +++ b/test/solr/configsets/property_search/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. 
+dontstems +zwhacky + diff --git a/test/solr/configsets/property_search/conf/schema.xml b/test/solr/configsets/property_search/conf/schema.xml new file mode 100644 index 00000000..20824ea6 --- /dev/null +++ b/test/solr/configsets/property_search/conf/schema.xml @@ -0,0 +1,1179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/test/solr/configsets/property_search/conf/solrconfig.xml b/test/solr/configsets/property_search/conf/solrconfig.xml new file mode 100644 index 00000000..771a0f32 --- /dev/null +++ b/test/solr/configsets/property_search/conf/solrconfig.xml @@ -0,0 +1,1299 @@ + + + + + + + + + 8.8.2 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:500000} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + + + + + + + + + + explicit + json + true + + + + + + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + true + false + + + terms + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] 
+ _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + + + + + + diff --git a/test/solr/configsets/property_search/conf/stopwords.txt b/test/solr/configsets/property_search/conf/stopwords.txt new file mode 100644 index 00000000..ae1e83ee --- /dev/null +++ b/test/solr/configsets/property_search/conf/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/test/solr/configsets/property_search/conf/synonyms.txt b/test/solr/configsets/property_search/conf/synonyms.txt new file mode 100644 index 00000000..eab4ee87 --- /dev/null +++ b/test/solr/configsets/property_search/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. 
+ +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/test/solr/configsets/term_search/conf/enumsconfig.xml b/test/solr/configsets/term_search/conf/enumsconfig.xml new file mode 100644 index 00000000..72e7b7d3 --- /dev/null +++ b/test/solr/configsets/term_search/conf/enumsconfig.xml @@ -0,0 +1,12 @@ + + + + ONTOLOGY + VALUE_SET_COLLECTION + + + ANNOTATION + DATATYPE + OBJECT + + \ No newline at end of file diff --git a/test/solr/configsets/term_search/conf/lang/contractions_ca.txt b/test/solr/configsets/term_search/conf/lang/contractions_ca.txt new file mode 100644 index 00000000..307a85f9 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/contractions_ca.txt @@ -0,0 +1,8 @@ +# Set of Catalan contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +l +m +n +s +t diff --git a/test/solr/configsets/term_search/conf/lang/contractions_fr.txt b/test/solr/configsets/term_search/conf/lang/contractions_fr.txt new file mode 100644 index 00000000..f1bba51b --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/test/solr/configsets/term_search/conf/lang/contractions_ga.txt b/test/solr/configsets/term_search/conf/lang/contractions_ga.txt new file mode 100644 index 00000000..9ebe7fa3 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/contractions_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +d +m +b diff --git a/test/solr/configsets/term_search/conf/lang/contractions_it.txt b/test/solr/configsets/term_search/conf/lang/contractions_it.txt new file mode 100644 index 00000000..cac04095 --- /dev/null +++ 
b/test/solr/configsets/term_search/conf/lang/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/test/solr/configsets/term_search/conf/lang/hyphenations_ga.txt b/test/solr/configsets/term_search/conf/lang/hyphenations_ga.txt new file mode 100644 index 00000000..4d2642cc --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/hyphenations_ga.txt @@ -0,0 +1,5 @@ +# Set of Irish hyphenations for StopFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +h +n +t diff --git a/test/solr/configsets/term_search/conf/lang/stemdict_nl.txt b/test/solr/configsets/term_search/conf/lang/stemdict_nl.txt new file mode 100644 index 00000000..44107297 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the Dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/test/solr/configsets/term_search/conf/lang/stoptags_ja.txt b/test/solr/configsets/term_search/conf/lang/stoptags_ja.txt new file mode 100644 index 00000000..71b75084 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file is removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building your own stoptag set. +# +# The entire possible tagset is provided below for convenience.
+# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 
健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attaches to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below.
In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. 
ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. 
が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. 
かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. +助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. 
+非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ar.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ar.txt new file mode 100644 index 00000000..046829db --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_bg.txt b/test/solr/configsets/term_search/conf/lang/stopwords_bg.txt new file mode 100644 index 00000000..1ae4ba2a --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_bg.txt @@ -0,0 +1,193 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +а +аз +ако +ала +бе +без +беше +би +бил +била +били +било +близо +бъдат +бъде +бяха +в +вас +ваш +ваша +вероятно +вече +взема +ви +вие +винаги +все +всеки +всички +всичко +всяка +във +въпреки +върху +г +ги +главно +го +д +да +дали +до +докато +докога +дори +досега +доста +е +едва +един +ето +за +зад +заедно +заради +засега +затова +защо +защото +и +из +или +им +има +имат +иска +й +каза +как +каква +какво +както +какъв +като +кога +когато +което +които +кой +който +колко +която +къде +където +към +ли +м +ме +между +мен +ми +мнозина +мога +могат +може +моля +момента +му +н +на +над +назад +най +направи +напред +например +нас +не +него +нея +ни +ние +никой +нито +но +някои +някой +няма +обаче +около +освен +особено +от +отгоре +отново +още +пак +по +повече +повечето +под +поне +поради +после +почти +прави +пред +преди +през +при +пък +първо +с +са +само +се +сега +си +скоро +след +сме +според +сред +срещу +сте +съм +със +също +т +тази +така +такива +такъв +там +твой +те +тези +ти +тн +то +това +тогава +този +той +толкова +точно +трябва +тук +тъй +тя +тях +у +харесва +ч +че +често +чрез +ще +щом +я diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ca.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ca.txt new file mode 100644 index 00000000..3da65dea --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ca.txt @@ -0,0 +1,220 @@ +# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed) +a +abans +ací +ah +així +això +al +als +aleshores +algun +alguna +algunes +alguns +alhora +allà +allí +allò +altra +altre +altres +amb +ambdós +ambdues +apa +aquell +aquella +aquelles +aquells +aquest +aquesta +aquestes +aquests +aquí +baix +cada +cadascú +cadascuna +cadascunes +cadascuns +com +contra +d'un +d'una +d'unes +d'uns +dalt +de +del +dels +des +després +dins +dintre +donat +doncs +durant +e +eh +el +els +em +en +encara +ens +entre +érem +eren 
+éreu +es +és +esta +està +estàvem +estaven +estàveu +esteu +et +etc +ets +fins +fora +gairebé +ha +han +has +havia +he +hem +heu +hi +ho +i +igual +iguals +ja +l'hi +la +les +li +li'n +llavors +m'he +ma +mal +malgrat +mateix +mateixa +mateixes +mateixos +me +mentre +més +meu +meus +meva +meves +molt +molta +moltes +molts +mon +mons +n'he +n'hi +ne +ni +no +nogensmenys +només +nosaltres +nostra +nostre +nostres +o +oh +oi +on +pas +pel +pels +per +però +perquè +poc +poca +pocs +poques +potser +propi +qual +quals +quan +quant +que +què +quelcom +qui +quin +quina +quines +quins +s'ha +s'han +sa +semblant +semblants +ses +seu +seus +seva +seva +seves +si +sobre +sobretot +sóc +solament +sols +son +són +sons +sota +sou +t'ha +t'han +t'he +ta +tal +també +tampoc +tan +tant +tanta +tantes +teu +teus +teva +teves +ton +tons +tot +tota +totes +tots +un +una +unes +uns +us +va +vaig +vam +van +vas +veu +vosaltres +vostra +vostre +vostres diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_cz.txt b/test/solr/configsets/term_search/conf/lang/stopwords_cz.txt new file mode 100644 index 00000000..53c6097d --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_cz.txt @@ -0,0 +1,172 @@ +a +s +k +o +i +u +v +z +dnes +cz +tímto +budeš +budem +byli +jseš +můj +svým +ta +tomto +tohle +tuto +tyto +jej +zda +proč +máte +tato +kam +tohoto +kdo +kteří +mi +nám +tom +tomuto +mít +nic +proto +kterou +byla +toho +protože +asi +ho +naši +napište +re +což +tím +takže +svých +její +svými +jste +aj +tu +tedy +teto +bylo +kde +ke +pravé +ji +nad +nejsou +či +pod +téma +mezi +přes +ty +pak +vám +ani +když +však +neg +jsem +tento +článku +články +aby +jsme +před +pta +jejich +byl +ještě +až +bez +také +pouze +první +vaše +která +nás +nový +tipy +pokud +může +strana +jeho +své +jiné +zprávy +nové +není +vás +jen +podle +zde +už +být +více +bude +již +než +který +by +které +co +nebo +ten +tak +má +při +od +po +jsou +jak +další +ale +si +se +ve +to +jako +za +zpět +ze 
+do +pro +je +na +atd +atp +jakmile +přičemž +já +on +ona +ono +oni +ony +my +vy +jí +ji +mě +mne +jemu +tomu +těm +těmu +němu +němuž +jehož +jíž +jelikož +jež +jakož +načež diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_da.txt b/test/solr/configsets/term_search/conf/lang/stopwords_da.txt new file mode 100644 index 00000000..42e6145b --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_da.txt @@ -0,0 +1,110 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_de.txt b/test/solr/configsets/term_search/conf/lang/stopwords_de.txt new file mode 100644 index 00000000..86525e7a --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. 
+ + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. 
of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_el.txt b/test/solr/configsets/term_search/conf/lang/stopwords_el.txt new file mode 100644 index 00000000..232681f5 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_el.txt @@ -0,0 +1,78 @@ +# Lucene Greek 
Stopwords list +# Note: by default this file is used after GreekLowerCaseFilter, +# so when modifying this file use 'σ' instead of 'ς' +ο +η +το +οι +τα +του +τησ +των +τον +την +και +κι +κ +ειμαι +εισαι +ειναι +ειμαστε +ειστε +στο +στον +στη +στην +μα +αλλα +απο +για +προσ +με +σε +ωσ +παρα +αντι +κατα +μετα +θα +να +δε +δεν +μη +μην +επι +ενω +εαν +αν +τοτε +που +πωσ +ποιοσ +ποια +ποιο +ποιοι +ποιεσ +ποιων +ποιουσ +αυτοσ +αυτη +αυτο +αυτοι +αυτων +αυτουσ +αυτεσ +αυτα +εκεινοσ +εκεινη +εκεινο +εκεινοι +εκεινεσ +εκεινα +εκεινων +εκεινουσ +οπωσ +ομωσ +ισωσ +οσο +οτι diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_en.txt b/test/solr/configsets/term_search/conf/lang/stopwords_en.txt new file mode 100644 index 00000000..2c164c0b --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_es.txt b/test/solr/configsets/term_search/conf/lang/stopwords_es.txt new file mode 100644 index 00000000..487d78c8 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? 
+unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería 
+serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_et.txt b/test/solr/configsets/term_search/conf/lang/stopwords_et.txt new file mode 100644 index 00000000..1b06a134 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_et.txt @@ -0,0 +1,1603 @@ +# Estonian stopwords list +all +alla +allapoole +allpool +alt +altpoolt +eel +eespool +enne +hommikupoole +hoolimata +ilma +kaudu +keset +kesk +kohe +koos +kuhupoole +kuni +kuspool +kustpoolt +kõige +käsikäes +lappi +ligi +läbi +mööda +paitsi +peale +pealepoole +pealpool +pealt +pealtpoolt +piki +pikku +piku +pikuti +põiki +pärast +päri +risti +sealpool +sealtpoolt +seespool +seltsis +siiapoole +siinpool +siitpoolt +sinnapoole +sissepoole +taga +tagantpoolt +tagapidi +tagapool +taha +tahapoole +teispool +teispoole +tänu +tükkis +vaatamata +vastu +väljapoole +väljaspool +väljastpoolt +õhtupoole +ühes +ühestükis +ühestükkis +ülalpool +ülaltpoolt +üle +ülespoole +ülevalpool +ülevaltpoolt +ümber +ümbert +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul 
+juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +aegu +aegus +alguks +algul +algule +algult +alguni +all +alla +alt +alul +alutsi +arvel +asemel +asemele +eel +eeli +ees +eesotsas +eest +eestotsast +esitsi +ette +etteotsa +haaval +heaks +hoolimata +hulgas +hulgast +hulka +jalgu +jalus +jalust +jaoks +jooksul +juurde +juures +juurest +jälil +jälile +järel +järele +järelt +järgi +kaasas +kallal +kallale +kallalt +kamul +kannul +kannule +kannult +kaudu +kaupa +keskel +keskele +keskelt +keskis +keskpaiku +kestel +kestes +kilda +killas +killast +kimpu +kimpus +kiuste +kohal +kohale +kohalt +kohaselt +kohe +kohta +koos +korral +kukil +kukile +kukilt +kulul +kõrva +kõrval +kõrvale +kõrvalt +kõrvas +kõrvast +käekõrval +käekõrvale +käekõrvalt +käes +käest +kätte +külge +küljes +küljest +küüsi +küüsis +küüsist +ligi +ligidal +ligidale +ligidalt +lool +läbi +lähedal +lähedale +lähedalt +man +mant +manu +meelest +mööda +nahas +nahka +nahkas +najal +najale +najalt +nõjal +nõjale +otsa +otsas +otsast +paigale +paigu +paiku +peal +peale +pealt +perra +perrä +pidi +pihta +piki +pikku +pool +poole +poolest +poolt +puhul +puksiiris +pähe +päralt +päras +pärast +päri +ringi +ringis +risust +saadetusel +saadik +saatel +saati +seas +seast +sees +seest +sekka +seljataga +seltsi +seltsis +seltsist +sisse +slepis +suhtes +šlepis +taga +tagant +tagantotsast +tagaotsas +tagaselja +tagasi +tagast +tagutsi +taha +tahaotsa +takka +tarvis +tasa +tuuri +tuuris +tõttu +tükkis +uhal +vaatamata +vahel +vahele +vahelt 
+vahepeal +vahepeale +vahepealt +vahetsi +varal +varale +varul +vastas +vastast +vastu +veerde +veeres +viisi +võidu +võrd +võrdki +võrra +võrragi +väel +väele +vältel +väärt +väärtki +äärde +ääre +ääres +äärest +ühes +üle +ümber +ümbert +a +abil +aina +ainult +alalt +alates +alati +alles +b +c +d +e +eales +ealeski +edasi +edaspidi +eelkõige +eemal +ei +eks +end +enda +enese +ennem +esialgu +f +g +h +hoopis +i +iganes +igatahes +igati +iial +iialgi +ikka +ikkagi +ilmaski +iseenda +iseenese +iseenesest +isegi +j +jah +ju +juba +juhul +just +järelikult +k +ka +kah +kas +kasvõi +keda +kestahes +kogu +koguni +kohati +kokku +kuhu +kuhugi +kuidagi +kuidas +kunagi +kus +kusagil +kusjuures +kuskil +kust +kõigepealt +küll +l +liiga +lisaks +m +miks +mil +millal +millalgi +mispärast +mistahes +mistõttu +mitte +muide +muidu +muidugi +muist +mujal +mujale +mujalt +mõlemad +mõnda +mõne +mõnikord +n +nii +niikaua +niimoodi +niipaljuke +niisama +niisiis +niivõrd +nõnda +nüüd +o +omaette +omakorda +omavahel +ometi +p +palju +paljuke +palju-palju +peaaegu +peagi +peamiselt +pigem +pisut +praegu +päris +r +rohkem +s +samas +samuti +seal +sealt +sedakorda +sedapuhku +seega +seejuures +seejärel +seekord +seepärast +seetõttu +sellepärast +seni +sestap +siia +siiani +siin +siinkohal +siis +siiski +siit +sinna +suht +š +z +ž +t +teel +teineteise +tõesti +täiesti +u +umbes +v +w +veel +veelgi +vist +võibolla +võib-olla +väga +vähemalt +välja +väljas +väljast +õ +ä +ära +ö +ü +ühtlasi +üksi +ükskõik +ülal +ülale +ülalt +üles +ülesse +üleval +ülevalt +ülimalt +üsna +x +y +aga +ega +ehk +ehkki +elik +ellik +enge +ennegu +ent +et +ja +justkui +kui +kuid +kuigi +kuivõrd +kuna +kuni +kut +mistab +muudkui +nagu +nigu +ning +olgugi +otsekui +otsenagu +selmet +sest +sestab +vaid +või +aa +adaa +adjöö +ae +ah +ahaa +ahah +ah-ah-ah +ah-haa +ahoi +ai +aidaa +aidu-raidu +aih +aijeh +aituma +aitäh +aitüma +ammuu +amps +ampsti +aptsih +ass +at +ata +at-at-at +atsih +atsihh +auh +bai-bai +bingo +braavo 
+brr +ee +eeh +eh +ehee +eheh +eh-eh-hee +eh-eh-ee +ehei +ehh +ehhee +einoh +ena +ennäe +ennäh +fuh +fui +fuih +haa +hah +hahaa +hah-hah-hah +halleluuja +hallo +halloo +hass +hee +heh +he-he-hee +hei +heldeke(ne) +heureka +hihii +hip-hip-hurraa +hmh +hmjah +hoh-hoh-hoo +hohoo +hoi +hollallaa +hoo +hoplaa +hopp +hops +hopsassaa +hopsti +hosianna +huh +huidii +huist +hurjah +hurjeh +hurjoh +hurjuh +hurraa +huu +hõhõh +hõi +hõissa +hõissassa +hõk +hõkk +häh +hä-hä-hää +hüvasti +ih-ah-haa +ih-ih-hii +ii-ha-ha +issake +issakene +isver +jaa-ah +ja-ah +jaah +janäe +jeeh +jeerum +jeever +jessas +jestas +juhhei +jumalaga +jumalime +jumaluke +jumalukene +jutas +kaaps +kaapsti +kaasike +kae +kalps +kalpsti +kannäe +kanäe +kappadi +kaps +kapsti +karkõmm +karkäuh +karkääks +karkääksti +karmauh +karmauhti +karnaps +karnapsti +karniuhti +karpartsaki +karpauh +karpauhti +karplauh +karplauhti +karprauh +karprauhti +karsumdi +karsumm +kartsumdi +kartsumm +karviuh +karviuhti +kaske +kassa +kauh +kauhti +keh +keksti +kepsti +khe +khm +kih +kiiks +kiiksti +kiis +kiiss +kikerii +kikerikii +kili +kilk +kilk-kõlk +kilks +kilks-kolks +kilks-kõlks +kill +killadi +killadi|-kolladi +killadi-kõlladi +killa-kolla +killa-kõlla +kill-kõll +kimps-komps +kipp +kips-kõps +kiriküüt +kirra-kõrra +kirr-kõrr +kirts +klaps +klapsti +klirdi +klirr +klonks +klops +klopsti +kluk +klu-kluu +klõks +klõksti +klõmdi +klõmm +klõmpsti +klõnks +klõnksti +klõps +klõpsti +kläu +kohva-kohva +kok +koks +koksti +kolaki +kolk +kolks +kolksti +koll +kolladi +komp +komps +kompsti +kop +kopp +koppadi +kops +kopsti +kossu +kotsu +kraa +kraak +kraaks +kraaps +kraapsti +krahh +kraks +kraksti +kraps +krapsti +krauh +krauhti +kriiks +kriiksti +kriips +kriips-kraaps +kripa-krõpa +krips-kraps +kriuh +kriuks +kriuksti +kromps +kronk +kronks +krooks +kruu +krõks +krõksti +krõpa +krõps +krõpsti +krõuh +kräu +kräuh +kräuhti +kräuks +kss +kukeleegu +kukku +kuku +kulu +kurluu +kurnäu +kuss +kussu +kõks +kõksti +kõldi +kõlks +kõlksti 
+kõll +kõmaki +kõmdi +kõmm +kõmps +kõpp +kõps +kõpsadi +kõpsat +kõpsti +kõrr +kõrra-kõrra +kõss +kõtt +kõõksti +kärr +kärts +kärtsti +käuks +käuksti +kääga +kääks +kääksti +köh +köki-möki +köksti +laks +laksti +lampsti +larts +lartsti +lats +latsti +leelo +legoo +lehva +liiri-lõõri +lika-lõka +likat-lõkat +limpsti +lips +lipsti +lirts +lirtsaki +lirtsti +lonksti +lops +lopsti +lorts +lortsti +luks +lups +lupsti +lurts +lurtsti +lõks +lõksti +lõmps +lõmpsti +lõnks +lõnksti +lärts +lärtsti +läts +lätsti +lörts +lörtsti +lötsti +lööps +lööpsti +marss +mats +matsti +mauh +mauhti +mh +mhh +mhmh +miau +mjaa +mkm +m-mh +mnjaa +mnjah +moens +mulks +mulksti +mull-mull +mull-mull-mull +muu +muuh +mõh +mõmm +mäh +mäts +mäu +mää +möh +möh-öh-ää +möö +müh-müh +mühüh +müks +müksti +müraki +mürr +mürts +mürtsaki +mürtsti +mütaku +müta-mäta +müta-müta +müt-müt +müt-müt-müt +müts +mütsti +mütt +naa +naah +nah +naks +naksti +nanuu +naps +napsti +nilpsti +nipsti +nirr +niuh +niuh-näuh +niuhti +noh +noksti +nolpsti +nonoh +nonoo +nonäh +noo +nooh +nooks +norr +nurr +nuuts +nõh +nõhh +nõka-nõka +nõks +nõksat-nõksat +nõks-nõks +nõksti +nõõ +nõõh +näeh +näh +nälpsti +nämm-nämm +näpsti +näts +nätsti +näu +näuh +näuhti +näuks +näuksti +nääh +nääks +nühkat-nühkat +oeh +oh +ohh +ohhh +oh-hoi +oh-hoo +ohoh +oh-oh-oo +oh-oh-hoo +ohoi +ohoo +oi +oih +oijee +oijeh +oo +ooh +oo-oh +oo-ohh +oot +ossa +ot +paa +pah +pahh +pakaa +pamm +pantsti +pardon +pardonks +parlartsti +parts +partsti +partsumdi +partsumm +pastoi +pats +patst +patsti +pau +pauh +pauhti +pele +pfui +phuh +phuuh +phäh +phähh +piiks +piip +piiri-pääri +pimm +pimm-pamm +pimm-pomm +pimm-põmm +piraki +piuks +piu-pau +plaks +plaksti +plarts +plartsti +plats +platsti +plauh +plauhh +plauhti +pliks +pliks-plaks +plinn +pliraki +plirts +plirtsti +pliu +pliuh +ploks +plotsti +plumps +plumpsti +plõks +plõksti +plõmdi +plõmm +plõnn +plärr +plärts +plärtsat +plärtsti +pläu +pläuh +plää +plörtsat +pomm +popp +pops +popsti +ports +pot +pots 
+potsti +pott +praks +praksti +prants +prantsaki +prantsti +prassai +prauh +prauhh +prauhti +priks +priuh +priuhh +priuh-prauh +proosit +proost +prr +prrr +prõks +prõksti +prõmdi +prõmm +prõntsti +prääk +prääks +pst +psst +ptrr +ptruu +ptüi +puh +puhh +puksti +pumm +pumps +pup-pup-pup +purts +puuh +põks +põksti +põmdi +põmm +põmmadi +põnks +põnn +põnnadi +põnt +põnts +põntsti +põraki +põrr +põrra-põrra +päh +pähh +päntsti +pää +pöörd +püh +raks +raksti +raps +rapsti +ratataa +rauh +riips +riipsti +riks +riks-raks +rips-raps +rivitult +robaki +rops +ropsaki +ropsti +ruik +räntsti +räts +röh +röhh +sah +sahh +sahkat +saps +sapsti +sauh +sauhti +servus +sihkadi-sahkadi +sihka-sahka +sihkat-sahkat +silks +silk-solk +sips +sipsti +sirr +sirr-sorr +sirts +sirtsti +siu +siuh +siuh-sauh +siuh-säuh +siuhti +siuks +siuts +skool +so +soh +solks +solksti +solpsti +soo +sooh +so-oh +soo-oh +sopp +sops +sopsti +sorr +sorts +sortsti +so-soo +soss +soss-soss +ss +sss +sst +stopp +suhkat-sahkat +sulk +sulks +sulksti +sull +sulla-sulla +sulpa-sulpa +sulps +sulpsti +sumaki +sumdi +summ +summat-summat +sups +supsaku +supsti +surts +surtsti +suss +susti +suts +sutsti +säh +sähke +särts +särtsti +säu +säuh +säuhti +taevake +taevakene +takk +tere +terekest +tibi-tibi +tikk-takk +tiks +tilk +tilks +till +tilla-talla +till-tall +tilulii +tinn +tip +tip-tap +tirr +tirtsti +tiu +tjaa +tjah +tohhoh +tohhoo +tohoh +tohoo +tok +tokk +toks +toksti +tonks +tonksti +tota +totsti +tot-tot +tprr +tpruu +trah +trahh +trallallaa +trill +trillallaa +trr +trrr +tsah +tsahh +tsilk +tsilk-tsolk +tsirr +tsiuh +tskae +tsolk +tss +tst +tsst +tsuhh +tsuk +tsumm +tsurr +tsäuh +tšao +tšš +tššš +tuk +tuks +turts +turtsti +tutki +tutkit +tutu-lutu +tutulutu +tuut +tuutu-luutu +tõks +tötsti +tümps +uh +uhh +uh-huu +uhtsa +uhtsaa +uhuh +uhuu +ui +uih +uih-aih +uijah +uijeh +uist +uit +uka +upsti +uraa +urjah +urjeh +urjoh +urjuh +urr +urraa +ust +utu +uu +uuh +vaak +vaat +vae +vaeh +vai +vat +vau +vhüüt +vidiit 
+viiks +vilks +vilksti +vinki-vinki +virdi +virr +viu +viudi +viuh +viuhti +voeh +voh +vohh +volks +volksti +vooh +vops +vopsti +vot +vuh +vuhti +vuih +vulks +vulksti +vull +vulpsti +vups +vupsaki +vupsaku +vupsti +vurdi +vurr +vurra-vurra +vurts +vurtsti +vutt +võe +võeh +või +võih +võrr +võts +võtt +vääks +õe +õits +õk +õkk +õrr +õss +õuh +äh +ähh +ähhähhää +äh-hää +äh-äh-hää +äiu +äiu-ää +äss +ää +ääh +äähh +öh +öhh +ök +üh +eelmine +eikeegi +eimiski +emb-kumb +enam +enim +iga +igasugune +igaüks +ise +isesugune +järgmine +keegi +kes +kumb +kumbki +kõik +meiesugune +meietaoline +midagi +mihuke +mihukene +milletaoline +milline +mina +minake +mingi +mingisugune +minusugune +minutaoline +mis +miski +miskisugune +missugune +misuke +mitmes +mitmesugune +mitu +mitu-mitu +mitu-setu +muu +mõlema +mõnesugune +mõni +mõningane +mõningas +mäherdune +määrane +naasugune +need +nemad +nendesugune +nendetaoline +nihuke +nihukene +niimitu +niisamasugune +niisugune +nisuke +nisukene +oma +omaenese +omasugune +omataoline +pool +praegune +sama +samasugune +samataoline +see +seesama +seesamane +seesamune +seesinane +seesugune +selline +sihuke +sihukene +sina +sinusugune +sinutaoline +siuke +siukene +säherdune +säärane +taoline +teiesugune +teine +teistsugune +tema +temake +temakene +temasugune +temataoline +too +toosama +toosamane +üks +üksteise +hakkama +minema +olema +pidama +saama +tegema +tulema +võima diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_eu.txt b/test/solr/configsets/term_search/conf/lang/stopwords_eu.txt new file mode 100644 index 00000000..25f1db93 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_eu.txt @@ -0,0 +1,99 @@ +# example set of basque stopwords +al +anitz +arabera +asko +baina +bat +batean +batek +bati +batzuei +batzuek +batzuetan +batzuk +bera +beraiek +berau +berauek +bere +berori +beroriek +beste +bezala +da +dago +dira +ditu +du +dute +edo +egin +ere +eta +eurak +ez +gainera +gu +gutxi +guzti +haiei +haiek 
+haietan +hainbeste +hala +han +handik +hango +hara +hari +hark +hartan +hau +hauei +hauek +hauetan +hemen +hemendik +hemengo +hi +hona +honek +honela +honetan +honi +hor +hori +horiei +horiek +horietan +horko +horra +horrek +horrela +horretan +horri +hortik +hura +izan +ni +noiz +nola +non +nondik +nongo +nor +nora +ze +zein +zen +zenbait +zenbat +zer +zergatik +ziren +zituen +zu +zuek +zuen +zuten diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_fa.txt b/test/solr/configsets/term_search/conf/lang/stopwords_fa.txt new file mode 100644 index 00000000..723641c6 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_fa.txt @@ -0,0 +1,313 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Note: by default this file is used after normalization, so when adding entries +# to this file, use the arabic 'ي' instead of 'ی' +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما 
+كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است +كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_fi.txt b/test/solr/configsets/term_search/conf/lang/stopwords_fi.txt new file mode 100644 index 00000000..4372c9a0 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about 
+poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_fr.txt b/test/solr/configsets/term_search/conf/lang/stopwords_fr.txt new file mode 100644 index 00000000..749abae6 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes 
+fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ga.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ga.txt new file mode 100644 index 00000000..9ff88d74 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ga.txt @@ -0,0 +1,110 @@ + +a +ach +ag +agus +an +aon +ar +arna +as +b' +ba +beirt +bhúr +caoga +ceathair +ceathrar +chomh +chtó +chuig +chun +cois +céad +cúig +cúigear +d' +daichead +dar +de +deich +deichniúr +den +dhá +do +don +dtí +dá +dár +dó +faoi +faoin +faoina +faoinár +fara +fiche +gach +gan +go +gur +haon +hocht +i +iad +idir +in +ina +ins +inár +is +le +leis +lena +lenár +m' +mar +mo +mé +na +nach +naoi +naonúr +ná +ní +níor +nó +nócha +ocht +ochtar +os +roimh +sa +seacht +seachtar +seachtó +seasca +seisear +siad +sibh +sinn +sna +sé +sí +tar +thar +thú +triúr +trí +trína +trínár +tríocha +tú +um +ár +é +éis +í +ó +ón +óna +ónár diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_gl.txt b/test/solr/configsets/term_search/conf/lang/stopwords_gl.txt new file mode 100644 index 00000000..d8760b12 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_gl.txt @@ -0,0 +1,161 @@ +# galican stopwords +a +aínda +alí +aquel +aquela +aquelas +aqueles +aquilo +aquí +ao +aos +as +así +á 
+ben +cando +che +co +coa +comigo +con +connosco +contigo +convosco +coas +cos +cun +cuns +cunha +cunhas +da +dalgunha +dalgunhas +dalgún +dalgúns +das +de +del +dela +delas +deles +desde +deste +do +dos +dun +duns +dunha +dunhas +e +el +ela +elas +eles +en +era +eran +esa +esas +ese +eses +esta +estar +estaba +está +están +este +estes +estiven +estou +eu +é +facer +foi +foron +fun +había +hai +iso +isto +la +las +lle +lles +lo +los +mais +me +meu +meus +min +miña +miñas +moi +na +nas +neste +nin +no +non +nos +nosa +nosas +noso +nosos +nós +nun +nunha +nuns +nunhas +o +os +ou +ó +ós +para +pero +pode +pois +pola +polas +polo +polos +por +que +se +senón +ser +seu +seus +sexa +sido +sobre +súa +súas +tamén +tan +te +ten +teñen +teño +ter +teu +teus +ti +tido +tiña +tiven +túa +túas +un +unha +unhas +uns +vos +vosa +vosas +voso +vosos +vós diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_hi.txt b/test/solr/configsets/term_search/conf/lang/stopwords_hi.txt new file mode 100644 index 00000000..86286bb0 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_hi.txt @@ -0,0 +1,235 @@ +# Also see http://www.opensource.org/licenses/bsd-license.html +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# This file was created by Jacques Savoy and is distributed under the BSD license. +# Note: by default this file also contains forms normalized by HindiNormalizer +# for spelling variation (see section below), such that it can be used whether or +# not you enable that feature. When adding additional entries to this list, +# please add the normalized form as well. 
+अंदर +अत +अपना +अपनी +अपने +अभी +आदि +आप +इत्यादि +इन +इनका +इन्हीं +इन्हें +इन्हों +इस +इसका +इसकी +इसके +इसमें +इसी +इसे +उन +उनका +उनकी +उनके +उनको +उन्हीं +उन्हें +उन्हों +उस +उसके +उसी +उसे +एक +एवं +एस +ऐसे +और +कई +कर +करता +करते +करना +करने +करें +कहते +कहा +का +काफ़ी +कि +कितना +किन्हें +किन्हों +किया +किर +किस +किसी +किसे +की +कुछ +कुल +के +को +कोई +कौन +कौनसा +गया +घर +जब +जहाँ +जा +जितना +जिन +जिन्हें +जिन्हों +जिस +जिसे +जीधर +जैसा +जैसे +जो +तक +तब +तरह +तिन +तिन्हें +तिन्हों +तिस +तिसे +तो +था +थी +थे +दबारा +दिया +दुसरा +दूसरे +दो +द्वारा +न +नहीं +ना +निहायत +नीचे +ने +पर +पर +पहले +पूरा +पे +फिर +बनी +बही +बहुत +बाद +बाला +बिलकुल +भी +भीतर +मगर +मानो +मे +में +यदि +यह +यहाँ +यही +या +यिह +ये +रखें +रहा +रहे +ऱ्वासा +लिए +लिये +लेकिन +व +वर्ग +वह +वह +वहाँ +वहीं +वाले +वुह +वे +वग़ैरह +संग +सकता +सकते +सबसे +सभी +साथ +साबुत +साभ +सारा +से +सो +ही +हुआ +हुई +हुए +है +हैं +हो +होता +होती +होते +होना +होने +# additional normalized forms of the above +अपनि +जेसे +होति +सभि +तिंहों +इंहों +दवारा +इसि +किंहें +थि +उंहों +ओर +जिंहें +वहिं +अभि +बनि +हि +उंहिं +उंहें +हें +वगेरह +एसे +रवासा +कोन +निचे +काफि +उसि +पुरा +भितर +हे +बहि +वहां +कोइ +यहां +जिंहों +तिंहें +किसि +कइ +यहि +इंहिं +जिधर +इंहें +अदि +इतयादि +हुइ +कोनसा +इसकि +दुसरे +जहां +अप +किंहों +उनकि +भि +वरग +हुअ +जेसा +नहिं diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_hu.txt b/test/solr/configsets/term_search/conf/lang/stopwords_hu.txt new file mode 100644 index 00000000..37526da8 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_hu.txt @@ -0,0 +1,211 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_hy.txt b/test/solr/configsets/term_search/conf/lang/stopwords_hy.txt new file mode 100644 index 00000000..60c1c50f --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_hy.txt @@ -0,0 +1,46 @@ +# example set of Armenian stopwords. 
+այդ +այլ +այն +այս +դու +դուք +եմ +են +ենք +ես +եք +է +էի +էին +էինք +էիր +էիք +էր +ըստ +թ +ի +ին +իսկ +իր +կամ +համար +հետ +հետո +մենք +մեջ +մի +ն +նա +նաև +նրա +նրանք +որ +որը +որոնք +որպես +ու +ում +պիտի +վրա +և diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_id.txt b/test/solr/configsets/term_search/conf/lang/stopwords_id.txt new file mode 100644 index 00000000..4617f83a --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_id.txt @@ -0,0 +1,359 @@ +# from appendix D of: A Study of Stemming Effects on Information +# Retrieval in Bahasa Indonesia +ada +adanya +adalah +adapun +agak +agaknya +agar +akan +akankah +akhirnya +aku +akulah +amat +amatlah +anda +andalah +antar +diantaranya +antara +antaranya +diantara +apa +apaan +mengapa +apabila +apakah +apalagi +apatah +atau +ataukah +ataupun +bagai +bagaikan +sebagai +sebagainya +bagaimana +bagaimanapun +sebagaimana +bagaimanakah +bagi +bahkan +bahwa +bahwasanya +sebaliknya +banyak +sebanyak +beberapa +seberapa +begini +beginian +beginikah +beginilah +sebegini +begitu +begitukah +begitulah +begitupun +sebegitu +belum +belumlah +sebelum +sebelumnya +sebenarnya +berapa +berapakah +berapalah +berapapun +betulkah +sebetulnya +biasa +biasanya +bila +bilakah +bisa +bisakah +sebisanya +boleh +bolehkah +bolehlah +buat +bukan +bukankah +bukanlah +bukannya +cuma +percuma +dahulu +dalam +dan +dapat +dari +daripada +dekat +demi +demikian +demikianlah +sedemikian +dengan +depan +di +dia +dialah +dini +diri +dirinya +terdiri +dong +dulu +enggak +enggaknya +entah +entahlah +terhadap +terhadapnya +hal +hampir +hanya +hanyalah +harus +haruslah +harusnya +seharusnya +hendak +hendaklah +hendaknya +hingga +sehingga +ia +ialah +ibarat +ingin +inginkah +inginkan +ini +inikah +inilah +itu +itukah +itulah +jangan +jangankan +janganlah +jika +jikalau +juga +justru +kala +kalau +kalaulah +kalaupun +kalian +kami +kamilah +kamu +kamulah +kan +kapan +kapankah +kapanpun +dikarenakan +karena +karenanya +ke 
+kecil +kemudian +kenapa +kepada +kepadanya +ketika +seketika +khususnya +kini +kinilah +kiranya +sekiranya +kita +kitalah +kok +lagi +lagian +selagi +lah +lain +lainnya +melainkan +selaku +lalu +melalui +terlalu +lama +lamanya +selama +selama +selamanya +lebih +terlebih +bermacam +macam +semacam +maka +makanya +makin +malah +malahan +mampu +mampukah +mana +manakala +manalagi +masih +masihkah +semasih +masing +mau +maupun +semaunya +memang +mereka +merekalah +meski +meskipun +semula +mungkin +mungkinkah +nah +namun +nanti +nantinya +nyaris +oleh +olehnya +seorang +seseorang +pada +padanya +padahal +paling +sepanjang +pantas +sepantasnya +sepantasnyalah +para +pasti +pastilah +per +pernah +pula +pun +merupakan +rupanya +serupa +saat +saatnya +sesaat +saja +sajalah +saling +bersama +sama +sesama +sambil +sampai +sana +sangat +sangatlah +saya +sayalah +se +sebab +sebabnya +sebuah +tersebut +tersebutlah +sedang +sedangkan +sedikit +sedikitnya +segala +segalanya +segera +sesegera +sejak +sejenak +sekali +sekalian +sekalipun +sesekali +sekaligus +sekarang +sekarang +sekitar +sekitarnya +sela +selain +selalu +seluruh +seluruhnya +semakin +sementara +sempat +semua +semuanya +sendiri +sendirinya +seolah +seperti +sepertinya +sering +seringnya +serta +siapa +siapakah +siapapun +disini +disinilah +sini +sinilah +sesuatu +sesuatunya +suatu +sesudah +sesudahnya +sudah +sudahkah +sudahlah +supaya +tadi +tadinya +tak +tanpa +setelah +telah +tentang +tentu +tentulah +tentunya +tertentu +seterusnya +tapi +tetapi +setiap +tiap +setidaknya +tidak +tidakkah +tidaklah +toh +waduh +wah +wahai +sewaktu +walau +walaupun +wong +yaitu +yakni +yang diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_it.txt b/test/solr/configsets/term_search/conf/lang/stopwords_it.txt new file mode 100644 index 00000000..1219cc77 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_it.txt @@ -0,0 +1,303 @@ + | From 
svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that 
+quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ja.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ja.txt new file mode 100644 index 00000000..d4321be6 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ja.txt @@ -0,0 
+1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. 
+# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_lv.txt b/test/solr/configsets/term_search/conf/lang/stopwords_lv.txt new file mode 100644 index 00000000..e21a23c0 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_lv.txt @@ -0,0 +1,172 @@ +# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins +# the original list of over 800 forms was refined: +# pronouns, adverbs, interjections were removed +# +# prepositions +aiz +ap +ar +apakš +ārpus +augšpus +bez +caur +dēļ +gar +iekš +iz +kopš +labad +lejpus +līdz +no +otrpus +pa +par +pār +pēc +pie +pirms +pret +priekš +starp +šaipus +uz +viņpus +virs +virspus +zem +apakšpus +# Conjunctions +un +bet +jo +ja +ka +lai +tomēr +tikko +turpretī +arī +kaut +gan +tādēļ +tā +ne +tikvien +vien +kā +ir +te +vai +kamēr +# Particles +ar +diezin +droši +diemžēl +nebūt +ik +it +taču +nu +pat +tiklab +iekšpus +nedz +tik +nevis +turpretim +jeb +iekam +iekām +iekāms +kolīdz +līdzko +tiklīdz +jebšu +tālab +tāpēc +nekā +itin +jā +jau +jel +nē +nezin +tad +tikai +vis +tak +iekams +vien +# modal verbs +būt +biju +biji +bija +bijām +bijāt +esmu +esi +esam +esat +būšu +būsi +būs +būsim +būsiet +tikt +tiku +tiki +tika +tikām +tikāt +tieku +tiec +tiek +tiekam +tiekat +tikšu +tiks +tiksim +tiksiet +tapt +tapi +tapāt +topat +tapšu +tapsi +taps +tapsim +tapsiet +kļūt +kļuvu +kļuvi +kļuva +kļuvām +kļuvāt +kļūstu +kļūsti +kļūst +kļūstam +kļūstat +kļūšu +kļūsi +kļūs +kļūsim +kļūsiet +# verbs +varēt +varēju +varējām +varēšu 
+varēsim +var +varēji +varējāt +varēsi +varēsiet +varat +varēja +varēs diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_nl.txt b/test/solr/configsets/term_search/conf/lang/stopwords_nl.txt new file mode 100644 index 00000000..47a2aeac --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. 
of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. 
of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_no.txt b/test/solr/configsets/term_search/conf/lang/stopwords_no.txt new file mode 100644 index 00000000..a7a2c28b --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) 
+en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_pt.txt b/test/solr/configsets/term_search/conf/lang/stopwords_pt.txt new file mode 100644 index 00000000..acfeb01a --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. 
+ +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. 
+ +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ro.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ro.txt new file mode 100644 index 00000000..4fdee90a --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ro.txt @@ -0,0 +1,233 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_ru.txt b/test/solr/configsets/term_search/conf/lang/stopwords_ru.txt new file mode 100644 index 00000000..55271400 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. 
suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. 
form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_sv.txt b/test/solr/configsets/term_search/conf/lang/stopwords_sv.txt new file mode 100644 index 00000000..096f87f6 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_sv.txt @@ -0,0 +1,133 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. 
+ | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our 
+blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_th.txt b/test/solr/configsets/term_search/conf/lang/stopwords_th.txt new file mode 100644 index 00000000..07f0fabe --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_th.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +ไว้ +ไม่ +ไป +ได้ +ให้ +ใน +โดย +แห่ง +แล้ว +และ +แรก +แบบ +แต่ +เอง +เห็น +เลย +เริ่ม +เรา +เมื่อ +เพื่อ +เพราะ +เป็นการ +เป็น +เปิดเผย +เปิด +เนื่องจาก +เดียวกัน +เดียว +เช่น +เฉพาะ +เคย +เข้า +เขา +อีก +อาจ +อะไร +ออก +อย่าง +อยู่ +อยาก +หาก +หลาย +หลังจาก +หลัง +หรือ +หนึ่ง +ส่วน +ส่ง +สุด +สําหรับ +ว่า +วัน +ลง +ร่วม +ราย +รับ +ระหว่าง +รวม +ยัง +มี +มาก +มา +พร้อม +พบ +ผ่าน +ผล +บาง +น่า +นี้ +นํา +นั้น +นัก +นอกจาก +ทุก +ที่สุด +ที่ +ทําให้ +ทํา +ทาง +ทั้งนี้ +ทั้ง +ถ้า +ถูก +ถึง +ต้อง +ต่างๆ +ต่าง +ต่อ +ตาม +ตั้งแต่ +ตั้ง +ด้าน +ด้วย +ดัง +ซึ่ง +ช่วง +จึง +จาก +จัด +จะ +คือ +ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว diff --git a/test/solr/configsets/term_search/conf/lang/stopwords_tr.txt b/test/solr/configsets/term_search/conf/lang/stopwords_tr.txt new file mode 100644 index 00000000..84d9408d --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/stopwords_tr.txt @@ -0,0 +1,212 @@ +# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile 
+bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de +defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten diff --git a/test/solr/configsets/term_search/conf/lang/userdict_ja.txt b/test/solr/configsets/term_search/conf/lang/userdict_ja.txt new file mode 100644 index 00000000..6f0368e4 --- /dev/null +++ b/test/solr/configsets/term_search/conf/lang/userdict_ja.txt @@ -0,0 +1,29 @@ +# +# This is a sample user dictionary for Kuromoji (JapaneseTokenizer) +# +# Add entries to this file in order to override the statistical model in terms +# of segmentation, readings and part-of-speech tags. Notice that entries do +# not have weights since they are always used when found. This is by-design +# in order to maximize ease-of-use. 
+# +# Entries are defined using the following CSV format: +# , ... , ... , +# +# Notice that a single half-width space separates tokens and readings, and +# that the number tokens and readings must match exactly. +# +# Also notice that multiple entries with the same is undefined. +# +# Whitespace only lines are ignored. Comments are not allowed on entry lines. +# + +# Custom segmentation for kanji compounds +日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞 +関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞 + +# Custom segmentation for compound katakana +トートバッグ,トート バッグ,トート バッグ,かずカナ名詞 +ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞 + +# Custom reading for former sumo wrestler +朝青龍,朝青龍,アサショウリュウ,カスタム人名 diff --git a/test/solr/configsets/term_search/conf/managed-schema b/test/solr/configsets/term_search/conf/managed-schema new file mode 100644 index 00000000..e99e27e9 --- /dev/null +++ b/test/solr/configsets/term_search/conf/managed-schema @@ -0,0 +1,1031 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/solr/configsets/term_search/conf/mapping-ISOLatin1Accent.txt b/test/solr/configsets/term_search/conf/mapping-ISOLatin1Accent.txt new file mode 100644 index 00000000..ede77425 --- /dev/null +++ b/test/solr/configsets/term_search/conf/mapping-ISOLatin1Accent.txt @@ -0,0 +1,246 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) 
+ +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# À => A +"\u00C0" => "A" + +# Á => A +"\u00C1" => "A" + +#  => A +"\u00C2" => "A" + +# à => A +"\u00C3" => "A" + +# Ä => A +"\u00C4" => "A" + +# Å => A +"\u00C5" => "A" + +# Æ => AE +"\u00C6" => "AE" + +# Ç => C +"\u00C7" => "C" + +# È => E +"\u00C8" => "E" + +# É => E +"\u00C9" => "E" + +# Ê => E +"\u00CA" => "E" + +# Ë => E +"\u00CB" => "E" + +# Ì => I +"\u00CC" => "I" + +# Í => I +"\u00CD" => "I" + +# Î => I +"\u00CE" => "I" + +# Ï => I +"\u00CF" => "I" + +# IJ => IJ +"\u0132" => "IJ" + +# Ð => D +"\u00D0" => "D" + +# Ñ => N +"\u00D1" => "N" + +# Ò => O +"\u00D2" => "O" + +# Ó => O +"\u00D3" => "O" + +# Ô => O +"\u00D4" => "O" + +# Õ => O +"\u00D5" => "O" + +# Ö => O +"\u00D6" => "O" + +# Ø => O +"\u00D8" => "O" + +# Œ => OE +"\u0152" => "OE" + +# Þ +"\u00DE" => "TH" + +# Ù => U +"\u00D9" => "U" + +# Ú => U +"\u00DA" => "U" + +# Û => U +"\u00DB" => "U" + +# Ü => U +"\u00DC" => "U" + +# Ý => Y +"\u00DD" => "Y" + +# Ÿ => Y +"\u0178" => "Y" + +# à => a +"\u00E0" => "a" + +# á => a +"\u00E1" => "a" + +# â => a +"\u00E2" => "a" + +# ã => a +"\u00E3" => "a" + +# ä => a +"\u00E4" => "a" + +# å => a +"\u00E5" => "a" + +# æ => ae +"\u00E6" => "ae" + +# ç => c +"\u00E7" => "c" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e" + +# ê => e +"\u00EA" => "e" + +# ë => e +"\u00EB" => "e" + +# ì => i +"\u00EC" => "i" + +# í => i +"\u00ED" => "i" + +# î => i +"\u00EE" => "i" + +# ï => i +"\u00EF" => "i" + +# ij => ij +"\u0133" => "ij" + +# ð => d +"\u00F0" => "d" + +# ñ => n +"\u00F1" => "n" + +# ò => o +"\u00F2" => "o" + +# ó => o +"\u00F3" => "o" + +# ô => o +"\u00F4" => "o" + +# õ => o +"\u00F5" => "o" + +# ö => o +"\u00F6" => "o" + +# ø => o +"\u00F8" => "o" + +# œ => oe +"\u0153" => "oe" + +# ß => ss +"\u00DF" => "ss" + +# þ => th +"\u00FE" => "th" + +# ù => u +"\u00F9" => "u" + +# ú => u +"\u00FA" => "u" + +# û => u +"\u00FB" => "u" + +# ü => u 
+"\u00FC" => "u" + +# ý => y +"\u00FD" => "y" + +# ÿ => y +"\u00FF" => "y" + +# ff => ff +"\uFB00" => "ff" + +# fi => fi +"\uFB01" => "fi" + +# fl => fl +"\uFB02" => "fl" + +# ffi => ffi +"\uFB03" => "ffi" + +# ffl => ffl +"\uFB04" => "ffl" + +# ſt => ft +"\uFB05" => "ft" + +# st => st +"\uFB06" => "st" diff --git a/test/solr/configsets/term_search/conf/protwords.txt b/test/solr/configsets/term_search/conf/protwords.txt new file mode 100644 index 00000000..1dfc0abe --- /dev/null +++ b/test/solr/configsets/term_search/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. 
+dontstems +zwhacky + diff --git a/test/solr/configsets/term_search/conf/schema.xml b/test/solr/configsets/term_search/conf/schema.xml new file mode 100644 index 00000000..fa95e127 --- /dev/null +++ b/test/solr/configsets/term_search/conf/schema.xml @@ -0,0 +1,1222 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + diff --git a/test/solr/configsets/term_search/conf/solrconfig.xml b/test/solr/configsets/term_search/conf/solrconfig.xml new file mode 100644 index 00000000..771a0f32 --- /dev/null +++ b/test/solr/configsets/term_search/conf/solrconfig.xml @@ -0,0 +1,1299 @@ + + + + + + + + + 8.8.2 + + + + + + + + + + + ${solr.data.dir:} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.lock.type:native} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ${solr.ulog.dir:} + ${solr.ulog.numVersionBuckets:65536} + + + + + ${solr.autoCommit.maxTime:15000} + false + + + + + + ${solr.autoSoftCommit.maxTime:-1} + + + + + + + + + + + + + + ${solr.max.booleanClauses:500000} + + + + + + + + + + + + + + + + + + + + + + + + true + + + + + + 20 + + + 200 + + + + + + + + + + + + + + + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + explicit + 10 + + + + + + + + + + + + + + + + explicit + json + true + + + + + + _text_ + + + + + + + + + text_general + + + + + + default + _text_ + solr.DirectSolrSpellChecker + + internal + + 0.5 + + 2 + + 1 + + 5 + + 4 + + 0.01 + + + + + + + + + + + + default + on + true + 10 + 5 + 5 + true + true + 10 + 5 + + + spellcheck + + + + + + + + + + true + false + + + terms + + + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + + + + + + ,, + ,, + ,, + ,, + ,]]> + ]]> + + + + + + 10 + .,!? + + + + + + + WORD + + + en + US + + + + + + + + + + + + [^\w-\.] 
+ _ + + + + + + + yyyy-MM-dd['T'[HH:mm[:ss[.SSS]][z + yyyy-MM-dd['T'[HH:mm[:ss[,SSS]][z + yyyy-MM-dd HH:mm[:ss[.SSS]][z + yyyy-MM-dd HH:mm[:ss[,SSS]][z + [EEE, ]dd MMM yyyy HH:mm[:ss] z + EEEE, dd-MMM-yy HH:mm:ss z + EEE MMM ppd HH:mm:ss [z ]yyyy + + + + + java.lang.String + text_general + + *_str + 256 + + + true + + + java.lang.Boolean + booleans + + + java.util.Date + pdates + + + java.lang.Long + java.lang.Integer + plongs + + + java.lang.Number + pdoubles + + + + + + + + + + + + + + + + + + + + text/plain; charset=UTF-8 + + + + + + + + + + + + + + diff --git a/test/solr/configsets/term_search/conf/stopwords.txt b/test/solr/configsets/term_search/conf/stopwords.txt new file mode 100644 index 00000000..ae1e83ee --- /dev/null +++ b/test/solr/configsets/term_search/conf/stopwords.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/test/solr/configsets/term_search/conf/synonyms.txt b/test/solr/configsets/term_search/conf/synonyms.txt new file mode 100644 index 00000000..eab4ee87 --- /dev/null +++ b/test/solr/configsets/term_search/conf/synonyms.txt @@ -0,0 +1,29 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaafoo => aaabar +bbbfoo => bbbfoo bbbbar +cccfoo => cccbar cccbaz +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming +#after us won't split it into two words. 
+ +# Synonym mappings can be used for spelling correction too +pixima => pixma + diff --git a/test/solr/docker-compose.yml b/test/solr/docker-compose.yml new file mode 100644 index 00000000..3ddae69c --- /dev/null +++ b/test/solr/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.8' + +services: + op_solr: + image: solr:8.8 + volumes: + - ./solr_configsets:/configsets:ro + ports: + - "8983:8983" + command: > + bash -c "precreate-core term_search_core1 /configsets/term_search + && precreate-core prop_search_core1 /configsets/property_search + && solr-foreground" diff --git a/test/solr/generate_ncbo_configsets.sh b/test/solr/generate_ncbo_configsets.sh new file mode 100755 index 00000000..7b4281f7 --- /dev/null +++ b/test/solr/generate_ncbo_configsets.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# generates solr configsets by merging _default configset with config files in config/solr +# _default is copied from sorl distribuion solr-8.10.1/server/solr/configsets/_default/ + +#cd solr/configsets +ld_config='config/solr' +configsets='test/solr/configsets' +[ -d ${configsets}/property_search ] && rm -Rf ${configsets}/property_search +[ -d ${configsets}/term_search ] && rm -Rf ${configsets}/term_search +if [[ ! -d ${ld_config}/property_search ]]; then + echo 'cant find ld solr config sets' + exit 1 +fi +if [[ ! 
-d ${configsets}/_default/conf ]]; then + echo 'cant find default solr configset' + exit 1 +fi +mkdir -p ${configsets}/property_search/conf +mkdir -p ${configsets}/term_search/conf +cp -a ${configsets}/_default/conf/* ${configsets}/property_search/conf/ +cp -a ${configsets}/_default/conf/* ${configsets}/term_search/conf/ +cp -a $ld_config/property_search/* ${configsets}/property_search/conf +cp -a $ld_config/term_search/* ${configsets}/term_search/conf + From 88e8399f990adc240d37a9e4196a5207720fe011 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 1 Nov 2023 11:08:58 -0700 Subject: [PATCH 47/62] fixed ncbo_ontology_archive_old_submissions error output --- Gemfile.lock | 4 ++-- bin/ncbo_ontology_archive_old_submissions | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 92164456..825ea3a5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: 911d71aefe433314d11398445e3856fca503b9c1 + revision: 6db93bb3d5095a5fe0d017e572c5a04caa34ebc6 branch: develop specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 3ae6bfb56dc59a670b5bc1a513ff4929f8cf3756 + revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 branch: develop specs: ncbo_annotator (0.0.1) diff --git a/bin/ncbo_ontology_archive_old_submissions b/bin/ncbo_ontology_archive_old_submissions index 535c129e..1b2268a5 100755 --- a/bin/ncbo_ontology_archive_old_submissions +++ b/bin/ncbo_ontology_archive_old_submissions @@ -119,13 +119,17 @@ onts.each do |ont| end end -msg = JSON.pretty_generate(bad_submissions) puts -puts msg -logger.error(msg) - -msg = "Number of errored submissions: #{bad_submissions.length}" -puts msg -logger.error(msg) - +if bad_submissions.empty? 
+ msg = "No errored submissions found" + puts msg + logger.info(msg) +else + msg = JSON.pretty_generate(bad_submissions) + puts msg + logger.error(msg) + msg = "Number of errored submissions: #{bad_submissions.length}" + puts msg + logger.error(msg) +end \ No newline at end of file From 33cc9b491e0d1d36064624d028580cb615e34496 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 1 Nov 2023 20:14:26 -0700 Subject: [PATCH 48/62] Gemfile.lock update --- Gemfile.lock | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 1927ea82..13bf6b72 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -98,7 +98,7 @@ GEM faraday-net_http (3.0.2) faraday-retry (2.2.0) faraday (~> 2.0) - ffi (1.15.5) + ffi (1.16.3) gapic-common (0.20.0) faraday (>= 1.9, < 3.a) faraday-retry (>= 1.0, < 3.a) @@ -119,29 +119,29 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.24.3) - google-protobuf (3.24.3-x86_64-darwin) - google-protobuf (3.24.3-x86_64-linux) + google-protobuf (3.25.0) + google-protobuf (3.25.0-x86_64-darwin) + google-protobuf (3.25.0-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.8.0) + googleapis-common-protos-types (1.9.0) google-protobuf (~> 3.18) - googleauth (1.8.0) + googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.58.0) - google-protobuf (~> 3.23) + grpc (1.59.2) + google-protobuf (~> 3.24) googleapis-common-protos-types (~> 1.0) - grpc (1.58.0-x86_64-darwin) - google-protobuf (~> 3.23) + grpc (1.59.2-x86_64-darwin) + google-protobuf (~> 3.24) googleapis-common-protos-types (~> 1.0) - grpc (1.58.0-x86_64-linux) - google-protobuf (~> 3.23) + grpc (1.59.2-x86_64-linux) + google-protobuf (~> 3.24) googleapis-common-protos-types (~> 1.0) 
htmlentities (4.3.4) http-accept (1.7.0) @@ -163,7 +163,7 @@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.0808) + mime-types-data (3.2023.1003) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -184,12 +184,12 @@ GEM rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) - rake (13.0.6) + rake (13.1.0) rdf (1.0.8) addressable (>= 2.2) - redis (5.0.7) - redis-client (>= 0.9.0) - redis-client (0.17.0) + redis (5.0.8) + redis-client (>= 0.17.0) + redis-client (0.18.0) connection_pool rest-client (2.1.0) http-accept (>= 1.7.0, < 2.0) @@ -264,4 +264,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.4.17 + 2.3.15 From 7429289a0cf9d48c43191ed3bcaa9ce82a20f6d4 Mon Sep 17 00:00:00 2001 From: mdorf Date: Wed, 1 Nov 2023 20:16:56 -0700 Subject: [PATCH 49/62] Gemfile.lock update --- Gemfile.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index e7a782db..9e47a23c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 + revision: ff10e5ff4103431da1aec3cbbaebc57547c0035c branch: develop specs: ontologies_linked_data (0.0.1) @@ -119,9 +119,9 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.24.4) - google-protobuf (3.24.4-x86_64-darwin) - google-protobuf (3.24.4-x86_64-linux) + google-protobuf (3.25.0) + google-protobuf (3.25.0-x86_64-darwin) + google-protobuf (3.25.0-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) From bb93561f78522cf6b289afc81b3bf86cdbbb8cfc Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 7 Nov 2023 21:31:46 -0800 Subject: [PATCH 50/62] Gemfile update --- Gemfile.lock | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 
e7a782db..fab55057 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: e33a0e451f8a8226d98291168e45b46d7065e670 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -76,7 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - base64 (0.1.1) + base64 (0.2.0) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -119,14 +119,14 @@ GEM google-cloud-env (1.6.0) faraday (>= 0.17.3, < 3.0) google-cloud-errors (1.3.1) - google-protobuf (3.24.4) - google-protobuf (3.24.4-x86_64-darwin) - google-protobuf (3.24.4-x86_64-linux) + google-protobuf (3.25.0) + google-protobuf (3.25.0-x86_64-darwin) + google-protobuf (3.25.0-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.10.0) google-protobuf (~> 3.18) googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) @@ -154,8 +154,8 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.1) - logger (1.5.3) + libxml-ruby (4.1.2) + logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) From a20827249fe225af6f18e9efea5e1097ab28d86b Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 7 Nov 2023 21:46:34 -0800 Subject: [PATCH 51/62] Gemfile update --- Gemfile.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 13bf6b72..617eccb3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/ncbo/goo.git - revision: daea7822af9e5ca1961d6873a758735133a1b2db + revision: 657149d6b33813253fa7440252f69c04e0631190 branch: master specs: goo (0.0.2) @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 04226ac5840a328e6f906f15c769ee6ee5723102 + revision: 
4f4361e2c181143bba3876326ecda407a587207e branch: master specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: a7ad210e846a390f203457be2459719214d142fe + revision: 7783784f9d2ceada9be706cf6c084d272ae653e8 branch: master specs: ontologies_linked_data (0.0.1) @@ -76,7 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - base64 (0.1.1) + base64 (0.2.0) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -126,7 +126,7 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.10.0) google-protobuf (~> 3.18) googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) @@ -154,8 +154,8 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.1) - logger (1.5.3) + libxml-ruby (4.1.2) + logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) From e8fa020e75bda0ea228089523e26e065d228eb3b Mon Sep 17 00:00:00 2001 From: mdorf Date: Tue, 14 Nov 2023 14:13:06 -0800 Subject: [PATCH 52/62] fixes to the analytics script and a new script to generate UA analytics for documentation --- Gemfile | 6 ++ Gemfile.lock | 35 ++++++-- bin/generate_ua_analytics_file.rb | 126 ++++++++++++++++++++++++++++ lib/ncbo_cron/ontology_analytics.rb | 49 ++++++----- 4 files changed, 190 insertions(+), 26 deletions(-) create mode 100755 bin/generate_ua_analytics_file.rb diff --git a/Gemfile b/Gemfile index a2c93e43..ea60eb54 100644 --- a/Gemfile +++ b/Gemfile @@ -3,6 +3,12 @@ source 'https://rubygems.org' gemspec gem 'ffi' + +# This is needed temporarily to pull the Google Universal Analytics (UA) +# data and store it in a file. 
See (bin/generate_ua_analytics_file.rb) +# The ability to pull this data from Google will cease on July 1, 2024 +gem "google-apis-analytics_v3" + gem 'google-analytics-data' gem 'mail', '2.6.6' gem 'multi_json' diff --git a/Gemfile.lock b/Gemfile.lock index 9e47a23c..de996c17 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 + revision: ebbb7a3c28ecde49c261290bec34ab082490a271 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: ff10e5ff4103431da1aec3cbbaebc57547c0035c + revision: 5600020a8017cb4901e719f577032b0be6a14949 branch: develop specs: ontologies_linked_data (0.0.1) @@ -76,7 +76,7 @@ GEM multi_json (~> 1.0) addressable (2.8.5) public_suffix (>= 2.0.2, < 6.0) - base64 (0.1.1) + base64 (0.2.0) bcrypt (3.1.19) builder (3.2.4) coderay (1.1.3) @@ -84,6 +84,7 @@ GEM connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) + declarative (0.0.20) docile (1.4.0) domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) @@ -113,6 +114,17 @@ GEM google-analytics-data-v1beta (0.9.0) gapic-common (>= 0.20.0, < 2.a) google-cloud-errors (~> 1.0) + google-apis-analytics_v3 (0.13.0) + google-apis-core (>= 0.11.0, < 2.a) + google-apis-core (0.11.2) + addressable (~> 2.5, >= 2.5.1) + googleauth (>= 0.16.2, < 2.a) + httpclient (>= 2.8.1, < 3.a) + mini_mime (~> 1.0) + representable (~> 3.0) + retriable (>= 2.0, < 4.a) + rexml + webrick google-cloud-core (1.6.0) google-cloud-env (~> 1.0) google-cloud-errors (~> 1.0) @@ -126,7 +138,7 @@ GEM google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.10.0) google-protobuf (~> 3.18) googleauth (1.8.1) faraday (>= 0.17.3, < 3.a) @@ -147,6 +159,7 @@ GEM http-accept (1.7.0) http-cookie (1.0.5) domain_name (~> 0.5) + httpclient (2.8.3) 
i18n (0.9.5) concurrent-ruby (~> 1.0) json (2.6.3) @@ -154,8 +167,8 @@ GEM jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) - libxml-ruby (4.1.1) - logger (1.5.3) + libxml-ruby (4.1.2) + logger (1.6.0) macaddr (1.7.2) systemu (~> 2.6.5) mail (2.6.6) @@ -164,6 +177,7 @@ GEM mime-types (3.5.1) mime-types-data (~> 3.2015) mime-types-data (3.2023.1003) + mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -191,11 +205,16 @@ GEM redis-client (>= 0.17.0) redis-client (0.18.0) connection_pool + representable (3.2.0) + declarative (< 0.1.0) + trailblazer-option (>= 0.1.1, < 0.2.0) + uber (< 0.2.0) rest-client (2.1.0) http-accept (>= 1.7.0, < 2.0) http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) netrc (~> 0.8) + retriable (3.1.2) rexml (3.2.6) rsolr (2.5.0) builder (>= 2.1.2) @@ -224,13 +243,16 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) + trailblazer-option (0.1.2) tzinfo (2.0.6) concurrent-ruby (~> 1.0) + uber (0.1.0) unf (0.1.4) unf_ext unf_ext (0.0.8.2) uuid (2.3.9) macaddr (~> 1.0) + webrick (1.8.1) PLATFORMS ruby @@ -244,6 +266,7 @@ DEPENDENCIES ffi goo! 
google-analytics-data + google-apis-analytics_v3 mail (= 2.6.6) minitest (< 5.0) multi_json diff --git a/bin/generate_ua_analytics_file.rb b/bin/generate_ua_analytics_file.rb new file mode 100755 index 00000000..0a432a92 --- /dev/null +++ b/bin/generate_ua_analytics_file.rb @@ -0,0 +1,126 @@ +require 'logger' +require 'google/apis/analytics_v3' +require 'google/api_client/auth/key_utils' + +module NcboCron + module Models + + class OntologyAnalyticsUA + + def initialize(logger) + @logger = logger + end + + def run + redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) + ontology_analytics = fetch_ontology_analytics + File.open(NcboCron.settings.analytics_path_to_ua_data_file, 'w') do |f| + f.write(ontology_analytics.to_json) + end + end + + def fetch_ontology_analytics + google_client = authenticate_google + aggregated_results = Hash.new + start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013 + ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} + # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"] + filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? 
"" : ";#{NcboCron.settings.analytics_filter_str}" + + ont_acronyms.each do |acronym| + max_results = 10000 + num_results = 10000 + start_index = 1 + results = nil + + loop do + results = google_client.get_ga_data( + ids = NcboCron.settings.analytics_profile_id, + start_date = NcboCron.settings.analytics_start_date, + end_date = Date.today.to_s, + metrics = 'ga:pageviews', + { + dimensions: 'ga:pagePath,ga:year,ga:month', + filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}", + start_index: start_index, + max_results: max_results + } + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + + results.rows.each do |row| + if aggregated_results.has_key?(acronym) + # year + if aggregated_results[acronym].has_key?(row[1].to_i) + # month + if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i) + aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i + else + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + else + aggregated_results[acronym][row[1].to_i] = Hash.new + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + else + aggregated_results[acronym] = Hash.new + aggregated_results[acronym][row[1].to_i] = Hash.new + aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i + end + end + + if num_results < max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? + aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } } + break + end + end + end + + @logger.info "Completed Universal Analytics pull..." 
+ @logger.flush + + aggregated_results + end + + def authenticate_google + Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name + Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version + # enable google api call retries in order to + # minigate analytics processing failure due to occasional google api timeouts and other outages + Google::Apis::RequestOptions.default.retries = 5 + # uncoment to enable logging for debugging purposes + # Google::Apis.logger.level = Logger::DEBUG + # Google::Apis.logger = @logger + client = Google::Apis::AnalyticsV3::AnalyticsService.new + key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_ua_key_file, 'notasecret') + client.authorization = Signet::OAuth2::Client.new( + :token_credential_uri => 'https://accounts.google.com/o/oauth2/token', + :audience => 'https://accounts.google.com/o/oauth2/token', + :scope => 'https://www.googleapis.com/auth/analytics.readonly', + :issuer => NcboCron.settings.analytics_service_account_email_address, + :signing_key => key + ).tap { |auth| auth.fetch_access_token! 
} + client + end + end + end +end + +require 'ontologies_linked_data' +require 'goo' +require 'ncbo_annotator' +require 'ncbo_cron/config' +require_relative '../config/config' +ontology_analytics_log_path = File.join("logs", "ontology-analytics-ua.log") +ontology_analytics_logger = Logger.new(ontology_analytics_log_path) +NcboCron::Models::OntologyAnalyticsUA.new(ontology_analytics_logger).run diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb index 3a91b813..c5a4de00 100644 --- a/lib/ncbo_cron/ontology_analytics.rb +++ b/lib/ncbo_cron/ontology_analytics.rb @@ -38,7 +38,6 @@ def fetch_ontology_analytics @logger.flush ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} # ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"] - @logger.info "Authenticating with the Google Analytics Endpoint..." @logger.flush google_client = authenticate_google @@ -137,39 +136,49 @@ def fetch_ontology_analytics break if num_results < max_results end # loop end # ont_acronyms - @logger.info "Refresh complete, merging GA4 and UA data..." - @logger.flush - full_data = merge_ga4_ua_data(aggregated_results) - @logger.info "Merged" + @logger.info "Refresh complete" @logger.flush + full_data = merge_and_fill_missing_data(aggregated_results) end # Benchmark.realtime @logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes." 
@logger.flush full_data end - def merge_ga4_ua_data(ga4_data) - ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) - ua_data = JSON.parse(ua_data_file) - ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s - ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s - - # add up hits for June of 2023 (the only intersecting month between UA and GA4) - ua_data.each do |acronym, _| - if ga4_data.has_key?(acronym) - if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) - ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += - ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] - # delete data for June of 2023 from ga4_data to avoid overwriting when merging - ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) + def merge_and_fill_missing_data(ga4_data) + ua_data = {} + + if File.exists?(NcboCron.settings.analytics_path_to_ua_data_file) && + !File.zero?(NcboCron.settings.analytics_path_to_ua_data_file) + @logger.info "Merging GA4 and UA data..." 
+ @logger.flush + ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) + ua_data = JSON.parse(ua_data_file) + ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s + ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s + + # add up hits for June of 2023 (the only intersecting month between UA and GA4) + ua_data.each do |acronym, _| + if ga4_data.has_key?(acronym) + if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) + ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += + ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] + # delete data for June of 2023 from ga4_data to avoid overwriting when merging + ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) + end end end end + # merge ua and ga4 data merged_data = ua_data.deep_merge(ga4_data) # fill missing years and months + @logger.info "Filling in missing years data..." + @logger.flush fill_missing_data(merged_data) # sort acronyms, years and months + @logger.info "Sorting final data..." 
+ @logger.flush sort_ga_data(merged_data) end @@ -221,4 +230,4 @@ def deep_merge(second) # # ontology_analytics_logger = Logger.new(ontology_analytics_log_path) # ontology_analytics_logger = Logger.new(STDOUT) # NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run -# # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' +# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' From c2a72dbc223cd003c0cbc96f3fb2d910b7b0f57a Mon Sep 17 00:00:00 2001 From: mdorf Date: Sun, 10 Dec 2023 09:23:35 -0800 Subject: [PATCH 53/62] Gemfile.lock update --- Gemfile.lock | 72 ++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 74482dd3..8ca29047 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: ebbb7a3c28ecde49c261290bec34ab082490a271 + revision: 067104ae94c0e9d058cfbf419364fbf03f34de43 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 5600020a8017cb4901e719f577032b0be6a14949 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -71,13 +71,17 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (3.2.22.5) - i18n (~> 0.6, >= 0.6.4) - multi_json (~> 1.0) - addressable (2.8.5) + activesupport (4.0.13) + i18n (~> 0.6, >= 0.6.9) + minitest (~> 4.2) + multi_json (~> 1.3) + thread_safe (~> 0.1) + tzinfo (~> 0.3.37) + addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) - 
bcrypt (3.1.19) + bcrypt (3.1.20) + bigdecimal (3.1.4) builder (3.2.4) coderay (1.1.3) concurrent-ruby (1.2.2) @@ -91,7 +95,7 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.11) + faraday (2.7.12) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -124,35 +128,36 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - google-cloud-core (1.6.0) - google-cloud-env (~> 1.0) + google-cloud-core (1.6.1) + google-cloud-env (>= 1.0, < 3.a) google-cloud-errors (~> 1.0) - google-cloud-env (1.6.0) - faraday (>= 0.17.3, < 3.0) + google-cloud-env (2.0.1) + faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.25.0) - google-protobuf (3.25.0-x86_64-darwin) - google-protobuf (3.25.0-x86_64-linux) + google-protobuf (3.25.1) + google-protobuf (3.25.1-x86_64-darwin) + google-protobuf (3.25.1-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.10.0) + googleapis-common-protos-types (1.11.0) google-protobuf (~> 3.18) - googleauth (1.8.1) - faraday (>= 0.17.3, < 3.a) + googleauth (1.9.0) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.0, >= 2.0.1) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.59.2) - google-protobuf (~> 3.24) + grpc (1.60.0) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-darwin) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-darwin) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-linux) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-linux) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) @@ -161,8 +166,8 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.3) - json_pure (2.6.3) + json (2.7.1) + json_pure (2.7.1) jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) @@ -175,7 +180,7 
@@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.1003) + mime-types-data (3.2023.1205) mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) @@ -183,7 +188,8 @@ GEM multi_json (1.15.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (3.16.1) + oj (3.16.2) + bigdecimal (~> 3.1) omni_logger (0.1.4) logger os (1.1.4) @@ -193,7 +199,7 @@ GEM pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - public_suffix (5.0.3) + public_suffix (5.0.4) rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) @@ -202,7 +208,7 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.18.0) + redis-client (0.19.0) connection_pool representable (3.2.0) declarative (< 0.1.0) @@ -242,9 +248,9 @@ GEM systemu (2.6.5) test-unit-minitest (0.9.1) minitest (~> 4.7) + thread_safe (0.3.6) trailblazer-option (0.1.2) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) + tzinfo (0.3.62) uber (0.1.0) uuid (2.3.9) macaddr (~> 1.0) @@ -283,4 +289,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.22 From ab43cdc82e49e145a759269c7b74d8d61b213176 Mon Sep 17 00:00:00 2001 From: mdorf Date: Sun, 10 Dec 2023 12:18:32 -0800 Subject: [PATCH 54/62] Gemfile.lock update --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 8ca29047..511c10a4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 9487c7f73e68abab097af523d42c1d2e106e614b + revision: 809c54f56b1a4d30d8c2d49e9c005f07c2d6c596 branch: develop specs: ontologies_linked_data (0.0.1) From a659415dd42c0593394aa26982751eb88ad828c7 Mon Sep 17 00:00:00 2001 From: mdorf Date: Sat, 16 Dec 2023 17:39:14 -0800 Subject: [PATCH 55/62] implemented the first pass at bmir-radx/radx-project#37 --- Gemfile.lock | 57 +++--- bin/ncbo_ontology_pull | 2 +- lib/ncbo_cron/ontology_helper.rb | 185 ++++++++++++++++++++ 
lib/ncbo_cron/ontology_pull.rb | 147 +--------------- lib/ncbo_cron/ontology_rank.rb | 7 +- lib/ncbo_cron/ontology_submission_parser.rb | 51 +++--- test/test_case.rb | 6 +- test/test_ontology_pull.rb | 31 +++- 8 files changed, 277 insertions(+), 209 deletions(-) create mode 100644 lib/ncbo_cron/ontology_helper.rb diff --git a/Gemfile.lock b/Gemfile.lock index 74482dd3..ed375af7 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -15,7 +15,7 @@ GIT GIT remote: https://github.com/ncbo/ncbo_annotator.git - revision: ebbb7a3c28ecde49c261290bec34ab082490a271 + revision: d7ee80860a0eab9293af81083a0700d099c50263 branch: develop specs: ncbo_annotator (0.0.1) @@ -26,7 +26,7 @@ GIT GIT remote: https://github.com/ncbo/ontologies_linked_data.git - revision: 5600020a8017cb4901e719f577032b0be6a14949 + revision: 9487c7f73e68abab097af523d42c1d2e106e614b branch: develop specs: ontologies_linked_data (0.0.1) @@ -74,10 +74,11 @@ GEM activesupport (3.2.22.5) i18n (~> 0.6, >= 0.6.4) multi_json (~> 1.0) - addressable (2.8.5) + addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) - bcrypt (3.1.19) + bcrypt (3.1.20) + bigdecimal (3.1.4) builder (3.2.4) coderay (1.1.3) concurrent-ruby (1.2.2) @@ -91,7 +92,7 @@ GEM htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.11) + faraday (2.7.12) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -124,35 +125,36 @@ GEM retriable (>= 2.0, < 4.a) rexml webrick - google-cloud-core (1.6.0) - google-cloud-env (~> 1.0) + google-cloud-core (1.6.1) + google-cloud-env (>= 1.0, < 3.a) google-cloud-errors (~> 1.0) - google-cloud-env (1.6.0) - faraday (>= 0.17.3, < 3.0) + google-cloud-env (2.1.0) + faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.25.0) - google-protobuf (3.25.0-x86_64-darwin) - google-protobuf (3.25.0-x86_64-linux) + google-protobuf (3.25.1) + google-protobuf (3.25.1-x86_64-darwin) + google-protobuf (3.25.1-x86_64-linux) googleapis-common-protos (1.4.0) google-protobuf (~> 
3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.10.0) + googleapis-common-protos-types (1.11.0) google-protobuf (~> 3.18) - googleauth (1.8.1) - faraday (>= 0.17.3, < 3.a) + googleauth (1.9.1) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.1) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.59.2) - google-protobuf (~> 3.24) + grpc (1.60.0) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-darwin) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-darwin) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-linux) - google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-linux) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) @@ -161,8 +163,8 @@ GEM httpclient (2.8.3) i18n (0.9.5) concurrent-ruby (~> 1.0) - json (2.6.3) - json_pure (2.6.3) + json (2.7.1) + json_pure (2.7.1) jwt (2.7.1) launchy (2.5.2) addressable (~> 2.8) @@ -175,7 +177,7 @@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.1003) + mime-types-data (3.2023.1205) mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) @@ -183,7 +185,8 @@ GEM multi_json (1.15.0) net-http-persistent (2.9.4) netrc (0.11.0) - oj (3.16.1) + oj (3.16.3) + bigdecimal (>= 3.0) omni_logger (0.1.4) logger os (1.1.4) @@ -193,7 +196,7 @@ GEM pry (0.14.2) coderay (~> 1.1) method_source (~> 1.0) - public_suffix (5.0.3) + public_suffix (5.0.4) rack (3.0.8) rack-test (2.1.0) rack (>= 1.3) @@ -202,7 +205,7 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.18.0) + redis-client (0.19.0) connection_pool representable (3.2.0) declarative (< 0.1.0) diff --git a/bin/ncbo_ontology_pull b/bin/ncbo_ontology_pull index a017e4d7..be3e08de 100755 --- a/bin/ncbo_ontology_pull +++ b/bin/ncbo_ontology_pull @@ -32,7 +32,7 @@ logger = 
require 'digest'
require 'logger'

module NcboCron
  module Helpers
    # Stateless helpers shared by the ontology pull job and the submission
    # processing queue: pulling a remote ontology file, creating a new
    # submission from it, and registering submissions in the Redis queue.
    module OntologyHelper

      REDIS_SUBMISSION_ID_PREFIX = "sub:"
      PROCESS_QUEUE_HOLDER = "parseQueue"
      # Actions understood by the processing queue, with the value used for
      # each of them when a submission is queued with `{ all: true }`.
      PROCESS_ACTIONS = {
        :process_rdf => true,
        :generate_labels => true,
        :index_search => true,
        :index_properties => true,
        :run_metrics => true,
        :process_annotator => true,
        :diff => true,
        :remote_pull => false
      }

      # Raised when the pull location of a submission does not serve a file.
      # The offending submission is exposed through #submission.
      class RemoteFileException < StandardError
        attr_reader :submission

        def initialize(submission)
          super
          @submission = submission
        end
      end

      # Downloads the file at the pull location of the latest submission of an
      # ontology and creates a new submission when its content differs from
      # the file currently stored for that submission.
      #
      # Options may be given positionally or as a single trailing options Hash
      # (`enable_pull_umls:`, `umls_download_url:`, `logger:`, `add_to_queue:`).
      # Without this, keyword-style calls would land as one Hash in
      # +enable_pull_umls+, making it always truthy and dropping the other options.
      #
      # @param ontology_acronym [String] acronym of the ontology to pull
      # @param enable_pull_umls [Boolean, Hash] allow pulling UMLS ontologies, or the options Hash
      # @param umls_download_url [String, nil] alternative base URL used for UMLS ontologies
      # @param logger [Logger, nil] defaults to a logger writing to $stdout
      # @param add_to_queue [Boolean] queue the new submission for processing
      # @return the object returned by create_submission, or nil when the remote file is unchanged
      # @raise [RemoteFileException] when no file is available at the pull location
      # @raise [StandardError] when the ontology/submission/pull location is missing or UMLS pull is disabled
      def self.do_ontology_pull(ontology_acronym, enable_pull_umls = false, umls_download_url = '', logger = nil,
                                add_to_queue = true)
        if enable_pull_umls.is_a?(Hash)
          options = enable_pull_umls
          enable_pull_umls = options.fetch(:enable_pull_umls, false)
          umls_download_url = options.fetch(:umls_download_url, umls_download_url)
          logger = options.fetch(:logger, logger)
          add_to_queue = options.fetch(:add_to_queue, add_to_queue)
        end

        logger ||= Logger.new($stdout)
        ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first
        new_submission = nil
        raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil?

        last = ont.latest_submission(status: :any)
        raise StandardError, "No submission found for #{ontology_acronym}" if last.nil?

        last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage)
        raise StandardError, "Pull umls not enabled" if !enable_pull_umls && last.hasOntologyLanguage.umls?

        last.bring(:pullLocation) if last.bring?(:pullLocation)
        raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil?

        last.bring(:uploadFilePath) if last.bring?(:uploadFilePath)

        if last.hasOntologyLanguage.umls? && umls_download_url && !umls_download_url.empty?
          # keep only the file name of the original location and serve it from the alternative base URL
          last.pullLocation = RDF::URI.new(umls_download_url + last_fragment_of_uri(last.pullLocation))
          logger.info("Using alternative download for umls #{last.pullLocation.to_s}")
          logger.flush
        end

        raise RemoteFileException.new(last) unless last.remote_file_exists?(last.pullLocation.to_s)

        logger.info "Checking download for #{ont.acronym}"
        logger.info "Location: #{last.pullLocation.to_s}"; logger.flush
        downloaded_file, filename = last.download_ontology_file
        file = nil

        begin
          file, md5local, md5remote, new_file_exists = new_file_exists?(downloaded_file, last)

          if new_file_exists
            logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}"
            logger.flush
            new_submission = create_submission(ont, last, file, filename, logger, add_to_queue)
          else
            logger.info "There is no new file found for #{ont.acronym}"
            logger.flush
          end
        ensure
          # release both handles even when submission creation fails
          close_if_open(file)
          close_if_open(downloaded_file)
        end

        new_submission
      end

      # Creates a new submission for +ont+ by copying the loaded attributes of
      # +sub+ and attaching +file+ as the uploaded ontology file. The file is
      # first run through OWLAPI; when that fails nothing is saved and the
      # copied file is deleted.
      #
      # NOTE(review): the new submission object is returned even when it was
      # not saved (parse or validation failure) - callers that need a persisted
      # submission should verify it.
      #
      # @param ont the ontology the submission belongs to
      # @param sub the submission whose attributes are copied
      # @param file the downloaded ontology file
      # @param filename [String] name given to the stored copy
      # @param logger [Logger, nil] defaults to the LOGGER constant when defined, else STDOUT
      # @param add_to_queue [Boolean] queue the saved submission for full processing
      # @param new_version [String, nil] overrides the copied version when given
      # @param new_released [String, nil] parseable release date; defaults to now
      # @return the new (possibly unsaved) submission object
      def self.create_submission(ont, sub, file, filename, logger = nil, add_to_queue = true, new_version = nil,
                                 new_released = nil)
        logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT)
        new_sub = LinkedData::Models::OntologySubmission.new

        sub.bring_remaining
        sub.loaded_attributes.each do |attr|
          new_sub.send("#{attr}=", sub.send(attr))
        end

        submission_id = ont.next_submission_id
        new_sub.submissionId = submission_id
        file_location = LinkedData::Models::OntologySubmission.copy_file_repository(ont.acronym, submission_id, file, filename)
        new_sub.uploadFilePath = file_location

        new_sub.version = new_version unless new_version.nil?
        new_sub.released = new_released.nil? ? DateTime.now : DateTime.parse(new_released)

        # these are specific to the copied submission and must be recomputed
        new_sub.submissionStatus = nil
        new_sub.creationDate = nil
        new_sub.missingImports = nil
        new_sub.metrics = nil
        full_file_path = File.expand_path(file_location)

        # check if OWLAPI is able to parse the file before creating a new submission
        owlapi = LinkedData::Parser::OWLAPICommand.new(
          full_file_path,
          File.expand_path(new_sub.data_folder.to_s),
          logger: logger)
        owlapi.disable_reasoner
        parsable = true

        begin
          owlapi.parse
        rescue Exception => e
          # NOTE(review): the broad rescue is kept because the parser's exception hierarchy is not
          # visible from here; process-control exceptions must never be swallowed, though.
          raise if e.is_a?(SystemExit) || e.is_a?(SignalException) || e.is_a?(NoMemoryError)

          logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}")
          logger.error("A new submission has NOT been created.")
          logger.flush
          parsable = false
        end

        if parsable
          if new_sub.valid?
            new_sub.save
            queue_submission(new_sub, { all: true }) if add_to_queue
            # the submission exists whether or not it was queued, so always report it
            logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}")
            logger.flush
          else
            logger.error("Unable to create a new submission for ontology #{ont.acronym} with id #{submission_id}: #{new_sub.errors}")
            logger.flush
          end
        else
          # delete the bad file
          File.delete full_file_path if File.exist? full_file_path
        end
        new_sub
      end

      # Registers a submission in the Redis processing queue.
      #
      # @param submission object responding to #id
      # @param actions [Hash] `{ all: true }` queues the defaults of PROCESS_ACTIONS;
      #   otherwise only keys known to PROCESS_ACTIONS are kept. The Hash passed
      #   by the caller is never modified.
      # @return the result of Redis#hset, or nil when no known action was requested
      def self.queue_submission(submission, actions={:all => true})
        queued = if actions[:all]
                   PROCESS_ACTIONS.dup
                 else
                   actions.select { |name, _| PROCESS_ACTIONS.key?(name) }
                 end
        return if queued.empty?

        redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port)
        redis.hset(PROCESS_QUEUE_HOLDER, get_prefixed_id(submission.id), MultiJson.dump(queued))
      end

      # @return [String] +id+ prefixed with REDIS_SUBMISSION_ID_PREFIX, the key used in the queue hash
      def self.get_prefixed_id(id)
        "#{REDIS_SUBMISSION_ID_PREFIX}#{id}"
      end

      # @return [String, nil] the text after the last "/" of +uri+
      def self.last_fragment_of_uri(uri)
        uri.to_s.split("/")[-1]
      end

      # Returns the third "/"-separated segment from the end of the id.
      # Presumably ids look like ".../ontologies/ACRONYM/submissions/N" - verify against the model.
      def self.acronym_from_submission_id(submissionID)
        submissionID.to_s.split("/")[-3]
      end

      # Compares the MD5 digest of a downloaded file with the digest of the
      # file stored for the +last+ submission.
      #
      # @param file object responding to #path (the downloaded file)
      # @param last object responding to #uploadFilePath
      # @return [Array] `[handle, md5local, md5remote, new_file_exists]` where
      #   +handle+ is a fresh binary read handle on the downloaded file,
      #   +md5local+ is nil when no stored file exists, and +new_file_exists+
      #   is true when the digests differ or there is nothing to compare with
      def self.new_file_exists?(file, last)
        # Digest::MD5.file streams the content instead of loading whole ontologies into memory
        md5remote = Digest::MD5.file(file.path).hexdigest
        md5local = nil

        if last.uploadFilePath && File.exist?(last.uploadFilePath)
          md5local = Digest::MD5.file(last.uploadFilePath).hexdigest
          new_file_exists = md5remote != md5local
        else
          # There is no existing file, so let's create a submission with the downloaded one
          new_file_exists = true
        end

        [File.open(file.path, "rb"), md5local, md5remote, new_file_exists]
      end

      # Closes +io+ unless it is nil or already closed.
      def self.close_if_open(io)
        return unless io.respond_to?(:close)
        return if io.respond_to?(:closed?) && io.closed?

        io.close
      end
      private_class_method :close_if_open

    end
  end
end
NcboCron::Helpers::OntologyHelper::RemoteFileException => error logger.info "RemoteFileException: No submission file at pull location #{error.submission.pullLocation.to_s} for ontology #{ont.acronym}." logger.flush LinkedData::Utils::Notifications.remote_ontology_pull(error.submission) @@ -58,136 +47,8 @@ def do_remote_ontology_pull(options = {}) new_submissions end - def do_ontology_pull(ontology_acronym, enable_pull_umls: false, umls_download_url: '', logger: nil) - ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first - new_submission = nil - raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil? - - last = ont.latest_submission(status: :any) - raise StandardError, "No submission found for #{ontology_acronym}" if last.nil? - - last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage) - if !enable_pull_umls && last.hasOntologyLanguage.umls? - raise StandardError, "Pull umls not enabled" - end - - last.bring(:pullLocation) if last.bring?(:pullLocation) - raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil? - - last.bring(:uploadFilePath) if last.bring?(:uploadFilePath) - - if last.hasOntologyLanguage.umls? 
&& umls_download_url - last.pullLocation = RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1]) - logger.info("Using alternative download for umls #{last.pullLocation.to_s}") - logger.flush - end - - if last.remote_file_exists?(last.pullLocation.to_s) - logger.info "Checking download for #{ont.acronym}" - logger.info "Location: #{last.pullLocation.to_s}"; logger.flush - file, filename = last.download_ontology_file - file, md5local, md5remote, new_file_exists = new_file_exists?(file, last) - - if new_file_exists - logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}" - logger.flush() - new_submission = create_submission(ont, last, file, filename, logger) - else - logger.info "There is no new file found for #{ont.acronym}" - logger.flush() - end - - file.close - new_submission - else - raise RemoteFileException.new(last) - end - end - - def create_submission(ont, sub, file, filename, logger = nil, - add_to_pull = true, new_version = nil, new_released = nil) - logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT) - new_sub = LinkedData::Models::OntologySubmission.new - - sub.bring_remaining - sub.loaded_attributes.each do |attr| - new_sub.send("#{attr}=", sub.send(attr)) - end - - submission_id = ont.next_submission_id() - new_sub.submissionId = submission_id - file_location = LinkedData::Models::OntologySubmission.copy_file_repository(ont.acronym, submission_id, file, filename) - new_sub.uploadFilePath = file_location - unless new_version.nil? - new_sub.version = new_version - end - if new_released.nil? 
- new_sub.released = DateTime.now - else - new_sub.released = DateTime.parse(new_released) - end - new_sub.submissionStatus = nil - new_sub.creationDate = nil - new_sub.missingImports = nil - new_sub.metrics = nil - full_file_path = File.expand_path(file_location) - - # check if OWLAPI is able to parse the file before creating a new submission - owlapi = LinkedData::Parser::OWLAPICommand.new( - full_file_path, - File.expand_path(new_sub.data_folder.to_s), - logger: logger) - owlapi.disable_reasoner - parsable = true - - begin - owlapi.parse - rescue Exception => e - logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}") - logger.error("A new submission has NOT been created.") - logger.flush - parsable = false - end - - if parsable - if new_sub.valid? - new_sub.save() - - if add_to_pull - submission_queue = NcboCron::Models::OntologySubmissionParser.new - submission_queue.queue_submission(new_sub, { all: true }) - logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}") - end - else - logger.error("Unable to create a new submission in OntologyPull: #{new_sub.errors}") - logger.flush - end - else - # delete the bad file - File.delete full_file_path if File.exist? 
full_file_path - end - new_sub - end - - private - def new_file_exists?(file, last) - file = File.open(file.path, "rb") - remote_contents = file.read - md5remote = Digest::MD5.hexdigest(remote_contents) - - if last.uploadFilePath && File.exist?(last.uploadFilePath) - file_contents = open(last.uploadFilePath) { |f| f.read } - md5local = Digest::MD5.hexdigest(file_contents) - new_file_exists = (not md5remote.eql?(md5local)) - else - # There is no existing file, so let's create a submission with the downloaded one - new_file_exists = true - end - return file, md5local, md5remote, new_file_exists - end - def redis_goo Redis.new(host: LinkedData.settings.goo_redis_host, port: LinkedData.settings.goo_redis_port, timeout: 30) end diff --git a/lib/ncbo_cron/ontology_rank.rb b/lib/ncbo_cron/ontology_rank.rb index b60c2740..64de8844 100644 --- a/lib/ncbo_cron/ontology_rank.rb +++ b/lib/ncbo_cron/ontology_rank.rb @@ -1,5 +1,6 @@ require 'logger' require 'benchmark' +require_relative 'ontology_helper' module NcboCron module Models @@ -66,7 +67,7 @@ def umls_scores(ontologies) ontologies.each do |ont| if ont.group && !ont.group.empty? - umls_gr = ont.group.select {|gr| acronym_from_id(gr.id.to_s).include?('UMLS')} + umls_gr = ont.group.select {|gr| NcboCron::Helpers::OntologyHelper.last_fragment_of_uri(gr.id.to_s).include?('UMLS')} scores[ont.acronym] = umls_gr.empty? ? 
0 : 1 else scores[ont.acronym] = 0 @@ -75,10 +76,6 @@ def umls_scores(ontologies) scores end - def acronym_from_id(id) - id.to_s.split("/")[-1] - end - def normalize(x, xmin, xmax, ymin, ymax) xrange = xmax - xmin yrange = ymax - ymin diff --git a/lib/ncbo_cron/ontology_submission_parser.rb b/lib/ncbo_cron/ontology_submission_parser.rb index 34c53930..f493eced 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -1,39 +1,22 @@ require 'multi_json' +require_relative 'ontology_helper' module NcboCron module Models class OntologySubmissionParser - QUEUE_HOLDER = "parseQueue" - IDPREFIX = "sub:" - - ACTIONS = { - :process_rdf => true, - :generate_labels => true, - :index_search => true, - :index_properties => true, - :run_metrics => true, - :process_annotator => true, - :diff => true - } + QUEUE_HOLDER = NcboCron::Helpers::OntologyHelper::PROCESS_QUEUE_HOLDER + ACTIONS = NcboCron::Helpers::OntologyHelper::PROCESS_ACTIONS def initialize() end - def queue_submission(submission, actions={:all => true}) - redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) - - if actions[:all] - actions = ACTIONS.dup - else - actions.delete_if {|k, v| !ACTIONS.has_key?(k)} - end - actionStr = MultiJson.dump(actions) - redis.hset(QUEUE_HOLDER, get_prefixed_id(submission.id), actionStr) unless actions.empty? + def queue_submission(submission, actions={ :all => true }) + NcboCron::Helpers::OntologyHelper.queue_submission(submission, actions) end - def process_queue_submissions(options = {}) + def process_queue_submissions(options={}) logger = options[:logger] logger ||= Kernel.const_defined?("LOGGER") ? 
Kernel.const_get("LOGGER") : Logger.new(STDOUT) redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) @@ -44,6 +27,18 @@ def process_queue_submissions(options = {}) realKey = process_data[:key] key = process_data[:redis_key] redis.hdel(QUEUE_HOLDER, key) + + # if :remote_pull is one of the actions, pull the ontology and halt if no new submission is found + if actions.key?(:remote_pull) && actions[:remote_pull] + acronym = NcboCron::Helpers::OntologyHelper.acronym_from_submission_id(realKey) + new_submission = NcboCron::Helpers::OntologyHelper.do_ontology_pull(acronym, enable_pull_umls: false, + umls_download_url: '', logger: logger, + add_to_queue: false) + return unless new_submission + realKey = new_submission.id.to_s + actions.delete(:remote_pull) + end + begin process_submission(logger, realKey, actions) rescue Exception => e @@ -56,7 +51,7 @@ def process_queue_submissions(options = {}) def queued_items(redis, logger=nil) logger ||= Kernel.const_defined?("LOGGER") ? 
Kernel.const_get("LOGGER") : Logger.new(STDOUT) all = redis.hgetall(QUEUE_HOLDER) - prefix_remove = Regexp.new(/^#{IDPREFIX}/) + prefix_remove = Regexp.new(/^#{NcboCron::Helpers::OntologyHelper::REDIS_SUBMISSION_ID_PREFIX}/) items = [] all.each do |key, val| begin @@ -76,10 +71,6 @@ def queued_items(redis, logger=nil) items end - def get_prefixed_id(id) - "#{IDPREFIX}#{id}" - end - def zombie_classes_graphs query = "SELECT DISTINCT ?g WHERE { GRAPH ?g { ?s ?p ?o }}" class_graphs = [] @@ -191,6 +182,10 @@ def process_submission(logger, submission_id, actions=ACTIONS) end end + def get_prefixed_id(id) + NcboCron::Helpers::OntologyHelper.get_prefixed_id(id) + end + private def archive_old_submissions(logger, sub) diff --git a/test/test_case.rb b/test/test_case.rb index 5f164ecd..75bb0454 100644 --- a/test/test_case.rb +++ b/test/test_case.rb @@ -56,7 +56,7 @@ def count_pattern(pattern) return 0 end - def backend_4s_delete + def backend_triplestore_delete raise StandardError, 'Too many triples in KB, does not seem right to run tests' unless count_pattern('?s ?p ?o') < 400000 @@ -89,7 +89,7 @@ def _run_suites(suites, type) end def _run_suite(suite, type) - backend_4s_delete + backend_triplestore_delete suite.before_suite if suite.respond_to?(:before_suite) super(suite, type) rescue Exception => e @@ -98,7 +98,7 @@ def _run_suite(suite, type) puts 'Traced from:' raise e ensure - backend_4s_delete + backend_triplestore_delete suite.after_suite if suite.respond_to?(:after_suite) end end diff --git a/test/test_ontology_pull.rb b/test/test_ontology_pull.rb index 74923677..ca3c6130 100644 --- a/test/test_ontology_pull.rb +++ b/test/test_ontology_pull.rb @@ -76,6 +76,32 @@ def test_remote_ontology_pull assert_equal 2, ont.submissions.length end + def test_remote_pull_parsing_action + ontologies = init_ontologies(1, process_submissions: true) + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 1, 
ont.submissions.length + + # add this ontology to submission queue with :remote_pull action enabled + parser = NcboCron::Models::OntologySubmissionParser.new + actions = NcboCron::Models::OntologySubmissionParser::ACTIONS.dup + actions[:remote_pull] = true + parser.queue_submission(ont.submissions[0], actions) + parser.process_queue_submissions + + # make sure there are now 2 submissions present + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 2, ont.submissions.length + + # verify that no new submission is created when the file has not changed + parser.queue_submission(ont.submissions[0], actions) + parser.process_queue_submissions + ont = LinkedData::Models::Ontology.find(ontologies[0].id).first + ont.bring(:submissions) if ont.bring?(:submissions) + assert_equal 2, ont.submissions.length + end + def test_pull_error_notification server_port = Random.rand(55000..65535) @@ -164,8 +190,9 @@ def test_no_pull_location private - def init_ontologies(submission_count) - ont_count, acronyms, ontologies = LinkedData::SampleData::Ontology.create_ontologies_and_submissions(ont_count: 1, submission_count: submission_count, process_submission: false) + def init_ontologies(submission_count, process_submissions = false) + ont_count, acronyms, ontologies = LinkedData::SampleData::Ontology.create_ontologies_and_submissions( + ont_count: 1, submission_count: submission_count, process_submission: process_submissions) ontologies[0].bring(:submissions) if ontologies[0].bring?(:submissions) ontologies[0].submissions.each do |sub| sub.bring_remaining() From 23316314217ec2e6b5cba8d66f3b8a491f53da4c Mon Sep 17 00:00:00 2001 From: mdorf Date: Sat, 16 Dec 2023 17:54:36 -0800 Subject: [PATCH 56/62] implemented the first pass at bmir-radx/radx-project#37 --- lib/ncbo_cron/ontology_submission_parser.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ncbo_cron/ontology_submission_parser.rb 
b/lib/ncbo_cron/ontology_submission_parser.rb index f493eced..8d33f89d 100644 --- a/lib/ncbo_cron/ontology_submission_parser.rb +++ b/lib/ncbo_cron/ontology_submission_parser.rb @@ -29,6 +29,8 @@ def process_queue_submissions(options={}) redis.hdel(QUEUE_HOLDER, key) # if :remote_pull is one of the actions, pull the ontology and halt if no new submission is found + # if a new submission is found, replace the submission ID with the new one and proceed with + # processing the remaining actions on the new submission if actions.key?(:remote_pull) && actions[:remote_pull] acronym = NcboCron::Helpers::OntologyHelper.acronym_from_submission_id(realKey) new_submission = NcboCron::Helpers::OntologyHelper.do_ontology_pull(acronym, enable_pull_umls: false, From 07107b1aa08e41bb4ce8de311149355de70738e5 Mon Sep 17 00:00:00 2001 From: Alex Skrenchuk Date: Tue, 19 Dec 2023 15:08:28 -0800 Subject: [PATCH 57/62] set bundler version to be comptatible with ruby 2.7 + AG v8 --- Dockerfile | 4 ++++ docker-compose.yml | 15 ++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index dfc03492..73e1379c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,10 @@ COPY Gemfile* *.gemspec /srv/ontoportal/ncbo_cron/ WORKDIR /srv/ontoportal/ncbo_cron +# set rubygem and bundler to the last version supported by ruby 2.7 +# remove version after ruby v3 upgrade +RUN gem update --system '3.4.22' +RUN gem install bundler -v '2.4.22' RUN gem update --system RUN gem install bundler ENV BUNDLE_PATH=/srv/ontoportal/bundle diff --git a/docker-compose.yml b/docker-compose.yml index 0045ce12..5f4e9307 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,7 +4,7 @@ x-app: &app args: RUBY_VERSION: '2.7' # Increase the version number in the image tag every time Dockerfile or its arguments is changed - image: ncbo_cron:0.0.1 + image: ncbo_cron:0.0.2 environment: &env BUNDLE_PATH: /srv/ontoportal/bundle # default bundle config resolves to 
/usr/local/bundle/config inside of the container @@ -19,12 +19,10 @@ x-app: &app SOLR_TERM_SEARCH_URL: http://solr-ut:8983/solr/term_search_core1 SOLR_PROP_SEARCH_URL: http://solr-ut:8983/solr/prop_search_core1 MGREP_HOST: mgrep-ut - MGREP_PORT: 55555 + MGREP_PORT: 55556 stdin_open: true tty: true command: "bundle exec rackup -o 0.0.0.0 --port 9393" - ports: - - 9393:9393 volumes: # bundle volume for hosting gems installed by bundle; it helps in local development with gem udpates - bundle:/srv/ontoportal/bundle @@ -104,18 +102,17 @@ services: retries: 5 mgrep-ut: - image: ontoportal/mgrep:0.0.1 + image: ontoportal/mgrep:0.0.2 platform: linux/amd64 healthcheck: - test: ["CMD", "nc", "-z", "-v", "localhost", "55555"] + test: ["CMD", "nc", "-z", "-v", "localhost", "55556"] start_period: 3s interval: 10s timeout: 5s retries: 5 agraph-ut: - #image: franzinc/agraph:v7.3.1 - image: ontoportal/agraph:v7.3.1-patch1 + image: franzinc/agraph:v8.0.0 platform: linux/amd64 environment: - AGRAPH_SUPER_USER=test @@ -131,7 +128,7 @@ services: ; tail -f /agraph/data/agraph.log" healthcheck: test: ["CMD-SHELL", "agtool storage-report bioportal_test || exit 1"] - start_period: 10s + start_period: 20s interval: 60s timeout: 5s retries: 3 From bada7d2a26903cba03d6b881637576275e143f68 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Wed, 27 Dec 2023 13:31:02 +0100 Subject: [PATCH 58/62] refactor ontologies analytics job to handle the new google analytics migration --- bin/generate_ua_analytics_file.rb | 126 -------- bin/ncbo_cron | 2 +- bin/ncbo_ontology_analytics_rebuild | 2 +- lib/ncbo_cron.rb | 2 +- lib/ncbo_cron/analytics/object_analytics.rb | 269 ++++++++++++++++++ .../analytics/object_analytics_job.rb | 87 ++++++ .../analytics/ontology_visits_analytics.rb | 110 +++++++ lib/ncbo_cron/ontology_analytics.rb | 233 --------------- 8 files changed, 469 insertions(+), 362 deletions(-) delete mode 100755 bin/generate_ua_analytics_file.rb create mode 100644 
lib/ncbo_cron/analytics/object_analytics.rb create mode 100644 lib/ncbo_cron/analytics/object_analytics_job.rb create mode 100644 lib/ncbo_cron/analytics/ontology_visits_analytics.rb delete mode 100644 lib/ncbo_cron/ontology_analytics.rb diff --git a/bin/generate_ua_analytics_file.rb b/bin/generate_ua_analytics_file.rb deleted file mode 100755 index 0a432a92..00000000 --- a/bin/generate_ua_analytics_file.rb +++ /dev/null @@ -1,126 +0,0 @@ -require 'logger' -require 'google/apis/analytics_v3' -require 'google/api_client/auth/key_utils' - -module NcboCron - module Models - - class OntologyAnalyticsUA - - def initialize(logger) - @logger = logger - end - - def run - redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port) - ontology_analytics = fetch_ontology_analytics - File.open(NcboCron.settings.analytics_path_to_ua_data_file, 'w') do |f| - f.write(ontology_analytics.to_json) - end - end - - def fetch_ontology_analytics - google_client = authenticate_google - aggregated_results = Hash.new - start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013 - ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} - # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"] - filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? 
"" : ";#{NcboCron.settings.analytics_filter_str}" - - ont_acronyms.each do |acronym| - max_results = 10000 - num_results = 10000 - start_index = 1 - results = nil - - loop do - results = google_client.get_ga_data( - ids = NcboCron.settings.analytics_profile_id, - start_date = NcboCron.settings.analytics_start_date, - end_date = Date.today.to_s, - metrics = 'ga:pageviews', - { - dimensions: 'ga:pagePath,ga:year,ga:month', - filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}", - start_index: start_index, - max_results: max_results - } - ) - results.rows ||= [] - start_index += max_results - num_results = results.rows.length - @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" - @logger.flush - - results.rows.each do |row| - if aggregated_results.has_key?(acronym) - # year - if aggregated_results[acronym].has_key?(row[1].to_i) - # month - if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i) - aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i - else - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i - end - else - aggregated_results[acronym][row[1].to_i] = Hash.new - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i - end - else - aggregated_results[acronym] = Hash.new - aggregated_results[acronym][row[1].to_i] = Hash.new - aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i - end - end - - if num_results < max_results - # fill up non existent years - (start_year..Date.today.year).each do |y| - aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? - aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y) - end - # fill up non existent months with zeros - (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } } - break - end - end - end - - @logger.info "Completed Universal Analytics pull..." 
- @logger.flush - - aggregated_results - end - - def authenticate_google - Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name - Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version - # enable google api call retries in order to - # minigate analytics processing failure due to occasional google api timeouts and other outages - Google::Apis::RequestOptions.default.retries = 5 - # uncoment to enable logging for debugging purposes - # Google::Apis.logger.level = Logger::DEBUG - # Google::Apis.logger = @logger - client = Google::Apis::AnalyticsV3::AnalyticsService.new - key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_ua_key_file, 'notasecret') - client.authorization = Signet::OAuth2::Client.new( - :token_credential_uri => 'https://accounts.google.com/o/oauth2/token', - :audience => 'https://accounts.google.com/o/oauth2/token', - :scope => 'https://www.googleapis.com/auth/analytics.readonly', - :issuer => NcboCron.settings.analytics_service_account_email_address, - :signing_key => key - ).tap { |auth| auth.fetch_access_token! 
} - client - end - end - end -end - -require 'ontologies_linked_data' -require 'goo' -require 'ncbo_annotator' -require 'ncbo_cron/config' -require_relative '../config/config' -ontology_analytics_log_path = File.join("logs", "ontology-analytics-ua.log") -ontology_analytics_logger = Logger.new(ontology_analytics_log_path) -NcboCron::Models::OntologyAnalyticsUA.new(ontology_analytics_logger).run diff --git a/bin/ncbo_cron b/bin/ncbo_cron index 397d726d..9073f9a1 100755 --- a/bin/ncbo_cron +++ b/bin/ncbo_cron @@ -381,7 +381,7 @@ runner.execute do |opts| logger.info "Logging ontology analytics refresh details to #{ontology_analytics_log_path}"; logger.flush t0 = Time.now # Generate ontology analytics - NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run + NcboCron::Models::ObjectAnalyticsJob.new(logger).run # Generate ontology ranking NcboCron::Models::OntologyRank.new(ontology_analytics_logger).run logger.info "Ontology analytics refresh job completed in #{Time.now - t0} sec."; logger.flush diff --git a/bin/ncbo_ontology_analytics_rebuild b/bin/ncbo_ontology_analytics_rebuild index 5dfc082a..b37b8138 100755 --- a/bin/ncbo_ontology_analytics_rebuild +++ b/bin/ncbo_ontology_analytics_rebuild @@ -52,7 +52,7 @@ begin logger.info(msg) time = Benchmark.realtime do - NcboCron::Models::OntologyAnalytics.new(logger).run + NcboCron::Models::ObjectAnalyticsJob.new(logger).run end msg = "Completed rebuilding ontology analytics repository in #{(time/60).round(1)} minutes." 
require 'logger'
require 'json'
require 'benchmark'
require 'date'

module NcboCron
  module Models
    UA_START_DATE = '2013-10-01'
    GA4_START_DATE = '2023-06-01'

    # Thin wrapper around the Google Analytics Data (GA4) client that builds
    # and runs report requests for the property configured in NcboCron.settings.
    class GoogleAnalyticsConnector

      attr_reader :ga_client

      def initialize
        @ga_data_file = NcboCron.settings.analytics_path_to_ga_data_file
        @ua_data_file = NcboCron.settings.analytics_path_to_ua_data_file
        @app_id = NcboCron.settings.analytics_property_id
        @app_key_file = NcboCron.settings.analytics_path_to_key_file
        @ga_client = analytics_data_client
      end

      # Runs a GA4 report.
      #
      # @param metrics [Array<String>] metric names
      # @param dimensions [Array<String>] dimension names
      # @param date_ranges [Array] `[start_date, end_date]`
      # @param order_bys [Array<String>] dimension names to order by (ascending)
      # @param offset [Integer] index of the first row to return
      # @param limit [Integer] maximum number of rows to return
      # @param dimension_filter [Array] `[field_name, regexp]`, or empty for no filter
      # @return the response of the client's run_report call
      def run_request(metrics:, dimensions:, date_ranges:, order_bys:, offset:, limit:, dimension_filter:)
        request = Google::Analytics::Data::V1beta::RunReportRequest.new(
          property: "properties/#{@app_id}",
          metrics: metrics.map { |m| ga_metric(m) },
          dimension_filter: dimension_filter.empty? ? nil : ga_filter(*dimension_filter),
          dimensions: dimensions.map { |d| ga_dimension(d) },
          date_ranges: [ga_date_range(*date_ranges)],
          order_bys: order_bys.map { |o| ga_order_by(o) },
          offset: offset,
          limit: limit
        )

        @ga_client.run_report request
      end

      private

      def analytics_data_client
        # loaded at the point of use so that this file can be required without the Google SDK
        require 'google/analytics/data'

        Google::Analytics::Data.analytics_data do |config|
          config.credentials = @app_key_file
        end
      end

      def ga_metric(name)
        Google::Analytics::Data::V1beta::Metric.new(name: name)
      end

      def ga_date_range(start_date, end_date)
        Google::Analytics::Data::V1beta::DateRange.new(
          start_date: start_date,
          end_date: end_date
        )
      end

      def ga_dimension(name)
        Google::Analytics::Data::V1beta::Dimension.new(name: name)
      end

      # Builds a filter expression matching +field_name+ against the regular expression +value+.
      def ga_filter(field_name, value)
        string_filter = Google::Analytics::Data::V1beta::Filter::StringFilter.new(
          match_type: Google::Analytics::Data::V1beta::Filter::StringFilter::MatchType::FULL_REGEXP,
          value: value
        )

        filter = Google::Analytics::Data::V1beta::Filter.new(
          field_name: field_name,
          string_filter: string_filter
        )
        Google::Analytics::Data::V1beta::FilterExpression.new(filter: filter)
      end

      def ga_order_by(dimension_name, desc = false)
        order = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new(
          dimension_name: dimension_name
        )
        Google::Analytics::Data::V1beta::OrderBy.new(
          desc: desc,
          dimension: order
        )
      end

    end

    # Old version of Google Analytics
    class GoogleAnalyticsUAConnector
      def initialize
        @app_id = NcboCron.settings.analytics_profile_id
        @app_name = NcboCron.settings.analytics_app_name
        @app_version = NcboCron.settings.analytics_app_version
        @analytics_key_file = NcboCron.settings.ua_analytics_path_to_key_file
        @app_user = NcboCron.settings.analytics_service_account_email_address
        @generated_file_path = NcboCron.settings.analytics_path_to_ua_data_file
        @start_date = NcboCron.settings.analytics_start_date
        @analytics_filter = NcboCron.settings.analytics_filter_str
        @ga_client = authenticate_google
      end

      # Runs a Universal Analytics query. Metric, dimension, filter and sort
      # names are given without the "ga:" prefix, which is added here.
      #
      # @param metrics [Array<String>]
      # @param dimensions [Array<String>]
      # @param filters [Hash, Array] pairs of `name => expression`; each pair is sent as "ga:name=expression"
      # @param start_index [Integer] index of the first row to return
      # @param max_results [Integer] maximum number of rows to return
      # @param dates_ranges [Array] first element is the start date, last element the end date
      # @param sort [Array<String>] dimensions/metrics to sort by
      # @return the response of the client's get_ga_data call
      def run_request(metrics:, dimensions:, filters:, start_index:, max_results:, dates_ranges:, sort:)
        # the optional parameters are passed as keywords, not as a braced Hash,
        # so the call keeps working under Ruby 3 keyword-argument rules
        @ga_client.get_ga_data(
          @app_id,
          dates_ranges.first,
          dates_ranges.last,
          metrics.map { |m| "ga:#{m}" }.join(','),
          dimensions: dimensions.map { |d| "ga:#{d}" }.join(','),
          filters: filters.empty? ? nil : filters.map { |f, v| "ga:#{f}=#{v}" }.join(','),
          start_index: start_index,
          max_results: max_results,
          sort: sort.map { |d| "ga:#{d}" }.join(',')
        )
      end

      private

      def authenticate_google
        # loaded at the point of use so that this file can be required without the legacy Google SDK
        require 'google/apis/analytics_v3'
        require 'google/api_client/auth/key_utils'

        Google::Apis::ClientOptions.default.application_name = @app_name
        Google::Apis::ClientOptions.default.application_version = @app_version
        # enable google api call retries in order to
        # mitigate analytics processing failure due to occasional google api timeouts and other outages
        Google::Apis::RequestOptions.default.retries = 5
        # uncomment to enable logging for debugging purposes
        # Google::Apis.logger.level = Logger::DEBUG
        # Google::Apis.logger = @logger
        client = Google::Apis::AnalyticsV3::AnalyticsService.new
        key = Google::APIClient::KeyUtils::load_from_pkcs12(@analytics_key_file, 'notasecret')
        client.authorization = Signet::OAuth2::Client.new(
          :token_credential_uri => 'https://accounts.google.com/o/oauth2/token',
          :audience => 'https://accounts.google.com/o/oauth2/token',
          :scope => 'https://www.googleapis.com/auth/analytics.readonly',
          :issuer => @app_user,
          :signing_key => key
        ).tap { |auth| auth.fetch_access_token! }
        client
      end

    end

    # Base class for analytics collected per object (keyed by acronym in the
    # merge helpers) as nested hashes of the form
    # `{ key => { "year" => { "month" => hits } } }`, with years and months as
    # unpadded strings. Subclasses implement the two fetch methods.
    class ObjectAnalytics

      attr_reader :redis_field

      # @param redis_field [String] key under which this object's data lives in +old_data+
      # @param start_date [String] parseable date; falls back to the configured
      #   analytics_start_date when it cannot be parsed
      # @param old_data [Hash] previously stored analytics, keyed by redis field
      def initialize(redis_field:, start_date:, old_data: {})
        @redis_field = redis_field
        @start_date = parse_start_date(start_date)
        @old_data = old_data[@redis_field] || {}
      end

      # Fetches GA4 data (and UA data when the start date precedes
      # GA4_START_DATE, in which case previously stored data is discarded) and
      # merges it with the previously stored data.
      #
      # @return [Hash] the merged analytics
      def full_data(logger, ga_conn, ua_conn)
        logger.info "Fetching GA4 analytics for all ontologies from #{@start_date} to today..."
        logger.flush
        new_ga_data = fetch_object_analytics(logger, ga_conn)

        if @start_date < Date.parse(GA4_START_DATE)
          @old_data = {}
          logger.info "Fetching UA analytics for all ontologies from #{@start_date} to today..."
          logger.flush
          ua_data = fetch_ua_object_analytics(logger, ua_conn)
          logger.info "Completed Universal Analytics pull..."
          logger.flush
          new_ga_data = merge_and_fill_missing_data(new_ga_data, ua_data, logger)
        end
        merge_and_fill_missing_data(new_ga_data, @old_data, logger)
      end

      # @param ga_conn GoogleAnalyticsConnector
      def fetch_object_analytics(logger, ga_conn)
        raise NotImplementedError, "Subclasses must implement this method"
      end

      # @param ua_conn GoogleAnalyticsUAConnector
      def fetch_ua_object_analytics(logger, ua_conn)
        raise NotImplementedError, "Subclasses must implement this method"
      end

      private

      def parse_start_date(start_date)
        Date.parse(start_date)
      rescue ArgumentError, TypeError
        # nil or unparseable value: use the configured start date instead
        Date.parse(NcboCron.settings.analytics_start_date)
      end

      # Adds the hits of +new_data+ to +old_data+ (modified in place) for every
      # year from +start_date+ on, then zero-fills missing years and months.
      # When +old_data+ is empty, +new_data+ is returned untouched.
      def merge_and_fill_missing_data(new_data, old_data, logger, start_date = @start_date)
        if old_data.empty?
          old_data = new_data
        else
          logger.info "Merging GA4 and UA data..."
          logger.flush
          today = Date.today

          old_data.each_key do |acronym|
            # a key may be absent from the fresh pull; treat that as "nothing to add"
            new_years = new_data[acronym] || {}
            old_data[acronym] ||= {}

            (start_date.year..today.year).each do |y|
              year = y.to_s
              new_months = new_years[year]
              next if new_months.nil?

              if old_data[acronym].key?(year)
                # add up hits for June of 2023 (the only intersecting month between UA and GA4);
                # past years are complete, so all twelve of their months are merged
                old_months = old_data[acronym][year]
                last_month = y < today.year ? 12 : today.month

                (1..last_month).each do |m|
                  month = m.to_s
                  old_months[month] ||= 0
                  next if old_months[month].eql?(new_months[month])

                  old_months[month] += (new_months[month] || 0)
                end
              else
                old_data[acronym][year] = new_months
              end
            end
          end

          # keys seen only in the fresh pull (e.g. objects created since the last run) must not be lost
          new_data.each do |acronym, years|
            old_data[acronym] = years unless old_data.key?(acronym)
          end

          old_data = fill_missing_data(old_data)
        end

        # fill missing years and months
        logger.info "Filling in missing years data..."
        logger.flush
        old_data
        # sort_ga_data(old_data)
      end

      # Accumulates report rows into +aggregated_results+ (`{ year => { month => hits } }`).
      # Rows are read positionally: index 1 is the year, 2 the month, 3 the hit count.
      def aggregate_results(aggregated_results, results)
        results.each do |row|
          year = row[1].to_i.to_s
          month = row[2].to_i.to_s
          value = row[3].to_i

          months = (aggregated_results[year] ||= {})
          months[month] = months.fetch(month, 0) + value
        end
      end

      # Ensures every key has an entry for each year since UA_START_DATE and
      # every year has all twelve months, using 0 for missing months.
      def fill_missing_data(ga_data)
        # fill up non existent years
        start_year = Date.parse(UA_START_DATE).year
        current_year = Date.today.year

        ga_data.each_key do |acronym|
          ga_data[acronym] ||= {}
          (start_year..current_year).each do |y|
            ga_data[acronym][y.to_s] = {} unless ga_data[acronym].key?(y.to_s)
          end
          # fill up non existent months with zeros
          ga_data[acronym].each_value do |months|
            next unless months.is_a?(Hash)

            (1..12).each { |n| months[n.to_s] = 0 unless months.key?(n.to_s) }
          end
        end
      end

      # Returns a copy ordered by key, then numerically by year and by month.
      def sort_ga_data(ga_data)
        ga_data.transform_values { |value|
          value.transform_values { |val|
            val.sort_by { |key, _| key.to_i }.to_h
          }.sort_by { |k, _| k.to_i }.to_h
        }.sort.to_h
      end

    end
  end
end
+ @logger.flush + time = Benchmark.realtime do + @logger.info "Fetching all ontology acronyms from backend..." + @logger.flush + save = {} + @old_data = read_old_data + @analytics_objects.each do |analytic_object| + analytic_object = analytic_object.new(start_date: detect_latest_date, old_data: @old_data) + new_data = analytic_object.full_data(@logger, @ga_conn, @ua_conn) + save[analytic_object.redis_field] = new_data + redis.set(analytic_object.redis_field, Marshal.dump(new_data)) + end + save_data(save) + end + @logger.info "Completed Google Analytics refresh in #{(time / 60).round(1)} minutes." + @logger.flush + end + + private + def read_old_data + return {} unless File.exists?(@data_file) && !File.zero?(@data_file) + JSON.parse(File.read(@data_file)) + end + + def detect_latest_date + begin + input_date = Date.parse(@old_data['latest_date_save']).prev_month(6) + start_of_month = Date.new(input_date.year, input_date.month, 1) + start_of_month.to_s + rescue + nil + end + + end + + def save_data(new_data) + new_data["latest_date_save"] = Date.today.to_s + # Ensure the directory exists before creating the file + FileUtils.mkdir_p(File.dirname(@data_file)) + # Open the file with 'w+' mode to create if not exist and write + File.open(@data_file, 'w+') do |f| + f.write(new_data.to_json) + end + end + end + + end +end diff --git a/lib/ncbo_cron/analytics/ontology_visits_analytics.rb b/lib/ncbo_cron/analytics/ontology_visits_analytics.rb new file mode 100644 index 00000000..6aa64e79 --- /dev/null +++ b/lib/ncbo_cron/analytics/ontology_visits_analytics.rb @@ -0,0 +1,110 @@ +require 'logger' +require 'json' +require 'benchmark' +require_relative 'object_analytics' + +module NcboCron + module Models + + class OntologyVisitsAnalytics < ObjectAnalytics + + ONTOLOGY_ANALYTICS_REDIS_FIELD = 'ontology_analytics' + + def initialize(start_date: , old_data: {}) + super(redis_field: ONTOLOGY_ANALYTICS_REDIS_FIELD, start_date: start_date, old_data: old_data) + @ont_acronyms = 
LinkedData::Models::Ontology.where.include(:acronym).all.map { |o| o.acronym } + @ont_acronyms = ['AGROVOC', 'E-PHY', 'CROPUSAGE'] + end + + + def fetch_ua_object_analytics(logger, ua_conn) + @logger = logger + @ua_conn = ua_conn + aggregated_results = Hash.new + start_year = Date.parse(UA_START_DATE).year || 2013 + filter_str = (@analytics_filter.nil? || @analytics_filter.empty?) ? "" : ";#{@analytics_filter}" + + @ont_acronyms.each do |acronym| + max_results = 10000 + start_index = 1 + loop do + results = @ua_conn.run_request( + metrics: ['pageviews'], + dimensions: %w[pagePath year month], + filters: [['pagePath', "~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}"]], + start_index: start_index, + max_results: max_results, + dates_ranges: [UA_START_DATE, Date.today.to_s], + sort: %w[year month] + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + aggregated_results[acronym] = Hash.new unless aggregated_results.has_key?(acronym) + aggregate_results(aggregated_results[acronym], results.rows) + + if num_results < max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? 
+ aggregated_results[acronym][y.to_s] = Hash.new unless aggregated_results[acronym].has_key?(y.to_s) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } + break + end + end + end + sort_ga_data(aggregated_results) + end + + def fetch_object_analytics(logger, ga_conn) + @logger = logger + @ga_conn = ga_conn + + aggregated_results = Hash.new + max_results = 10000 + + @ont_acronyms.each do |acronym| + start_index = 0 + filer_regex = "^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$" + + loop do + response = @ga_conn.run_request( + date_ranges: [[@start_date, Date.parse(GA4_START_DATE)].max.to_s, Date.today.to_s], + metrics: ['screenPageViews'], + dimensions: %w[pagePath year month], + order_bys: %w[year month], + dimension_filter: ['pagePath', filer_regex], + offset: start_index, + limit: max_results + ) + + response.rows ||= [] + num_results = response.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + start_index += max_results + results = [] + + response.rows.each do |row| + row_h = row.to_h + year_month_hits = row_h[:dimension_values].map.with_index { + |v, i| i > 0 ? 
v[:value].to_s : row_h[:metric_values][0][:value].to_s + }.rotate(1) + results << ([acronym] + year_month_hits) + end + aggregated_results[acronym] = Hash.new unless aggregated_results.has_key?(acronym) + aggregate_results(aggregated_results[acronym], results) + break if num_results < max_results + end + end + aggregated_results + end + + end + + end +end diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb deleted file mode 100644 index c5a4de00..00000000 --- a/lib/ncbo_cron/ontology_analytics.rb +++ /dev/null @@ -1,233 +0,0 @@ -require 'logger' -require 'json' -require 'benchmark' -require 'google/analytics/data' - - -module NcboCron - module Models - - class OntologyAnalytics - ONTOLOGY_ANALYTICS_REDIS_FIELD = 'ontology_analytics' - UA_START_DATE = '2013-10-01' - GA4_START_DATE = '2023-06-01' - - def initialize(logger) - @logger = logger - end - - def run - redis = Redis.new(:host => LinkedData.settings.ontology_analytics_redis_host, :port => LinkedData.settings.ontology_analytics_redis_port) - ontology_analytics = fetch_ontology_analytics - File.open(NcboCron.settings.analytics_path_to_ga_data_file, 'w') do |f| - f.write(ontology_analytics.to_json) - end - redis.set(ONTOLOGY_ANALYTICS_REDIS_FIELD, Marshal.dump(ontology_analytics)) - end - - def fetch_ontology_analytics - @logger.info "Starting Google Analytics refresh..." - @logger.flush - full_data = nil - - time = Benchmark.realtime do - max_results = 10000 - aggregated_results = Hash.new - - @logger.info "Fetching all ontology acronyms from backend..." - @logger.flush - ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym} - # ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"] - @logger.info "Authenticating with the Google Analytics Endpoint..." 
- @logger.flush - google_client = authenticate_google - - date_range = Google::Analytics::Data::V1beta::DateRange.new( - start_date: GA4_START_DATE, - end_date: Date.today.to_s - ) - metrics_page_views = Google::Analytics::Data::V1beta::Metric.new( - name: "screenPageViews" - ) - dimension_path = Google::Analytics::Data::V1beta::Dimension.new( - name: "pagePath" - ) - dimension_year = Google::Analytics::Data::V1beta::Dimension.new( - name: "year" - ) - dimension_month = Google::Analytics::Data::V1beta::Dimension.new( - name: "month" - ) - string_filter = Google::Analytics::Data::V1beta::Filter::StringFilter.new( - match_type: Google::Analytics::Data::V1beta::Filter::StringFilter::MatchType::FULL_REGEXP - ) - filter = Google::Analytics::Data::V1beta::Filter.new( - field_name: "pagePath", - string_filter: string_filter - ) - filter_expression = Google::Analytics::Data::V1beta::FilterExpression.new( - filter: filter - ) - order_year = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new( - dimension_name: "year" - ) - orderby_year = Google::Analytics::Data::V1beta::OrderBy.new( - desc: false, - dimension: order_year - ) - order_month = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new( - dimension_name: "month" - ) - orderby_month = Google::Analytics::Data::V1beta::OrderBy.new( - desc: false, - dimension: order_month - ) - @logger.info "Fetching GA4 analytics for all ontologies..." 
- @logger.flush - - ont_acronyms.each do |acronym| - start_index = 0 - string_filter.value = "^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$" - - loop do - request = Google::Analytics::Data::V1beta::RunReportRequest.new( - property: "properties/#{NcboCron.settings.analytics_property_id}", - metrics: [metrics_page_views], - dimension_filter: filter_expression, - dimensions: [dimension_path, dimension_year, dimension_month], - date_ranges: [date_range], - order_bys: [orderby_year, orderby_month], - offset: start_index, - limit: max_results - ) - response = google_client.run_report request - - response.rows ||= [] - start_index += max_results - num_results = response.rows.length - @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" - @logger.flush - - response.rows.each do |row| - row_h = row.to_h - year_month_hits = row_h[:dimension_values].map.with_index { - |v, i| i > 0 ? v[:value].to_i.to_s : row_h[:metric_values][0][:value].to_i - }.rotate(1) - - if aggregated_results.has_key?(acronym) - # year - if aggregated_results[acronym].has_key?(year_month_hits[0]) - # month - if aggregated_results[acronym][year_month_hits[0]].has_key?(year_month_hits[1]) - aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] += year_month_hits[2] - else - aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] - end - else - aggregated_results[acronym][year_month_hits[0]] = Hash.new - aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] - end - else - aggregated_results[acronym] = Hash.new - aggregated_results[acronym][year_month_hits[0]] = Hash.new - aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2] - end - end - break if num_results < max_results - end # loop - end # ont_acronyms - @logger.info "Refresh complete" - @logger.flush - full_data = merge_and_fill_missing_data(aggregated_results) - end # Benchmark.realtime - 
@logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes." - @logger.flush - full_data - end - - def merge_and_fill_missing_data(ga4_data) - ua_data = {} - - if File.exists?(NcboCron.settings.analytics_path_to_ua_data_file) && - !File.zero?(NcboCron.settings.analytics_path_to_ua_data_file) - @logger.info "Merging GA4 and UA data..." - @logger.flush - ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file) - ua_data = JSON.parse(ua_data_file) - ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s - ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s - - # add up hits for June of 2023 (the only intersecting month between UA and GA4) - ua_data.each do |acronym, _| - if ga4_data.has_key?(acronym) - if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month) - ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] += - ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] - # delete data for June of 2023 from ga4_data to avoid overwriting when merging - ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month) - end - end - end - end - - # merge ua and ga4 data - merged_data = ua_data.deep_merge(ga4_data) - # fill missing years and months - @logger.info "Filling in missing years data..." - @logger.flush - fill_missing_data(merged_data) - # sort acronyms, years and months - @logger.info "Sorting final data..." - @logger.flush - sort_ga_data(merged_data) - end - - def fill_missing_data(ga_data) - # fill up non existent years - start_year = Date.parse(UA_START_DATE).year - - ga_data.each do |acronym, _| - (start_year..Date.today.year).each do |y| - ga_data[acronym] = Hash.new if ga_data[acronym].nil? 
- ga_data[acronym][y.to_s] = Hash.new unless ga_data[acronym].has_key?(y.to_s) - end - # fill up non existent months with zeros - (1..12).each { |n| ga_data[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } - end - end - - def sort_ga_data(ga_data) - ga_data.transform_values { |value| - value.transform_values { |val| - val.sort_by { |key, _| key.to_i }.to_h - }.sort_by { |k, _| k.to_i }.to_h - }.sort.to_h - end - - def authenticate_google - Google::Analytics::Data.analytics_data do |config| - config.credentials = NcboCron.settings.analytics_path_to_key_file - end - end - end # class - - end -end - -class ::Hash - def deep_merge(second) - merger = proc { |key, v1, v2| Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : v2 } - self.merge(second, &merger) - end -end - -# require 'ontologies_linked_data' -# require 'goo' -# require 'ncbo_annotator' -# require 'ncbo_cron/config' -# require_relative '../../config/config' -# # ontology_analytics_log_path = File.join("logs", "ontology-analytics.log") -# # ontology_analytics_logger = Logger.new(ontology_analytics_log_path) -# ontology_analytics_logger = Logger.new(STDOUT) -# NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run -# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *' From 08f5e4228f9f7fbc5bb7a6da020213968e77245a Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Wed, 27 Dec 2023 13:31:51 +0100 Subject: [PATCH 59/62] add user analytics fetching the monthly user visits count --- .../analytics/user_visits_analytics.rb | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 lib/ncbo_cron/analytics/user_visits_analytics.rb diff --git a/lib/ncbo_cron/analytics/user_visits_analytics.rb b/lib/ncbo_cron/analytics/user_visits_analytics.rb new file mode 100644 index 
00000000..793bf0ca --- /dev/null +++ b/lib/ncbo_cron/analytics/user_visits_analytics.rb @@ -0,0 +1,94 @@ +require 'logger' +require 'json' +require 'benchmark' +require_relative 'object_analytics' + +module NcboCron + module Models + class UsersVisitsAnalytics < ObjectAnalytics + def initialize(start_date: , old_data: {}) + super(redis_field: 'user_analytics', start_date: start_date, old_data: old_data) + end + + private + + def fetch_object_analytics(logger, ga_conn) + @logger = logger + @ga_conn = ga_conn + + aggregated_results = Hash.new + max_results = 10000 + + + + start_index = 0 + loop do + response = @ga_conn.run_request( + date_ranges: [[@start_date, Date.parse(GA4_START_DATE)].max.to_s, Date.today.to_s], + metrics: ['newUsers'], + dimensions: %w[year month], + order_bys: %w[year month], + dimension_filter: [], + offset: start_index, + limit: max_results + ) + + response.rows ||= [] + num_results = response.rows.length + @logger.info "Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + start_index += max_results + results = [] + response.rows.each do |row| + row_h = row.to_h + year_month_hits = row_h[:dimension_values].map{ |x| x[:value] } + [row_h[:metric_values].first[:value]] + results << ([-1] + year_month_hits) + end + aggregate_results(aggregated_results, results) + break if num_results < max_results + + end + {"all_users" => aggregated_results} + end + + def fetch_ua_object_analytics(logger, ua_conn) + + aggregated_results = Hash.new + start_year = Date.parse(UA_START_DATE).year || 2013 + + max_results = 10000 + start_index = 1 + loop do + results = ua_conn.run_request( + metrics: ['newUsers'], + dimensions: %w[year month], + filters: [], + start_index: start_index, + max_results: max_results, + dates_ranges: [UA_START_DATE, Date.today.to_s], + sort: %w[year month] + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + logger.info "Results: #{num_results}, Start Index: #{start_index}" + 
logger.flush + aggregate_results(aggregated_results, results.rows.map{|row| [-1] + row}) + + if num_results < max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results = Hash.new if aggregated_results.nil? + aggregated_results[y.to_s] = Hash.new unless aggregated_results.has_key?(y.to_s) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results.values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } + break + end + end + { "all_users" => aggregated_results} + end + end + end +end + From 6f21df6f2640853617b072f64638a7fea293963c Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Wed, 27 Dec 2023 13:32:50 +0100 Subject: [PATCH 60/62] add page visits analytics fetching last month most visited pages --- Gemfile | 4 +- Gemfile.lock | 49 +++++++------- docker-compose.yml | 4 ++ .../analytics/page_visits_analytics.rb | 65 +++++++++++++++++++ 4 files changed, 97 insertions(+), 25 deletions(-) create mode 100644 lib/ncbo_cron/analytics/page_visits_analytics.rb diff --git a/Gemfile b/Gemfile index 7003d3ac..1d7413db 100644 --- a/Gemfile +++ b/Gemfile @@ -25,9 +25,9 @@ gem 'request_store' # Monitoring gem 'cube-ruby', require: 'cube' -gem 'goo', github: 'ontoportal-lirmm/goo', branch: 'development' +gem 'goo', github: 'ontoportal-lirmm/goo', branch: 'master' gem 'sparql-client', github: 'ontoportal-lirmm/sparql-client', branch: 'master' -gem 'ontologies_linked_data', github: 'ontoportal-lirmm/ontologies_linked_data', branch: 'development' +gem 'ontologies_linked_data', github: 'ontoportal-lirmm/ontologies_linked_data', branch: 'master' gem 'ncbo_annotator', github: 'ontoportal-lirmm/ncbo_annotator', branch: 'master' # Testing group :test do diff --git a/Gemfile.lock b/Gemfile.lock index ec0671de..d752fe7d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ GIT remote: https://github.com/ontoportal-lirmm/goo.git - revision: 69466682c1e9cb2c338539195013dbec9714ca7d - branch: 
development + revision: 03da25b671d2ffa515b5dce51c6bd35980ae60c7 + branch: master specs: goo (0.0.2) addressable (~> 2.8) @@ -26,8 +26,8 @@ GIT GIT remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git - revision: 1204ede68ed0a5af5e3fb355172496d5e0134544 - branch: development + revision: e98b884999e5ce917a8be5fdc37f7b4797a1559e + branch: master specs: ontologies_linked_data (0.0.1) activesupport @@ -77,7 +77,8 @@ GEM addressable (2.8.6) public_suffix (>= 2.0.2, < 6.0) base64 (0.2.0) - bcrypt (3.1.19) + bcrypt (3.1.20) + bigdecimal (3.1.5) binding_of_caller (1.0.0) debug_inspector (>= 0.0.1) builder (3.2.4) @@ -86,14 +87,15 @@ GEM connection_pool (2.4.1) cube-ruby (0.0.3) dante (0.2.0) - debug_inspector (1.1.0) + debug_inspector (1.2.0) + declarative (0.0.20) docile (1.4.0) domain_name (0.6.20231109) email_spec (2.1.1) htmlentities (~> 4.3.3) launchy (~> 2.1) mail (~> 2.6) - faraday (2.7.12) + faraday (2.8.1) base64 faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) @@ -101,7 +103,7 @@ GEM faraday-retry (2.2.0) faraday (~> 2.0) ffi (1.16.3) - gapic-common (0.20.0) + gapic-common (0.21.1) faraday (>= 1.9, < 3.a) faraday-retry (>= 1.0, < 3.a) google-protobuf (~> 3.18) @@ -132,25 +134,22 @@ GEM google-cloud-env (2.1.0) faraday (>= 1.0, < 3.a) google-cloud-errors (1.3.1) - google-protobuf (3.24.4-x86_64-darwin) - google-protobuf (3.24.4-x86_64-linux) + google-protobuf (3.25.1-x86_64-darwin) googleapis-common-protos (1.4.0) google-protobuf (~> 3.14) googleapis-common-protos-types (~> 1.2) grpc (~> 1.27) - googleapis-common-protos-types (1.9.0) + googleapis-common-protos-types (1.11.0) google-protobuf (~> 3.18) - googleauth (1.8.1) - faraday (>= 0.17.3, < 3.a) + googleauth (1.9.1) + faraday (>= 1.0, < 3.a) + google-cloud-env (~> 2.1) jwt (>= 1.4, < 3.0) multi_json (~> 1.11) os (>= 0.9, < 2.0) signet (>= 0.16, < 2.a) - grpc (1.58.0-x86_64-darwin) - google-protobuf (~> 3.23) - googleapis-common-protos-types (~> 1.0) - grpc (1.59.2-x86_64-linux) - 
google-protobuf (~> 3.24) + grpc (1.60.0-x86_64-darwin) + google-protobuf (~> 3.25) googleapis-common-protos-types (~> 1.0) htmlentities (4.3.4) http-accept (1.7.0) @@ -173,7 +172,8 @@ GEM method_source (1.0.0) mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2023.1003) + mime-types-data (3.2023.1205) + mini_mime (1.1.5) minitest (4.7.5) mlanett-redis-lock (0.2.7) redis @@ -200,8 +200,12 @@ GEM addressable (>= 2.2) redis (5.0.8) redis-client (>= 0.17.0) - redis-client (0.18.0) + redis-client (0.19.1) connection_pool + representable (3.2.0) + declarative (< 0.1.0) + trailblazer-option (>= 0.1.1, < 0.2.0) + uber (< 0.2.0) request_store (1.5.1) rack (>= 1.4) rest-client (2.1.0) @@ -247,8 +251,7 @@ GEM webrick (1.8.1) PLATFORMS - x86_64-darwin-21 - x86_64-linux + x86_64-darwin-23 DEPENDENCIES binding_of_caller (~> 1.0) @@ -279,4 +282,4 @@ DEPENDENCIES test-unit-minitest BUNDLED WITH - 2.3.15 + 2.4.21 diff --git a/docker-compose.yml b/docker-compose.yml index 783d33f2..9f5d628d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -95,6 +95,8 @@ services: && 4s-httpd -D -s-1 -p 9000 ontoportal_kb" profiles: - 4store + ports: + - 9000:9000 solr-ut: image: solr:8 @@ -122,6 +124,8 @@ services: interval: 10s timeout: 5s retries: 5 + ports: + - 55556:55555 agraph-ut: image: franzinc/agraph:v8.0.0 diff --git a/lib/ncbo_cron/analytics/page_visits_analytics.rb b/lib/ncbo_cron/analytics/page_visits_analytics.rb new file mode 100644 index 00000000..8ecd21c0 --- /dev/null +++ b/lib/ncbo_cron/analytics/page_visits_analytics.rb @@ -0,0 +1,65 @@ +require 'logger' +require 'json' +require 'benchmark' +require_relative 'object_analytics' + +module NcboCron + module Models + class PageVisitsAnalytics < ObjectAnalytics + def initialize(start_date: Date.today.prev_month, old_data: {}) + super(redis_field: 'pages_analytics', start_date: start_date, old_data: { }) + end + + private + + def fetch_object_analytics(logger, ga_conn) + @logger = logger + @ga_conn = 
ga_conn + + aggregated_results = Hash.new + max_results = 10000 + + start_index = 0 + loop do + response = @ga_conn.run_request( + date_ranges: [[@start_date, Date.parse(GA4_START_DATE)].max.to_s, Date.today.to_s], + metrics: ['screenPageViews'], + dimensions: %w[pagePathPlusQueryString], + order_bys: %w[screenPageViews], + dimension_filter: [], + offset: start_index, + limit: max_results + ) + + response.rows ||= [] + num_results = response.rows.length + @logger.info "Results: #{num_results}, Start Index: #{start_index}" + @logger.flush + start_index += max_results + results = [] + aggregated_results = {} + response.rows.each do |row| + row_h = row.to_h + year_month_hits = row_h[:dimension_values].map{ |x| x[:value] } + [row_h[:metric_values].first[:value]] + results << year_month_hits + page_count = year_month_hits[1].to_i + page_path = year_month_hits[0] + page_path = year_month_hits[0].chop if page_path.end_with?('/') && !page_path.eql?('/') + if page_count >= 10 + old_page_count = aggregated_results[page_path] || 0 + aggregated_results[page_path] = old_page_count + page_count + end + end + + break if num_results < max_results + end + {"all_pages" => aggregated_results } + end + + def fetch_ua_object_analytics(logger, ua_conn) + {"all_pages" => {} } # we fetch only the current month views UA is at least 6 month past + end + end + end +end + From 080629e5a0a0098bb0021260e07b7bc59d2a6735 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Wed, 27 Dec 2023 15:22:59 +0100 Subject: [PATCH 61/62] extract google analytics UA import code to a script to make current code clean of it --- Gemfile | 2 +- bin/import_google_ua_analytics_data | 235 ++++++++++++++++++ bin/ncbo_ontology_analytics_rebuild | 8 +- config/config.rb.sample | 5 +- lib/ncbo_cron/analytics/object_analytics.rb | 128 +++------- .../analytics/object_analytics_job.rb | 25 +- .../analytics/ontology_visits_analytics.rb | 46 ---- .../analytics/page_visits_analytics.rb | 7 +- 
.../analytics/user_visits_analytics.rb | 36 --- 9 files changed, 283 insertions(+), 209 deletions(-) create mode 100755 bin/import_google_ua_analytics_data diff --git a/Gemfile b/Gemfile index 1d7413db..54646cf4 100644 --- a/Gemfile +++ b/Gemfile @@ -5,7 +5,7 @@ gemspec gem 'ffi' # This is needed temporarily to pull the Google Universal Analytics (UA) -# data and store it in a file. See (bin/generate_ua_analytics_file.rb) +# data and store it in a file. See (bin/import_google_ua_analytics_data) # The ability to pull this data from Google will cease on July 1, 2024 gem "google-apis-analytics_v3" diff --git a/bin/import_google_ua_analytics_data b/bin/import_google_ua_analytics_data new file mode 100755 index 00000000..eadfd936 --- /dev/null +++ b/bin/import_google_ua_analytics_data @@ -0,0 +1,235 @@ +#!/usr/bin/env ruby +require 'logger' +require 'optparse' +require 'google/apis/analytics_v3' +require 'google/api_client/auth/key_utils' +require_relative '../lib/ncbo_cron/analytics/object_analytics_job' + +module NcboCron + module Models + + class OntologyVisitsAnalytics + def fetch_ua_object_analytics(logger, ua_conn) + @logger = logger + @ua_conn = ua_conn + aggregated_results = Hash.new + start_year = @start_date.year || 2013 + filter_str = (@analytics_filter.nil? || @analytics_filter.empty?) ? 
"" : ";#{@analytics_filter}" + + @ont_acronyms.each do |acronym| + max_results = 10000 + start_index = 1 + loop do + results = @ua_conn.run_request( + metrics: ['pageviews'], + dimensions: %w[pagePath year month], + filters: [['pagePath', "~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}"]], + start_index: start_index, + max_results: max_results, + dates_ranges: [@start_date, Date.today.to_s], + sort: %w[year month] + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" + aggregated_results[acronym] = Hash.new unless aggregated_results.has_key?(acronym) + aggregate_results(aggregated_results[acronym], results.rows) + + if num_results < max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? + aggregated_results[acronym][y.to_s] = Hash.new unless aggregated_results[acronym].has_key?(y.to_s) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } + break + end + end + end + sort_ga_data(aggregated_results) + end + end + + class UsersVisitsAnalytics + def fetch_ua_object_analytics(logger, ua_conn) + + aggregated_results = Hash.new + start_year = @start_date.year || 2013 + + max_results = 10000 + start_index = 1 + loop do + results = ua_conn.run_request( + metrics: ['newUsers'], + dimensions: %w[year month], + filters: [], + start_index: start_index, + max_results: max_results, + dates_ranges: [@start_date.to_s, Date.today.to_s], + sort: %w[year month] + ) + results.rows ||= [] + start_index += max_results + num_results = results.rows.length + logger.info "Results: #{num_results}, Start Index: #{start_index}" + + aggregate_results(aggregated_results, results.rows.map { |row| [-1] + row }) + + if num_results < 
max_results + # fill up non existent years + (start_year..Date.today.year).each do |y| + aggregated_results = Hash.new if aggregated_results.nil? + aggregated_results[y.to_s] = Hash.new unless aggregated_results.has_key?(y.to_s) + end + # fill up non existent months with zeros + (1..12).each { |n| aggregated_results.values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } + break + end + end + { "all_users" => aggregated_results } + end + end + end + module GoogleAnalyticsUAMigrator + # Old version of Google Analytics connector + class GoogleAnalyticsUAConnector + def initialize(app_id:, app_name:, app_version:, analytics_key_file:, app_user:, start_date:, analytics_filter:) + @app_id = app_id + @app_name = app_name + @app_version = app_version + @analytics_key_file = analytics_key_file + @app_user = app_user + @generated_file_path = NcboCron.settings.analytics_path_to_ga_data_file + @start_date = start_date + @analytics_filter = analytics_filter + @ga_client = authenticate_google + end + + def run_request(metrics:, dimensions:, filters:, start_index:, max_results:, dates_ranges:, sort:) + @ga_client.get_ga_data( + ids = @app_id, + start_date = dates_ranges.first, + end_date = dates_ranges.last, + metrics = metrics.map { |m| "ga:#{m}" }.join(','), + { + dimensions: dimensions.map { |d| "ga:#{d}" }.join(','), + filters: filters.empty? ? 
nil : filters.map { |f, v| "ga:#{f}=#{v}" }.join(','), + start_index: start_index, + max_results: max_results, + sort: sort.map { |d| "ga:#{d}" }.join(',') + } + ) + end + + private + + def authenticate_google + Google::Apis::ClientOptions.default.application_name = @app_name + Google::Apis::ClientOptions.default.application_version = @app_version + # enable google api call retries in order to + # minigate analytics processing failure due to occasional google api timeouts and other outages + Google::Apis::RequestOptions.default.retries = 5 + # uncoment to enable logging for debugging purposes + # Google::Apis.logger.level = Logger::DEBUG + # Google::Apis.logger = @logger + client = Google::Apis::AnalyticsV3::AnalyticsService.new + key = Google::APIClient::KeyUtils::load_from_pkcs12(@analytics_key_file, 'notasecret') + client.authorization = Signet::OAuth2::Client.new( + :token_credential_uri => 'https://accounts.google.com/o/oauth2/token', + :audience => 'https://accounts.google.com/o/oauth2/token', + :scope => 'https://www.googleapis.com/auth/analytics.readonly', + :issuer => @app_user, + :signing_key => key + ).tap { |auth| auth.fetch_access_token! } + client + end + end + + def self.run(logger, options) + @start_date = options[:start_date] + @ua_conn = GoogleAnalyticsUAConnector.new(options) + + logger.info "Fetching UA analytics for all ontologies from #{@start_date} to today..." + save = {} + analytics_to_migrate = [NcboCron::Models::OntologyVisitsAnalytics, + NcboCron::Models::UsersVisitsAnalytics] + analytics_to_migrate.each do |analytic_object| + analytic_object = analytic_object.new(start_date: @start_date) + ua_data = analytic_object.fetch_ua_object_analytics(logger, @ua_conn) + save[analytic_object.redis_field] = ua_data + end + new_ga_start_date = NcboCron::Models::ObjectAnalytics::GA4_START_DATE + NcboCron::Models::ObjectAnalyticsJob.new(logger).send(:save_data_in_file, save, new_ga_start_date) + logger.info "Completed Universal Analytics pull..." 
+ logger.close + end + end +end +require 'bundler/setup' +require_relative '../lib/ncbo_cron' +require_relative '../config/config' + + +# # Google Analytics UA config +options = { + app_id: nil, + app_name: nil, + app_version: nil, + analytics_key_file: nil, + app_user: nil, + start_date: nil, + analytics_filter: nil, + logfile: nil +} + +help_text = < e msg = "Failed rebuilding ontology analytics repository with exception: #{e.class}: #{e.message}\n#{e.backtrace.join("\n")}" logger.error(msg) diff --git a/config/config.rb.sample b/config/config.rb.sample index 79ce75de..10cf7e90 100644 --- a/config/config.rb.sample +++ b/config/config.rb.sample @@ -79,12 +79,11 @@ NcboCron.config do |config| # Google Analytics GA4 config config.analytics_path_to_key_file = "config/your_analytics_key.json" config.analytics_property_id = "123456789" - # path to the Universal Analytics data, which stopped collecting on June 1st, 2023 - config.analytics_path_to_ua_data_file = "data/your_ua_data.json" + # path to the file that will hold your Google Analytics data # this is in addition to storing it in Redis config.analytics_path_to_ga_data_file = "data/your_ga_data.json" - + config.analytics_start_date = '2015-01-01' # this is a Base64.encode64 encoded personal access token # you need to run Base64.decode64 on it before using it in your code # this is a workaround because Github does not allow storing access tokens in a repo diff --git a/lib/ncbo_cron/analytics/object_analytics.rb b/lib/ncbo_cron/analytics/object_analytics.rb index f5275f82..d4a7f3da 100644 --- a/lib/ncbo_cron/analytics/object_analytics.rb +++ b/lib/ncbo_cron/analytics/object_analytics.rb @@ -7,16 +7,14 @@ module NcboCron module Models - UA_START_DATE = '2013-10-01' - GA4_START_DATE = '2023-06-01' + + class GoogleAnalyticsConnector attr_reader :ga_client def initialize - @ga_data_file = NcboCron.settings.analytics_path_to_ga_data_file - @ua_data_file = NcboCron.settings.analytics_path_to_ua_data_file @app_id = 
NcboCron.settings.analytics_property_id @app_key_file = NcboCron.settings.analytics_path_to_key_file @ga_client = analytics_data_client @@ -89,63 +87,8 @@ def ga_order_by(dimension_name, desc = false) end - # Old version of Google Analytics - class GoogleAnalyticsUAConnector - def initialize - @app_id = NcboCron.settings.analytics_profile_id - @app_name = NcboCron.settings.analytics_app_name - @app_version = NcboCron.settings.analytics_app_version - @analytics_key_file = NcboCron.settings.ua_analytics_path_to_key_file - @app_user = NcboCron.settings.analytics_service_account_email_address - @generated_file_path = NcboCron.settings.analytics_path_to_ua_data_file - @start_date = NcboCron.settings.analytics_start_date - @analytics_filter = NcboCron.settings.analytics_filter_str - @ga_client = authenticate_google - end - - def run_request(metrics:, dimensions:, filters:, start_index:, max_results:, dates_ranges:, sort:) - @ga_client.get_ga_data( - ids = @app_id, - start_date = dates_ranges.first, - end_date = dates_ranges.last, - metrics = metrics.map { |m| "ga:#{m}" }.join(','), - { - dimensions: dimensions.map { |d| "ga:#{d}" }.join(','), - filters: filters.empty? ? 
nil : filters.map { |f, v| "ga:#{f}=#{v}" }.join(','), - start_index: start_index, - max_results: max_results, - sort: sort.map { |d| "ga:#{d}" }.join(',') - } - ) - end - - private - - def authenticate_google - Google::Apis::ClientOptions.default.application_name = @app_name - Google::Apis::ClientOptions.default.application_version = @app_version - # enable google api call retries in order to - # minigate analytics processing failure due to occasional google api timeouts and other outages - Google::Apis::RequestOptions.default.retries = 5 - # uncoment to enable logging for debugging purposes - # Google::Apis.logger.level = Logger::DEBUG - # Google::Apis.logger = @logger - client = Google::Apis::AnalyticsV3::AnalyticsService.new - key = Google::APIClient::KeyUtils::load_from_pkcs12(@analytics_key_file, 'notasecret') - client.authorization = Signet::OAuth2::Client.new( - :token_credential_uri => 'https://accounts.google.com/o/oauth2/token', - :audience => 'https://accounts.google.com/o/oauth2/token', - :scope => 'https://www.googleapis.com/auth/analytics.readonly', - :issuer => @app_user, - :signing_key => key - ).tap { |auth| auth.fetch_access_token! } - client - end - - end - class ObjectAnalytics - + GA4_START_DATE = '2023-06-01' attr_reader :redis_field def initialize(redis_field:, start_date:, old_data: {}) @@ -154,21 +97,12 @@ def initialize(redis_field:, start_date:, old_data: {}) @old_data = old_data[@redis_field] || {} end - def full_data(logger, ga_conn, ua_conn) + def full_data(logger, ga_conn) - logger.info "Fetching GA4 analytics for all ontologies from #{@start_date} to today..." + logger.info "Fetching GA4 analytics for #{@redis_field} from #{@start_date} to today..." logger.flush new_ga_data = fetch_object_analytics(logger, ga_conn) - if @start_date < Date.parse(GA4_START_DATE) - @old_data = {} - logger.info "Fetching UA analytics for all ontologies from #{@start_date} to today..." 
- logger.flush - ua_data = fetch_ua_object_analytics(logger, ua_conn) - logger.info "Completed Universal Analytics pull..." - logger.flush - new_ga_data = merge_and_fill_missing_data(new_ga_data, ua_data, logger) - end merge_and_fill_missing_data(new_ga_data, @old_data, logger) end @@ -177,47 +111,41 @@ def fetch_object_analytics(logger, ga_conn) raise NotImplementedError, "Subclasses must implement this method" end - # @param ua_conn GoogleAnalyticsUAConnector - def fetch_ua_object_analytics(logger, ua_conn) - raise NotImplementedError, "Subclasses must implement this method" - end - private def merge_and_fill_missing_data(new_data, old_data,logger, start_date = @start_date) - if !old_data.empty? - logger.info "Merging GA4 and UA data..." + if !new_data.empty? + logger.info "Merging old Google Analytics and the new data..." logger.flush - old_data.keys.each do |acronym| - (start_date.year..Date.today.year).each do |year| - year = year.to_s - # add up hits for June of 2023 (the only intersecting month between UA and GA4) - if old_data[acronym].has_key?(year) - next unless new_data[acronym].has_key?(year) - - (1..Date.today.month).each do |month| - month = month.to_s - old_data[acronym][year][month] ||= 0 - unless old_data[acronym][year][month].eql?(new_data[acronym][year][month]) - old_data[acronym][year][month] += (new_data[acronym][year][month] || 0) + new_data.keys.each do |acronym| + if old_data.has_key?(acronym) + (start_date.year..Date.today.year).each do |year| + year = year.to_s + if new_data[acronym].has_key?(year) + if old_data[acronym].has_key?(year) + (1..Date.today.month).each do |month| + month = month.to_s + old_data[acronym][year][month] ||= 0 + unless old_data[acronym][year][month].eql?(new_data[acronym][year][month]) + old_data[acronym][year][month] += (new_data[acronym][year][month] || 0) + end + end + else + old_data[acronym][year] = new_data[acronym][year] end end - - elsif new_data[acronym][year] - old_data[acronym][year] = 
new_data[acronym][year] end + else + old_data[acronym]= new_data[acronym] end end + # fill missing years and months + logger.info "Filling in missing years data..." old_data = fill_missing_data(old_data) - else - old_data = new_data end - # fill missing years and months - logger.info "Filling in missing years data..." - logger.flush - old_data # sort_ga_data(old_data) + old_data end def aggregate_results(aggregated_results, results) @@ -244,7 +172,7 @@ def aggregate_results(aggregated_results, results) def fill_missing_data(ga_data) # fill up non existent years - start_year = Date.parse(UA_START_DATE).year + start_year = @start_date.year ga_data.each do |acronym, _| (start_year..Date.today.year).each do |y| diff --git a/lib/ncbo_cron/analytics/object_analytics_job.rb b/lib/ncbo_cron/analytics/object_analytics_job.rb index af2e2885..8133aeb0 100644 --- a/lib/ncbo_cron/analytics/object_analytics_job.rb +++ b/lib/ncbo_cron/analytics/object_analytics_job.rb @@ -17,14 +17,12 @@ def initialize(logger) @redis_port = LinkedData.settings.ontology_analytics_redis_port @data_file = NcboCron.settings.analytics_path_to_ga_data_file - @ua_data_file = NcboCron.settings.analytics_path_to_ua_data_file - @ga_conn = GoogleAnalyticsConnector.new - @ua_conn = GoogleAnalyticsUAConnector.new + @logger = logger @logger.info "Authenticating with the Google Analytics Endpoint..." - @logger.flush + @ga_conn = GoogleAnalyticsConnector.new @analytics_objects = [ NcboCron::Models::OntologyVisitsAnalytics, @@ -33,23 +31,22 @@ def initialize(logger) ] end - # @param analytics_objects ObjectAnalytics[] def run redis = Redis.new(:host => @redis_host, :port => @redis_port) @logger.info "Starting Google Analytics refresh..." - @logger.flush time = Benchmark.realtime do - @logger.info "Fetching all ontology acronyms from backend..." - @logger.flush + @logger.info "Fetching saved analytics data..." 
save = {} @old_data = read_old_data @analytics_objects.each do |analytic_object| analytic_object = analytic_object.new(start_date: detect_latest_date, old_data: @old_data) - new_data = analytic_object.full_data(@logger, @ga_conn, @ua_conn) + @logger.info "Start fetching new #{analytic_object.redis_field} data..." + new_data = analytic_object.full_data(@logger, @ga_conn) save[analytic_object.redis_field] = new_data redis.set(analytic_object.redis_field, Marshal.dump(new_data)) + @logger.info "Completed fetching #{analytic_object.redis_field} data..." end - save_data(save) + save_data_in_file(save) end @logger.info "Completed Google Analytics refresh in #{(time / 60).round(1)} minutes." @logger.flush @@ -72,12 +69,12 @@ def detect_latest_date end - def save_data(new_data) - new_data["latest_date_save"] = Date.today.to_s + def save_data_in_file(new_data, saved_date = Date.today.to_s, data_file = @data_file) + new_data["latest_date_save"] = saved_date # Ensure the directory exists before creating the file - FileUtils.mkdir_p(File.dirname(@data_file)) + FileUtils.mkdir_p(File.dirname(data_file)) # Open the file with 'w+' mode to create if not exist and write - File.open(@data_file, 'w+') do |f| + File.open(data_file, 'w+') do |f| f.write(new_data.to_json) end end diff --git a/lib/ncbo_cron/analytics/ontology_visits_analytics.rb b/lib/ncbo_cron/analytics/ontology_visits_analytics.rb index 6aa64e79..41d6edfd 100644 --- a/lib/ncbo_cron/analytics/ontology_visits_analytics.rb +++ b/lib/ncbo_cron/analytics/ontology_visits_analytics.rb @@ -13,51 +13,6 @@ class OntologyVisitsAnalytics < ObjectAnalytics def initialize(start_date: , old_data: {}) super(redis_field: ONTOLOGY_ANALYTICS_REDIS_FIELD, start_date: start_date, old_data: old_data) @ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map { |o| o.acronym } - @ont_acronyms = ['AGROVOC', 'E-PHY', 'CROPUSAGE'] - end - - - def fetch_ua_object_analytics(logger, ua_conn) - @logger = logger - @ua_conn = 
ua_conn - aggregated_results = Hash.new - start_year = Date.parse(UA_START_DATE).year || 2013 - filter_str = (@analytics_filter.nil? || @analytics_filter.empty?) ? "" : ";#{@analytics_filter}" - - @ont_acronyms.each do |acronym| - max_results = 10000 - start_index = 1 - loop do - results = @ua_conn.run_request( - metrics: ['pageviews'], - dimensions: %w[pagePath year month], - filters: [['pagePath', "~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}"]], - start_index: start_index, - max_results: max_results, - dates_ranges: [UA_START_DATE, Date.today.to_s], - sort: %w[year month] - ) - results.rows ||= [] - start_index += max_results - num_results = results.rows.length - @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}" - @logger.flush - aggregated_results[acronym] = Hash.new unless aggregated_results.has_key?(acronym) - aggregate_results(aggregated_results[acronym], results.rows) - - if num_results < max_results - # fill up non existent years - (start_year..Date.today.year).each do |y| - aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil? 
- aggregated_results[acronym][y.to_s] = Hash.new unless aggregated_results[acronym].has_key?(y.to_s) - end - # fill up non existent months with zeros - (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } - break - end - end - end - sort_ga_data(aggregated_results) end def fetch_object_analytics(logger, ga_conn) @@ -103,7 +58,6 @@ def fetch_object_analytics(logger, ga_conn) end aggregated_results end - end end diff --git a/lib/ncbo_cron/analytics/page_visits_analytics.rb b/lib/ncbo_cron/analytics/page_visits_analytics.rb index 8ecd21c0..b2354c00 100644 --- a/lib/ncbo_cron/analytics/page_visits_analytics.rb +++ b/lib/ncbo_cron/analytics/page_visits_analytics.rb @@ -7,7 +7,7 @@ module NcboCron module Models class PageVisitsAnalytics < ObjectAnalytics def initialize(start_date: Date.today.prev_month, old_data: {}) - super(redis_field: 'pages_analytics', start_date: start_date, old_data: { }) + super(redis_field: 'pages_analytics', start_date: Date.today.prev_month, old_data: { }) end private @@ -56,8 +56,9 @@ def fetch_object_analytics(logger, ga_conn) {"all_pages" => aggregated_results } end - def fetch_ua_object_analytics(logger, ua_conn) - {"all_pages" => {} } # we fetch only the current month views UA is at least 6 month past + # we don't the missing data in this case + def fill_missing_data(ga_data) + ga_data end end end diff --git a/lib/ncbo_cron/analytics/user_visits_analytics.rb b/lib/ncbo_cron/analytics/user_visits_analytics.rb index 793bf0ca..9b2b6631 100644 --- a/lib/ncbo_cron/analytics/user_visits_analytics.rb +++ b/lib/ncbo_cron/analytics/user_visits_analytics.rb @@ -51,43 +51,7 @@ def fetch_object_analytics(logger, ga_conn) {"all_users" => aggregated_results} end - def fetch_ua_object_analytics(logger, ua_conn) - aggregated_results = Hash.new - start_year = Date.parse(UA_START_DATE).year || 2013 - - max_results = 10000 - start_index = 1 - loop do - results = ua_conn.run_request( - metrics: 
['newUsers'], - dimensions: %w[year month], - filters: [], - start_index: start_index, - max_results: max_results, - dates_ranges: [UA_START_DATE, Date.today.to_s], - sort: %w[year month] - ) - results.rows ||= [] - start_index += max_results - num_results = results.rows.length - logger.info "Results: #{num_results}, Start Index: #{start_index}" - logger.flush - aggregate_results(aggregated_results, results.rows.map{|row| [-1] + row}) - - if num_results < max_results - # fill up non existent years - (start_year..Date.today.year).each do |y| - aggregated_results = Hash.new if aggregated_results.nil? - aggregated_results[y.to_s] = Hash.new unless aggregated_results.has_key?(y.to_s) - end - # fill up non existent months with zeros - (1..12).each { |n| aggregated_results.values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } } - break - end - end - { "all_users" => aggregated_results} - end end end end From 1154c7c8e621bb0edecaa55404f0daa2bae887f4 Mon Sep 17 00:00:00 2001 From: Syphax Bouazzouni Date: Wed, 27 Dec 2023 18:00:46 +0100 Subject: [PATCH 62/62] add option to force submission archiving even if already archived --- bin/ncbo_ontology_archive_old_submissions | 13 +++++++++---- lib/ncbo_cron/ontology_helper.rb | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bin/ncbo_ontology_archive_old_submissions b/bin/ncbo_ontology_archive_old_submissions index 1b2268a5..34fe073c 100755 --- a/bin/ncbo_ontology_archive_old_submissions +++ b/bin/ncbo_ontology_archive_old_submissions @@ -13,11 +13,15 @@ abort("Please create a config/config.rb file using the config/config.rb.sample a require_relative '../config/config' require 'optparse' -options = { delete: false } +options = { delete: false , force_archiving: false} opt_parser = OptionParser.new do |opts| # Set a banner, displayed at the top of the help screen. 
opts.banner = "Usage: #{File.basename(__FILE__)} [options]" + opts.on('-f', '--force-re-archiving', 'Force to re-archive already archived submissions.') do + options[:force_archiving] = true + end + options[:logfile] = STDOUT opts.on( '-l', '--logfile FILE', "Write log to FILE (default is STDOUT)" ) do |filename| options[:logfile] = filename @@ -40,12 +44,12 @@ logfile = options[:logfile] if File.file?(logfile); File.delete(logfile); end logger = Logger.new(logfile) process_actions = { process_rdf: false, generate_labels: false, index_search: false, index_commit: false, - process_annotator: false, diff: false, run_metrics: false, archive: true } + process_annotator: false, diff: false, run_metrics: false, archive: true } onts = LinkedData::Models::Ontology.all onts.each { |ont| ont.bring(:acronym, :submissions) } onts.sort! { |a, b| a.acronym <=> b.acronym } bad_submissions = {} - +force_archiving = options[:force_archiving] onts.each do |ont| latest_sub = ont.latest_submission @@ -73,7 +77,7 @@ onts.each do |ont| } old_subs.sort! { |a, b| a.submissionId <=> b.submissionId } old_subs.each do |sub| - unless sub.archived? + unless sub.archived? || force_archiving msg = "#{ont.acronym}: found un-archived old submission with ID #{sub.submissionId}." puts msg logger.info msg @@ -119,6 +123,7 @@ onts.each do |ont| end end + puts if bad_submissions.empty? diff --git a/lib/ncbo_cron/ontology_helper.rb b/lib/ncbo_cron/ontology_helper.rb index 42534768..4d1f0716 100644 --- a/lib/ncbo_cron/ontology_helper.rb +++ b/lib/ncbo_cron/ontology_helper.rb @@ -9,6 +9,7 @@ module OntologyHelper PROCESS_ACTIONS = { :process_rdf => true, :generate_labels => true, + :extract_metadata => true, :index_search => true, :index_properties => true, :run_metrics => true,