Skip to content

Commit

Permalink
Reverse the ID value (#543)
Browse files Browse the repository at this point in the history
* Reverse the ID value

This will give us an even b-tree distribution in fedora, which will
improve performance as the repository grows.

"ModeShape has been designed to efficiently handle a single node having a
large number (100K-200k) of child nodes out-of-the-box. However, if an
application stores a lot more nodes than that under a single parent,
performance in terms of latency and memory consumption starts to
decrease proportionally to the number of stored children."
Source:
https://docs.jboss.org/author/display/MODE40/Large+numbers+of+child+nodes#Largenumbersofchildnodes-Unorderedlargecollections

Since most (all?) of the UCLA ark identifiers start with the same
string, unless we reverse the ID values we'll have everything stored
under the same node in Fedora and hit performance issues when the system
reaches > 200k objects.

This PR also adds unhappy path tests to ARK-->ID generation

* Update ARK to be stored as a single valued symbol
  • Loading branch information
bess authored Apr 4, 2019
1 parent 751f9cc commit adb1cf5
Show file tree
Hide file tree
Showing 14 changed files with 47 additions and 19 deletions.
4 changes: 2 additions & 2 deletions app/controllers/catalog_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def self.modified_field
config.add_show_field solr_name('resource_type', :stored_searchable), label: 'Resource Type'
config.add_show_field solr_name('format', :stored_searchable)
config.add_show_field solr_name('identifier', :stored_searchable)
config.add_show_field 'ark_ssm', label: 'ARK'
config.add_show_field 'ark_ssi', label: 'ARK'

config.add_show_field solr_name('caption', :stored_searchable)
config.add_show_field solr_name('dimensions', :stored_searchable)
Expand Down Expand Up @@ -129,7 +129,7 @@ def self.modified_field
# solr request handler? The one set in config[:default_solr_parameters][:qt],
# since we aren't specifying it otherwise.
config.add_search_field('all_fields', label: 'All Fields') do |field|
search_fields = 'title_tesim subject_tesim named_subject_tesim location_tesim description_tesim caption_tesim identifier_tesim local_identifier_sim ark_sim normalized_date_tesim photographer_tesim'
search_fields = 'title_tesim subject_tesim named_subject_tesim location_tesim description_tesim caption_tesim identifier_tesim local_identifier_sim ark_ssi normalized_date_tesim photographer_tesim'

field.solr_parameters = {
qf: search_fields,
Expand Down
2 changes: 1 addition & 1 deletion app/importers/actor_record_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def find_existing_record(record)
return unless record.respond_to?(deduplication_field)
return if record.mapper.send(deduplication_field).nil?
return if record.mapper.send(deduplication_field).empty?
existing_records = import_type.where(ark_sim: record.mapper.send(deduplication_field).to_s)
existing_records = import_type.where(ark_ssi: record.mapper.send(deduplication_field).to_s)
raise "More than one record matches deduplication_field #{deduplication_field} with value #{record.mapper.send(deduplication_field)}" if existing_records.count > 1
existing_records&.first
end
Expand Down
2 changes: 1 addition & 1 deletion app/importers/californica_csv_cleaner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def clean
Work.where(identifier: ark).each do |work|
work&.destroy!
end
Work.where(ark_sim: ark).each do |work|
Work.where(ark_ssi: ark).each do |work|
work&.destroy!
end
end
Expand Down
13 changes: 9 additions & 4 deletions app/lib/californica/id_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@ module Californica
module IdGenerator
# Take an ark value and return a valid fedora identifier
def self.id_from_ark(ark)
split = ark.split('/')
shoulder = split[-2]
blade = split[-1]
"#{blade}-#{shoulder}"
id = ark.gsub(/ark:?\/?/, '')
shoulder, blade, extension = id.split('/')

# didn't see {shoulder}/{blade} format ark
raise ArgumentError, 'Could not parse ARK shoulder and blade' if blade.nil?
# looks like an extended ark
raise ArgumentError, 'ARK appears to have too many segments or extensions' unless extension.nil?

"#{shoulder}-#{blade}".reverse
end
end
end
2 changes: 1 addition & 1 deletion app/models/collection.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ class Collection < ActiveFedora::Base
# @param ark [String] The ARK
# @return [Collection] The Collection with that ARK
def self.find_by_ark(ark)
where(ark_sim: ark).limit(1).first
where(ark_ssi: ark).limit(1).first
end

# @param ark [String] The ARK
Expand Down
2 changes: 1 addition & 1 deletion app/models/solr_document.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class SolrDocument
use_extension(Hydra::ContentNegotiation)

def ark
self[:ark_ssm]
self[:ark_ssi]
end

def extent
Expand Down
2 changes: 1 addition & 1 deletion app/models/ucla_metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ module UclaMetadata

included do
property :ark, predicate: ::RDF::Vocab::DC11.identifier, multiple: false do |index|
index.as :displayable, :facetable
index.as :stored_sortable
end

property :extent, predicate: ::RDF::Vocab::DC11.format do |index|
Expand Down
2 changes: 1 addition & 1 deletion spec/importers/californica_importer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
it 'creates a new collection with a modified ark as the id' do
importer.import
new_collection = Collection.first
expect(new_collection.id).to eq "zz00294nz8-21198"
expect(new_collection.id).to eq "8zn49200zz-89112"
end
it 'creates a new collection and adds the work to it' do
expect(Collection.count).to eq 0
Expand Down
2 changes: 1 addition & 1 deletion spec/indexers/collection_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
let(:attributes) { { ark: 'ark:/123/456' } }

it 'add the fields to the solr document' do
expect(solr_document['ark_sim']).to eq ['ark:/123/456']
expect(solr_document['ark_ssi']).to eq 'ark:/123/456'
end
end
end
4 changes: 2 additions & 2 deletions spec/indexers/work_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@
let(:attributes) { { ark: 'ark:/123/456' } }

it 'indexes as a single value "string"' do
expect(solr_document['ark_sim']).to eq ['ark:/123/456']
expect(solr_document['ark_ssi']).to eq 'ark:/123/456'
end
end

describe 'ark' do
let(:attributes) { { ark: 'ark:/123/456' } }

it 'does not duplicate the "ark:/"' do
expect(solr_document['ark_sim']).to eq ['ark:/123/456']
expect(solr_document['ark_ssi']).to eq 'ark:/123/456'
end
end
end
25 changes: 24 additions & 1 deletion spec/lib/californica/id_generator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,31 @@

RSpec.describe Californica::IdGenerator, :clean do
let(:ark) { 'ark:/13030/t8dn9c0x' }
let(:extended_ark) { 'ark:/13030/t8dn9c0x/001' }
let(:old_school) { '13030/t8dn9c0x' }
let(:wonky_ark) { 'ark:13030/t8dn9c0x' }
let(:non_ark) { 'this is not an ark' }

it 'makes a fedora id from an ark' do
id = Californica::IdGenerator.id_from_ark(ark)
expect(id).to eq 't8dn9c0x-13030'
expect(id).to eq 'x0c9nd8t-03031'
end

it 'makes an id from arks without prefixes' do
id = Californica::IdGenerator.id_from_ark(old_school)
expect(id).to eq 'x0c9nd8t-03031'
end

it 'has relaxed rules about prefixes' do
id = Californica::IdGenerator.id_from_ark(wonky_ark)
expect(id).to eq 'x0c9nd8t-03031'
end

it 'raises an error for arks with extensions' do
expect { Californica::IdGenerator.id_from_ark(extended_ark) }.to raise_error(ArgumentError, /ARK/)
end

it 'raises an error for arks that do not have a shoulder & blade' do
expect { Californica::IdGenerator.id_from_ark(non_ark) }.to raise_error(ArgumentError, /ARK/)
end
end
2 changes: 1 addition & 1 deletion spec/system/create_work_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
expect(page).to have_content("ark:/abc/123")
expect(page).to have_content("Your files are being processed")
work = Work.last
expect(work.id).to eq '123-abc'
expect(work.id).to eq '321-cba'
end
end
end
2 changes: 1 addition & 1 deletion spec/system/import_and_show_work_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
it "displays expected fields on show work page" do
importer.import
work = Work.last
expect(work.id).to eq "hb338nb26f-13030"
expect(work.id).to eq "f62bn833bh-03031"
visit("/concern/works/#{work.id}")
expect(page).to have_content "Communion at Plaza Church, Los Angeles, 1942-1952" # title
expect(page).to have_content "ark:/13030/hb338nb26f" # ark
Expand Down
2 changes: 1 addition & 1 deletion spec/system/new_collection_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
expect(find_field('Ark').value).to eq ark
expect(page).to have_content 'Collection was successfully created.'
collection = Collection.last
expect(collection.id).to eq '1234-abc'
expect(collection.id).to eq '4321-cba'
end
end
end

0 comments on commit adb1cf5

Please sign in to comment.