forked from traject/traject
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransformation.rb
162 lines (147 loc) · 5.81 KB
/
transformation.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
module Traject
module Macros
# Macros intended to be mixed into an Indexer and used in config
# as second or further args to #to_field, to transform existing accumulator values.
#
# They have the same form as any proc/block passed to #to_field, but
# operate on an existing accumulator, intended to be used as non-first-step
# transformations.
#
# Some of these are extracted from extract_marc options, so they can be
# used with any first-step extract methods. Some informed by current users.
module Transformation
# Maps all values on accumulator through a Traject::TranslationMap.
#
# A Traject::TranslationMap is a hash-like mapping from input to output, usually
# defined in a yaml or dot-properties file, which can be looked up in load path
# with a file name as arg. See [Traject::TranslationMap](../translation_map.rb)
# header comments for details.
#
# Using this macro, you can pass in one TranslationMap initializer arg, but you can
# also pass in multiple, and they will be merged into each other (last one last), so
# you can use this to apply over-rides: Either from another on-disk map, or even from
# an inline hash (since a Hash is a valid TranslationMap initialization arg too).
#
# @param translation_map_specifier one or more arguments, each passed on its own
#   to Traject::TranslationMap.new
# @return [Proc] a two-argument lambda (record, accumulator) that calls
#   `translate_array!` on the merged map with the accumulator
# @raise [ArgumentError] if no specifier is given at all
#
# @example
#     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a")
#
# @example with override
#     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a")
#
# @example with multiple overrides, including local hash
#     to_field "cataloging_agency", extract_marc("040a"), translation_map("marc_040a", "local_marc_040a", {"DLC" => "U.S. LoC"})
def translation_map(*translation_map_specifier)
  # Without this guard, reduce(:merge) on an empty list returns nil and the
  # failure only surfaces later, as a NoMethodError for every record indexed.
  if translation_map_specifier.empty?
    raise ArgumentError, "Needs at least one translation map specifier"
  end

  # Build one map per specifier, then fold them left-to-right with #merge.
  translation_map = translation_map_specifier.
    collect { |spec| Traject::TranslationMap.new(spec) }.
    reduce(:merge)

  lambda do |rec, acc|
    translation_map.translate_array! acc
  end
end
# Transforms every value already in the accumulator, in place, by running it
# through a callable you supply: a proc/lambda argument, a block, or both.
# (Ie, the accumulator is rewritten with `#map!`/`#collect!` using your callable(s)).
#
# Because of ruby syntax precedence, the block form is rarely handy in traject
# config files, except via the `&:` trick.
#
# A "stabby lambda" is a convenient way to pass an explicit proc argument.
#
# When both a proc argument and a block are given, the proc argument runs
# first and the block receives its result.
#
# @param a_proc [#call, nil] optional callable applied to each value
# @param block optional block applied to each value (after a_proc, if both given)
# @return [Proc] a two-argument lambda (record, accumulator)
# @raise [ArgumentError] if neither a proc argument nor a block is supplied
#
# @example
#     to_field "something", extract_marc("something"), transform(&:upcase)
#
# @example
#     to_field "something", extract_marc("something"), transform(->(val) { val.tr('^a-z', "\uFFFD") })
def transform(a_proc=nil, &block)
  if !a_proc && !block
    raise ArgumentError, "Needs a transform proc arg or block arg"
  end

  # Keep whichever callables were actually supplied, proc argument first.
  first_step, second_step = [a_proc, block].select { |callable| callable }

  apply = if second_step
    # Both given: chain them, feeding the proc's result to the block.
    ->(value) { second_step.call(first_step.call(value)) }
  else
    first_step
  end

  ->(_record, accumulator) { accumulator.map! { |value| apply.call(value) } }
end
# Supplies a fallback: if the accumulator is empty when this step runs, the
# given literal is added to it. A non-empty accumulator is left untouched.
#
# @param default_value the value to add when nothing else was accumulated
# @return [Proc] a two-argument lambda (record, accumulator)
#
# @example
#     to_field "title", extract_marc("245abc"), default("Unknown Title")
def default(default_value)
  ->(_record, accumulator) do
    accumulator.push(default_value) if accumulator.empty?
  end
end
# Keeps only the first value in the accumulator, discarding any that follow.
# An accumulator with zero or one values is left as it was.
#
# @return [Proc] a two-argument lambda (record, accumulator)
#
# @example
#     to_field "main_author", extract_marc("100"), first_only
def first_only
  ->(_record, accumulator) do
    # slice! with a range removes everything from index 1 onward, in place
    accumulator.slice!(1..-1)
  end
end
# De-duplicates the accumulator in place using ruby `uniq!`, so each distinct
# value remains only once.
#
# @return [Proc] a two-argument lambda (record, accumulator)
#
# @example
#     to_field "something", extract_marc("245:240"), unique
def unique
  ->(_record, accumulator) { accumulator.uniq! }
end
# Strips leading and trailing whitespace from every value in the accumulator.
# Like ruby #strip, but Unicode-aware: it matches the `[[:space:]]` character
# class rather than only ASCII whitespace.
#
# @return [Proc] a two-argument lambda (record, accumulator)
#
# @example
#     to_field "title", extract_marc("245"), strip
def strip
  # unicode whitespace class aware
  leading_space  = /\A[[:space:]]+/
  trailing_space = /[[:space:]]+\Z/

  ->(_record, accumulator) do
    accumulator.map! do |value|
      without_leading = value.sub(leading_space, '')
      without_leading.sub(trailing_space, '')
    end
  end
end
# Splits each accumulator value with ruby `split` on the given separator and
# replaces the accumulator contents with the flattened list of all pieces.
# The output will usually hold more values than the input, since each input
# value may be broken up into several.
#
# @param separator the separator passed through to `split` for each value
# @return [Proc] a two-argument lambda (record, accumulator)
def split(separator)
  ->(_record, accumulator) do
    pieces = accumulator.flat_map { |value| value.split(separator) }
    accumulator.replace(pieces)
  end
end
# Adds the given suffix onto the end of every value in the accumulator.
#
# @param suffix the value added (with `+`) after each accumulator value
# @return [Proc] a two-argument lambda (record, accumulator)
def append(suffix)
  ->(_record, accumulator) do
    accumulator.map! { |value| value + suffix }
  end
end
# Adds the given prefix onto the front of every value in the accumulator.
#
# @param prefix the value placed (with `+`) before each accumulator value
# @return [Proc] a two-argument lambda (record, accumulator)
def prepend(prefix)
  ->(_record, accumulator) do
    accumulator.map! { |value| prefix + value }
  end
end
# Applies ruby `gsub` to every value in the accumulator, using the pattern
# and replacement supplied, and stores the results back in the accumulator.
#
# @param pattern the pattern passed as the first argument to `gsub`
# @param replace the replacement passed as the second argument to `gsub`
# @return [Proc] a two-argument lambda (record, accumulator)
def gsub(pattern, replace)
  ->(_record, accumulator) do
    accumulator.map! { |value| value.gsub(pattern, replace) }
  end
end
end
end
end