Skip to content

Commit 1606357

Browse files
author
trizen
committed
new file: Text/boyer-moore_string_search_algorithm.sf
1 parent 423c796 commit 1606357

File tree

2 files changed

+176
-0
lines changed

2 files changed

+176
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ A simple collection of Sidef scripts.
178178
* Term
179179
* [Draw a clock](./Term/draw_a_clock.sf)
180180
* Text
181+
* [Boyer-Moore string search algorithm](./Text/boyer-moore_string_search_algorithm.sf)
181182
* [Jaro distance](./Text/jaro_distance.sf)
182183
* [Longest common prefix](./Text/longest_common_prefix.sf)
183184
* [Longest common subsequence](./Text/longest_common_subsequence.sf)
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
#!/usr/bin/ruby
2+
3+
#
4+
## https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
5+
#
6+
7+
# Translation of the Python example.
8+
9+
# Returns the number of consecutive positions at which the two substrings
# of S beginning at idx1 and idx2 agree (comparison stops at the end of S).
func match_length(Array S, idx1, idx2) {

    # Same starting position: the substring matches itself up to the end of S.
    if (idx1 == idx2) {
        return (S.len - idx1)
    }

    var size   = S.len
    var offset = 0      # how far both cursors have advanced so far

    # Advance both cursors in lock-step until either one leaves S
    # or the elements under them differ.
    while ((idx1+offset < size) && (idx2+offset < size) && (S[idx1+offset] == S[idx2+offset])) {
        ++offset
    }

    return offset
}
23+
24+
# Computes the Z-array of S: z[i] is the length of the longest run of
# elements starting at position i that equals a prefix of S (z[0] = S.len).
func fundamental_preprocess(Array S) {

    var n = S.len

    if (n == 0) {       # empty input
        return []
    }

    if (n == 1) {       # single element
        return [1]
    }

    var z = n.of(0)
    z[0] = n
    z[1] = match_length(S, 0, 1)

    # When S begins with a run of equal elements, the following entries
    # can be filled in directly from z[1] (optimization from exercise 1-5).
    var run = z[1]
    for i in (2 .. run) {
        z[i] = (run - i + 1)
    }

    # Left and right limits of the current z-box
    var box_lo = 0
    var box_hi = 0

    for i in (2+run .. S.end) {
        if (i > box_hi) {
            # i lies outside the current z-box: compare against the prefix directly
            var span = match_length(S, 0, i)
            z[i] = span
            if (span > 0) {
                box_lo = i
                box_hi = (i + span - 1)
            }
        }
        else {
            # i lies inside the current z-box
            var mirrored  = z[i - box_lo]        # value at the corresponding prefix position
            var remaining = (box_hi - i + 1)     # elements left in the box, starting at i

            if (mirrored < remaining) {
                # The mirrored match ends strictly inside the box
                z[i] = mirrored
            }
            else {
                # The mirrored match reaches the end of the box: continue
                # matching explicitly to the right of the box
                var span = (remaining + match_length(S, remaining, box_hi+1))
                z[i] = span
                box_lo = i
                box_hi = (i + span - 1)
            }
        }
    }

    return z
}
71+
72+
# Builds the bad-character table for S, whose elements are used as indices
# into a 256-entry alphabet.
#
# The result has 256 rows. Row c starts with -1 and gains one entry per
# element of S, so table[c][i] is the index of the last occurrence of c
# among the first i elements of S, or -1 if there is none.
# For an empty S, every row is an empty array.
func bad_character_table(Array S) {

    if (S.len == 0) {
        return 256.of { [] }
    }

    var table     = 256.of { [-1] }
    var last_seen = 256.of(-1)      # last index seen so far for each symbol

    for pos in (0 .. S.end) {
        last_seen[S[pos]] = pos

        # Snapshot the current "last seen" index of every symbol
        for sym in (0 .. last_seen.end) {
            table[sym].append(last_seen[sym])
        }
    }

    return table
}
91+
92+
# Builds the table L used by the strong good-suffix rule.
#
# L[i] is the largest index j < S.end such that the suffix of S starting
# at i also occurs in S ending at index j; L[i] is -1 when there is no
# such earlier occurrence.
func good_suffix_table(S) {
    var L = S.len.of(-1)

    # N[j] = length of the longest suffix of S[0..j] that is also a suffix of S,
    # obtained from the Z-array of the reversed input.
    var N = fundamental_preprocess(S.flip).flip

    # The last index is deliberately excluded: N[S.end] is always S.len
    # (the whole string matches itself), which would record the trivial
    # occurrence as L[0] = S.end.
    for j in (0 .. (S.end - 1)) {
        var i = (S.len - N[j])
        if (i != S.len) {
            L[i] = j
        }
    }

    return L
}
105+
106+
# Builds the table F used when the matched suffix does not reoccur in S:
# F[i] is the length of the longest suffix of S[i..] that is also a prefix of S.
func full_shift_table(S) {
    var Z = fundamental_preprocess(S)
    var F = S.len.of(0)

    var best = 0

    # Walk the Z-array from its last entry to its first. A Z-value equal
    # to the distance from the end of S means the suffix starting there
    # is also a prefix of S.
    for i in (0 .. Z.end) {
        var zv = Z[Z.end - i]

        if (zv == i+1) {
            best = zv.max(best)
        }

        F[-i - 1] = best
    }

    return F
}
118+
119+
# Boyer-Moore search for every occurrence of P in T.
#
# P and T are arrays whose elements are used as indices into the 256-entry
# bad-character table. Returns the array of starting indices in T at which
# P occurs; returns [] when P or T is empty or when T is shorter than P.
func string_search(Array P, Array T) {

    if ((P.len == 0) || (T.len == 0) || (T.len < P.len)) {
        return []
    }

    var matches = []

    # Preprocessing
    var R = bad_character_table(P)
    var L = good_suffix_table(P)
    var F = full_shift_table(P)

    var k = P.end           # Represents alignment of end of P relative to T
    var previous_k = -1     # Represents alignment in previous phase (Galil's rule)

    while (k < T.len) {
        var i = P.end       # Character to compare in P
        var h = k           # Character to compare in T

        # Matches starting from end of P; stops early at previous_k because
        # everything at or before it is already known to match (Galil's rule)
        while ((i >= 0) && (h > previous_k) && (P[i] == T[h])) {
            --i
            --h
        }

        if ((i == -1) || (h == previous_k)) {   # Match has been found (Galil's rule)
            matches.append(k - P.len + 1)
            k += (P.len > 1 ? (P.len - F[1]) : 1)
        }
        else {  # No match, shift by max of bad character and good suffix rules
            var char_shift = (i - R[T[h]][i])

            var suffix_shift = given (i+1) { |t|
                case (t == P.len) { 1 }                 # Mismatch happened on first attempt
                case (L[t] == -1) { P.len - F[t] }      # Matched suffix does not appear anywhere in P
                # Matched suffix appears in P, ending at 0-based index L[t].
                # Moving that index under the current end of P is a shift of
                # (P.len - 1 - L[t]); using P.len - L[t] over-shifts by one
                # and skips matches (e.g. "abab" in "xaabab").
                default { P.len - 1 - L[t] }
            }

            var shift = (char_shift `max` suffix_shift)
            previous_k = k if (shift >= i+1)    # Galil's rule
            k += shift
        }
    }

    return matches
}
165+
166+
# String front-end: searches for pattern P in text T by running the
# array-based search on the byte sequences of both strings.
# NOTE(review): the returned positions index into T.bytes, so they are
# presumably byte offsets rather than character offsets for non-ASCII text.
func string_search(String P, String T) {
    var pattern_bytes = P.bytes
    var text_bytes    = T.bytes
    return string_search(pattern_bytes, text_bytes)
}
169+
170+
# Example searches as [pattern, text] pairs; the expected output of each
# search is shown next to it.
[
    ["bar",      "foobartestbarend"],            #=> [3, 10]
    ["bar",      "foo-bar-test-bar-end"],        #=> [4, 13]
    ["Bar",      "foo-bar-test-Bar-end-Bar"],    #=> [13, 21]
    ["-test-",   "foo-bar-test-Bar-end-Bar"],    #=> [7]
    ["-Bar-end", "foo-Bar-test-Bar-end-Bar"],    #=> [12]
    ["-Bar",     "foo-Bar-test-Bar-end-Bar"],    #=> [3, 12, 20]
].each { |example|
    say(string_search(example[0], example[1]))
}

0 commit comments

Comments
 (0)