Skip to content

Commit d7cb514

Browse files
committed
Add RSpec::Support::EncodedString
ref: rspec/rspec-support#249
1 parent fecd744 commit d7cb514

File tree

3 files changed

+448
-9
lines changed

3 files changed

+448
-9
lines changed

lib/encoded_string.rb

+153-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,155 @@
1-
require "encoded_string/version"
1+
module RSpec
  module Support
    # Wraps a String and keeps it in one target encoding. The wrapped string,
    # and anything later appended to it or used to split it, is transcoded to
    # the encoding given at construction; bytes that cannot be converted are
    # replaced with REPLACE rather than raising.
    #
    # @private
    class EncodedString
      # Reduce allocations by storing constants.
      UTF_8    = "UTF-8".freeze
      US_ASCII = "US-ASCII".freeze

      #
      # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
      # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
      # https://www.ruby-forum.com/topic/6861247
      # https://twitter.com/nalsh/status/553413844685438976
      #
      # For example, given:
      #   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
      #
      # On MRI 2.1 or above: 63  # '?'
      # else               : 128 # "\x80"
      #
      # Ruby's default replacement string is:
      #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
      #   ?      ("\x3F")
      REPLACE = "?".freeze

      # The two option sets below are kept so existing references to the
      # constants keep resolving. They are no longer handed to String#encode:
      # a Hash passed positionally is not treated as options once keyword
      # arguments are separated from positional ones (Ruby 3), so the options
      # are written inline at the call sites instead.
      ENCODE_UNCONVERTABLE_BYTES = {
        :invalid => :replace,
        :undef   => :replace,
        :replace => REPLACE
      }.freeze
      ENCODE_NO_CONVERTER = {
        :invalid => :replace,
        :replace => REPLACE
      }.freeze

      # @param string [String] the text to wrap
      # @param encoding [String, Encoding, nil] the target encoding
      #
      # NOTE(review): a nil encoding is handed straight to String#encode;
      # confirm that every caller supplies one.
      def initialize(string, encoding=nil)
        @encoding = encoding
        @source_encoding = detect_source_encoding(string)
        @string = matching_encoding(string)
      end

      # Encoding of the string as it was handed to the constructor.
      attr_reader :source_encoding

      # Forward the read-only String methods that exist on this Ruby.
      delegated_methods = String.instance_methods.map(&:to_s) & %w[eql? lines == encoding empty?]
      delegated_methods.each do |name|
        define_method(name) { |*args, &block| @string.__send__(name, *args, &block) }
      end

      # Appends +string+ after converting it to the target encoding.
      #
      # @return [String] the underlying (mutated) string
      def <<(string)
        @string << matching_encoding(string)
      end

      # Splits the wrapped string.
      #
      # A String separator is first converted to the target encoding so the
      # split cannot fail on an encoding mismatch. A Regexp is passed through
      # untouched, because it cannot be scrubbed or transcoded.
      #
      # @param regex_or_string [Regexp, String] the separator
      # @return [Array<String>]
      def split(regex_or_string)
        separator =
          if regex_or_string.is_a?(Regexp)
            regex_or_string
          else
            matching_encoding(regex_or_string)
          end
        @string.split(separator)
      end

      # @return [String] the wrapped string in the target encoding
      def to_s
        @string
      end
      alias :to_str :to_s

      if String.method_defined?(:encoding)

        # Chooses an encoding usable for both sources: the one Ruby reports
        # as compatible with both, else the default external encoding.
        def self.pick_encoding(source_a, source_b)
          Encoding.compatible?(source_a, source_b) || Encoding.default_external
        end

        private

        # Encoding Exceptions:
        #
        # Raised by Encoding and String methods:
        #   Encoding::UndefinedConversionError:
        #     when a transcoding operation fails
        #     if the String contains characters invalid for the target encoding
        #     e.g. "\x80".encode('UTF-8','ASCII-8BIT')
        #     vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
        #     # => '<undef>'
        #   Encoding::CompatibilityError
        #     when Encoding.compatible?(str1, str2) is nil
        #     e.g. utf_16le_emoji_string.split("\n")
        #     e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
        #   Encoding::InvalidByteSequenceError:
        #     when the string being transcoded contains a byte invalid for
        #     either the source or target encoding
        #     e.g. "\x80".encode('UTF-8','US-ASCII')
        #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
        #     # => '<byte>'
        #   ArgumentError
        #     when operating on a string with invalid bytes
        #     e.g. "\x80".split("\n")
        #   TypeError
        #     when a symbol is passed as an encoding
        #     Encoding.find(:"UTF-8")
        #     when calling force_encoding on an object
        #     that doesn't respond to #to_str
        #
        # Raised by transcoding methods:
        #   Encoding::ConverterNotFoundError:
        #     when a named encoding does not correspond with a known converter
        #     e.g. 'abc'.force_encoding('UTF-8').encode('foo')
        #     or a converter path cannot be found
        #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
        #
        # Raised by byte <-> char conversions
        #   RangeError: out of char range
        #     e.g. the UTF-16LE emoji: 128169.chr
        #
        # Converts +string+ to the target encoding, first trying a strict
        # transcode and then falling back to replacing whatever cannot be
        # converted. The options are spelled inline (no braces, no constant)
        # so they are passed as encode options on every Ruby version.
        def matching_encoding(string)
          string = remove_invalid_bytes(string)
          string.encode(@encoding)
        rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
          string.encode(@encoding, :invalid => :replace, :undef => :replace, :replace => REPLACE)
        rescue Encoding::ConverterNotFoundError
          # No conversion path exists: relabel a copy with the target
          # encoding and replace the bytes that are invalid in it.
          string.dup.force_encoding(@encoding).encode(:invalid => :replace, :replace => REPLACE)
        end

        # Prevents raising ArgumentError
        if String.method_defined?(:scrub)
          # https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
          # https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
          # https://github.com/hsbt/string-scrub
          # https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
          def remove_invalid_bytes(string)
            string.scrub(REPLACE)
          end
        else
          # http://stackoverflow.com/a/8711118/879854
          # Loop over chars in a string replacing chars
          # with invalid encoding, which is a pretty good proxy
          # for the invalid byte sequence that causes an ArgumentError
          def remove_invalid_bytes(string)
            string.chars.map do |char|
              char.valid_encoding? ? char : REPLACE
            end.join
          end
        end

        def detect_source_encoding(string)
          string.encoding
        end

      else

        # Strings carry no encoding on this Ruby, so there is nothing to pick.
        def self.pick_encoding(_source_a, _source_b)
        end

        private

        # Without String#encoding no conversion is possible; use as-is.
        def matching_encoding(string)
          string
        end

        def detect_source_encoding(_string)
          US_ASCII
        end
      end
    end
  end
end
+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
require 'rspec/matchers'

# Special matcher for comparing encoded strings so that
# we don't run any expectation failures through the Differ,
# which also relies on EncodedString. Instead, confirm the
# strings have the same bytes.
#
# Chain `.with_same_encoding` to additionally require that both
# strings report the same encoding.
RSpec::Matchers.define :be_identical_string do |expected|
  if String.method_defined?(:encoding)
    match do
      expected_encoding? &&
        actual.bytes.to_a == expected.bytes.to_a
    end

    # The last literal deliberately has no trailing backslash: a line
    # continuation there would splice the block's `end` onto the string.
    failure_message do
      "expected\n#{actual.inspect} (#{actual.encoding.name}) to be identical to\n"\
      "#{expected.inspect} (#{expected.encoding.name})\n"\
      "The exact bytes are printed below for more detail:\n"\
      "#{actual.bytes.to_a}\n"\
      "#{expected.bytes.to_a}\n"
    end

    # Depends on chaining :with_same_encoding for it to
    # check for string encoding.
    def expected_encoding?
      # The ivar only exists once the chain has been called.
      return true unless defined?(@expect_same_encoding) && @expect_same_encoding

      actual.encoding == expected.encoding
    end
  else
    # Strings carry no encoding on this Ruby; compare character by character.
    match do
      actual.split(//) == expected.split(//)
    end

    failure_message do
      "expected\n#{actual.inspect} to be identical to\n#{expected.inspect}\n"
    end
  end

  chain :with_same_encoding do
    @expect_same_encoding = true
  end
end
RSpec::Matchers.alias_matcher :a_string_identical_to, :be_identical_string
RSpec::Matchers.alias_matcher :be_diffed_as, :be_identical_string

0 commit comments

Comments
 (0)