Skip to content

Commit d577074

Browse files
committed
Added valid_utf8(...) function.
1 parent f1e9570 commit d577074

File tree

3 files changed

+96
-1
lines changed

3 files changed

+96
-1
lines changed

SpecUtils/StringAlgo.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,11 @@ namespace SpecUtils
224224
size_t utf8_str_size_limit( const char * const str,
225225
size_t num_in_bytes, const size_t max_bytes );
226226

227-
227+
/** Returns whether the passed-in string is valid UTF-8.
228+
@param str The string to evaluate.
229+
@param num_in_bytes The length of the string in bytes - must not include the (optional) null-terminator.
230+
*/
231+
bool valid_utf8( const char * const str, const size_t num_in_bytes );
228232

229233

230234
/** \brief parses a string of ascii characters to their floating point value.

src/StringAlgo.cpp

+36
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,42 @@ namespace SpecUtils
10901090
}
10911091

10921092

1093+
/** Checks whether a byte buffer is well-formed UTF-8 (per RFC 3629).

 Beyond checking the lead-byte / continuation-byte bit patterns, this rejects:
 - overlong encodings (lead bytes 0xC0 and 0xC1, 0xE0 followed by 0x80-0x9F,
   0xF0 followed by 0x80-0x8F),
 - UTF-16 surrogate code points U+D800-U+DFFF (0xED followed by 0xA0-0xBF),
 - code points above U+10FFFF (0xF4 followed by 0x90-0xBF, lead bytes 0xF5+),
 - sequences truncated by the end of the buffer.

 @param str The bytes to evaluate; embedded zero bytes are treated as the
        ordinary one-byte character U+0000.
 @param num_in_bytes The number of bytes to examine - must not include the
        (optional) null-terminator.
 @returns true if all `num_in_bytes` bytes form well-formed UTF-8 (an empty
          input is valid); false otherwise.  A null `str` is only considered
          valid when `num_in_bytes` is zero.
 */
bool valid_utf8( const char * const str, const size_t num_in_bytes )
{
  if( !str )
    return (num_in_bytes == 0);

  // Number of continuation bytes still owed by the current multi-byte character.
  int bytesToProcess = 0;

  // Inclusive range the *next* continuation byte must fall in.  Only the first
  // continuation byte after certain lead bytes has a narrowed range; every
  // other continuation byte uses the generic 0x80-0xBF.
  uint8_t min_next = 0x80, max_next = 0xBF;

  for( size_t i = 0; i < num_in_bytes; ++i )
  {
    const uint8_t c = static_cast<uint8_t>( str[i] );

    if( bytesToProcess == 0 )
    {
      // Start of a new character: classify the lead byte.
      min_next = 0x80;
      max_next = 0xBF;

      if( c <= 0x7F )
        continue; // 1-byte character (ASCII)

      if( (c >= 0xC2) && (c <= 0xDF) )
      {
        // 2-byte character; 0xC0 and 0xC1 could only encode overlong ASCII
        bytesToProcess = 1;
      }else if( (c >= 0xE0) && (c <= 0xEF) )
      {
        bytesToProcess = 2; // 3-byte character
        if( c == 0xE0 )
          min_next = 0xA0;  // below this would be an overlong encoding
        else if( c == 0xED )
          max_next = 0x9F;  // above this would be a UTF-16 surrogate
      }else if( (c >= 0xF0) && (c <= 0xF4) )
      {
        bytesToProcess = 3; // 4-byte character
        if( c == 0xF0 )
          min_next = 0x90;  // below this would be an overlong encoding
        else if( c == 0xF4 )
          max_next = 0x8F;  // above this would exceed U+10FFFF
      }else
      {
        // Stray continuation byte (0x80-0xBF), overlong lead (0xC0, 0xC1),
        //  or a lead byte that can not appear in UTF-8 (0xF5-0xFF).
        return false;
      }
    }else
    {
      // Expecting a continuation byte within the currently allowed range
      if( (c < min_next) || (c > max_next) )
        return false;

      // Any further continuation bytes of this character are unrestricted
      min_next = 0x80;
      max_next = 0xBF;

      bytesToProcess--;
      assert( bytesToProcess >= 0 );
    }
  }//for( size_t i = 0; i < num_in_bytes; ++i )

  // A non-zero count here means the input ended in the middle of a character.
  return (bytesToProcess == 0);
}//bool valid_utf8( const char * const str, size_t num_in_bytes )
1127+
1128+
10931129
template <class T>
10941130
bool split_to_integral_types( const char *input, const size_t length,
10951131
std::vector<T> &results )

unit_tests/test_string_functions.cpp

+55
Original file line numberDiff line numberDiff line change
@@ -823,3 +823,58 @@ TEST_CASE( "testPrintCompact" )
823823
check_range(-1000000.0,10000000.0);
824824
check_range(-1.0E32,1.0E32);
825825
}//void testPrintCompact()
826+
827+
828+
TEST_CASE( "testValidUtf8" )
{
  using namespace SpecUtils;

  // Inputs that must be accepted.
  const char *accepted_inputs[] = {
    "Hello, World!",         // Plain ASCII
    "Привет",                // Cyrillic (2-byte characters)
    "こんにちは",             // Japanese (3-byte characters)
    "😊",                    // Emoji (4-byte character)
    "\xE2\x9C\x94",          // U+2714, heavy check mark
    "\xF0\x9F\x98\x81",      // U+1F601, grinning face with smiling eyes
    "\xE2\x82\xAC",          // U+20AC, euro sign
    "\xF0\x9F\x8C\x90",      // U+1F310, globe with meridians
    "\xF0\x9F\x92\xA9",      // U+1F4A9, pile of poo
  };

  for( const char * const str : accepted_inputs )
    CHECK(valid_utf8(str, std::strlen(str)));

  // Inputs that must be rejected.
  const char *rejected_inputs[] = {
    "\x80",                      // Continuation byte with no lead byte
    "\xC3\x28",                  // 2-byte lead followed by ASCII
    "\xE2\x82\x28",              // 3-byte character, bad final byte
    "\xF0\x28\x8C\x28",          // 4-byte character, bad second byte
    "\xF0\x9F\x98\x28",          // 4-byte character, bad final byte
    "\xC3\xA9\xC3\x28",          // Valid 2-byte character, then a bad one
    "\xE2\x82\xAC\xE2\x28",      // Valid 3-byte character, then a bad one
    "\xF0\x9F\x92\xA9\xF0\x28",  // Valid 4-byte character, then a bad one
  };

  for( const char * const str : rejected_inputs )
    CHECK(!valid_utf8(str, std::strlen(str)));

  // Boundary conditions: empty input, one complete 2-byte character, and
  //  multi-byte characters cut short by the stated length.
  CHECK(valid_utf8("", 0));
  CHECK(valid_utf8("\xC2\xA9", 2));            // U+00A9, copyright sign
  CHECK(!valid_utf8("\xC2", 1));               // 2-byte character missing 1 byte
  CHECK(!valid_utf8("\xE2\x82", 2));           // 3-byte character missing 1 byte
  CHECK(!valid_utf8("\xF0\x9F\x98", 3));       // 4-byte character missing 1 byte

  // Long all-ASCII buffer is accepted
  std::string large_valid_utf8(10000, 'a');
  CHECK(valid_utf8(large_valid_utf8.c_str(), large_valid_utf8.size()));

  // Long buffer made only of stray continuation bytes is rejected
  std::string large_invalid_utf8(10000, '\x80');
  CHECK(!valid_utf8(large_invalid_utf8.c_str(), large_invalid_utf8.size()));
}//TEST_CASE( "testValidUtf8" )

0 commit comments

Comments
 (0)