Skip to content

Commit 713e67b

Browse files
fix: Replace invalid UTF-8 characters in doc comments (#453)
The types for doc comments in our Protobuf code are strings, which means the contents must be valid UTF-8. However, we were not validating the contents before storing them. This PR adds a validation step; if validation fails, we substitute the invalid byte sequences with the standard Unicode replacement character (U+FFFD). The utfcpp library was chosen because it performs well in [benchmarks](https://thephd.dev/the-c-c++-rust-string-text-encoding-api-landscape) and has an API that is very easy to use for our purposes.
1 parent 12ab13c commit 713e67b

File tree

5 files changed

+27
-0
lines changed

5 files changed

+27
-0
lines changed

fetch_deps.bzl

+9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ _WYHASH_COMMIT = "ea3b25e1aef55d90f707c3a292eeb9162e2615d8"
1212
_SPDLOG_COMMIT = "edc51df1bdad8667b628999394a1e7c4dc6f3658"
1313
_PROTOBUF_VERSION = "3.21.12"
1414
_SCIP_COMMIT = "aa0e511dcfefbacc3b96dcc2fe2abd9894416b1e"
15+
_UTFCPP_VERSION = "4.0.5"
1516
# ^ When bumping this version, check if any new fields are introduced
1617
# in the types for which we implement hashing and comparison in
1718
# indexer/ScipExtras.{h,cc}
@@ -234,3 +235,11 @@ def fetch_direct_dependencies():
234235
strip_prefix = "rules_python-%s" % _RULES_PYTHON_VERSION,
235236
url = "https://github.com/bazelbuild/rules_python/releases/download/{0}/rules_python-{0}.tar.gz".format(_RULES_PYTHON_VERSION),
236237
)
238+
239+
http_archive(
240+
name = "utfcpp",
241+
sha256 = "ffc668a310e77607d393f3c18b32715f223da1eac4c4d6e0579a11df8e6b59cf",
242+
build_file = "@scip_clang//third_party:utfcpp.BUILD",
243+
strip_prefix = "utfcpp-%s" % _UTFCPP_VERSION,
244+
url = "https://github.com/nemtrif/utfcpp/archive/refs/tags/v{0}.tar.gz".format(_UTFCPP_VERSION),
245+
)

indexer/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ cc_library(
3737
"@llvm-project//clang:frontend",
3838
"@llvm-project//clang:tooling",
3939
"@scip",
40+
"@utfcpp",
4041
],
4142
)
4243

indexer/Indexer.cc

+8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "absl/strings/strip.h"
1111
#include "perfetto/perfetto.h"
1212
#include "spdlog/fmt/fmt.h"
13+
#include "utfcpp/utf8.h"
1314

1415
#include "clang/AST/ASTContext.h"
1516
#include "clang/AST/CXXInheritance.h"
@@ -315,6 +316,13 @@ void DocComment::addTo(std::string &slot) {
315316
if (stripped.empty()) {
316317
return;
317318
}
319+
if (!utf8::is_valid(stripped)) {
320+
slot.clear();
321+
utf8::replace_invalid(stripped.begin(), stripped.end(),
322+
std::back_inserter(slot));
323+
this->contents.clear();
324+
return;
325+
}
318326
if (stripped.size() == this->contents.size()) {
319327
slot = std::move(this->contents);
320328
return;

indexer/ScipExtras.cc

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "absl/algorithm/container.h"
1010
#include "absl/functional/function_ref.h"
1111
#include "perfetto/perfetto.h"
12+
#include "utfcpp/utf8.h"
1213

1314
#include "llvm/Support/Path.h"
1415
#include "llvm/Support/StringSaver.h"

third_party/utfcpp.BUILD

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
cc_library(
2+
name = "utfcpp",
3+
includes = ["source"],
4+
include_prefix = "utfcpp",
5+
strip_include_prefix = "source",
6+
hdrs = glob(["source/**/*.h"], allow_empty=False),
7+
visibility = ["//visibility:public"],
8+
)

0 commit comments

Comments
 (0)