Skip to content

Commit 79e2e88

Browse files
No longer remove surrogates
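Some Unicode characters (those outside the Basic Multilingual Plane) can only be represented in Java's UTF-16 strings using surrogate pairs. By replacing them we lose most emojis and similar characters. If we ever encounter actual unpaired surrogates in code, we will need a different fix.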
1 parent b823397 commit 79e2e88

File tree

1 file changed

+2
-21
lines changed

1 file changed

+2
-21
lines changed

codepropertygraph/src/main/scala/io/shiftleft/utils/IOUtils.scala

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ import scala.util.Using
1010

1111
object IOUtils {
1212

13-
private val surrogatePattern: Pattern = Pattern.compile("[^\u0000-\uffff]")
14-
1513
private val boms: Set[Char] = Set(
1614
'\uefbb', // UTF-8
1715
'\ufeff', // UTF-16 (BE)
@@ -39,27 +37,10 @@ object IOUtils {
3937
}
4038
}
4139

42-
/** Java strings are stored as sequences of 16-bit chars, but what they represent is sequences of unicode characters.
43-
* In unicode terminology, they are stored as code units, but model code points. Thus, it's somewhat meaningless to
44-
* talk about removing surrogates, which don't exist in the character / code point representation (unless you have
45-
* rogue single surrogates, in which case you have other problems). Rather, what you want to do is to remove any
46-
* characters which will require surrogates when encoded. That means any character which lies beyond the basic
47-
* multilingual plane. You can do that with a simple regular expression.
48-
*/
49-
private def replaceUnpairedSurrogates(input: String): String = {
50-
val matches = surrogatePattern.matcher(input)
51-
if (matches.find()) {
52-
val size = matches.end() - matches.start()
53-
matches.replaceAll("?" * size)
54-
} else {
55-
input
56-
}
57-
}
58-
5940
private def contentFromBufferedSource(bufferedSource: BufferedSource): Seq[String] = {
6041
val reader = bufferedSource.bufferedReader()
6142
skipBOMIfPresent(reader)
62-
reader.lines().iterator().asScala.map(replaceUnpairedSurrogates).toSeq
43+
reader.lines().iterator().asScala.toSeq
6344
}
6445

6546
private def contentStringFromBufferedSource(bufferedSource: BufferedSource): String = {
@@ -78,7 +59,7 @@ object IOUtils {
7859
}
7960
}
8061

81-
replaceUnpairedSurrogates(stringBuilder.toString)
62+
stringBuilder.toString
8263
}
8364

8465
/** Reads a file at the given path and:

0 commit comments

Comments
 (0)