Skip to content

Commit d8c4cfb

Browse files
committed
better string constant representation. properly handle latin-1 chars
1 parent 2f4f0b9 commit d8c4cfb

File tree

2 files changed

+46
-12
lines changed

2 files changed

+46
-12
lines changed

src/bytecode.rs

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use smallvec::smallvec;
1010
pub enum LispObject {
1111
Symbol(String),
1212
Keyword(String),
13+
UnibyteStr(Vec<u8>),
1314
Str(String),
1415
Int(i64),
1516
Float(String), // use string for Eq and Ord
@@ -41,23 +42,43 @@ impl LispObject {
4142
LispObject::Keyword(s) => format!(":{}", s),
4243
LispObject::Str(s) => {
4344
let mut result = String::new();
45+
result.reserve(s.len() * 2 + 2);
4446
result.push('"');
45-
let mut last_is_escape = false;
4647
for c in s.chars() {
47-
if c == '"' || c == '\\' {
48+
if c == '\"' || c == '\\' {
4849
result.push('\\');
4950
result.push(c);
50-
last_is_escape = false;
51-
} else if (c as u32) < 32 || ((c as u32) > 126 && (c as u32) < 256) {
52-
result += &format!("\\{:o}", c as u32);
53-
last_is_escape = true;
51+
} else if (c as u32) < 32 || (c as u32) == 127 { // not printable
52+
result += &format!("\\{:03o}", c as u32);
5453
} else {
55-
// https://www.gnu.org/software/emacs/manual/html_node/elisp/Non_002dASCII-in-Strings.html
56-
if last_is_escape && ('0'..='8').contains(&c) {
57-
result += "\\ ";
58-
}
5954
result.push(c);
60-
last_is_escape = false;
55+
}
56+
}
57+
result.push('"');
58+
result
59+
},
60+
LispObject::UnibyteStr(vec) => {
61+
let mut result = String::new();
62+
result.reserve(vec.len() * 4 + 2);
63+
result.push('"');
64+
for c in vec {
65+
match *c {
66+
7 => result += "\\a",
67+
8 => result += "\\b",
68+
9 => result += "\\t",
69+
10 => result += "\\n",
70+
11 => result += "\\v",
71+
12 => result += "\\f",
72+
13 => result += "\\r",
73+
127 => result += "\\d",
74+
27 => result += "\\e",
75+
0..=26 => { // \^@ \^A \^B ... \^Z
76+
result += &format!("\\^{}", (*c as u32 + 64) as u8 as char);
77+
},
78+
27..=31 | 128..=255 | 34 | 92 => { // oct, for unprintable and '"' and '\\'
79+
result += &format!("\\{:03o}", *c as u32);
80+
},
81+
_ => result.push(*c as char), // printable
6182
}
6283
}
6384
result.push('"');
@@ -386,7 +407,7 @@ impl BytecodeCompiler {
386407
fn into_repl(self) -> Result<String> {
387408
let (code, constants, max_stack_size) = self.into_bytecode()?;
388409
Ok(format!("#[0 {} {} {}]",
389-
LispObject::Str(code.into_iter().map(|x| x as char).collect()).to_repl(),
410+
LispObject::UnibyteStr(code).to_repl(),
390411
LispObject::Vector(constants).to_repl(),
391412
max_stack_size))
392413
}
@@ -401,3 +422,13 @@ pub fn generate_bytecode_repl(value: &json::Value, options: BytecodeOptions) ->
401422
compiler.compile(value);
402423
compiler.into_repl()
403424
}
425+
426+
427+
// Pins the text `to_repl` yields for one-byte `UnibyteStr` values, one case per
// escaping style used by the `UnibyteStr` match arm: caret notation, a literal
// printable byte, the `\d` shorthand, and a three-digit octal escape.
#[test]
428+
fn test_string_repl() {
429+
// Bytes 0 and 26 are the two ends of the `0..=26` caret pattern (`\^@` and `\^Z`).
assert_eq!(LispObject::UnibyteStr("\x00".into()).to_repl(), r#""\^@""#);
430+
assert_eq!(LispObject::UnibyteStr("\x1a".into()).to_repl(), r#""\^Z""#);
431+
// A space is printable, so it is emitted unchanged between the quotes.
assert_eq!(LispObject::UnibyteStr("\x20".into()).to_repl(), r#"" ""#);
432+
// Byte 127 is written with the `\d` shorthand.
assert_eq!(LispObject::UnibyteStr("\x7f".into()).to_repl(), r#""\d""#);
433+
// Bytes 128..=255 are written as three-digit octal, so 0xff becomes `\377`.
assert_eq!(LispObject::UnibyteStr(vec![0xff]).to_repl(), r#""\377""#);
434+
}

tests/bytecode_test.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ fn run_one_test(json_str: &str, object_type: bytecode::ObjectType) -> Result<()>
4949

5050
#[test]
5151
fn test_bytecode() {
52+
// Unicode test: Latin-1 letters, ASCII, a JSON-escaped newline, and CJK text in one string value
53+
run_one_test(r#"{"a":"ÀÁÂÃÄÅÆÇÈÉÊËÌ abcd \n 你好世界"}"#, bytecode::ObjectType::Plist).unwrap();
54+
5255
for object_type in vec![bytecode::ObjectType::Plist,
5356
bytecode::ObjectType::Alist,
5457
bytecode::ObjectType::Hashtable] {

0 commit comments

Comments
 (0)