fix: Handle margin cutting when encountering multibyte chars

Muscraft · Muscraft · commit b61153d0a198 · 2025-04-16T13:02:44.000-06:00
diff --git a/src/renderer/mod.rs b/src/renderer/mod.rs
@@ -954,12 +954,19 @@ impl Renderer {
         let line_offset = buffer.num_lines();
 
         // Left trim
-        let left = margin.left(source_string.len());
+        let left = margin.left(str_width(&source_string));
 
         // FIXME: This looks fishy. See #132860.
         // Account for unicode characters of width !=0 that were removed.
-        let left = source_string.chars().take(left).map(char_width).sum();
+        let mut taken = 0;
+        source_string.chars().for_each(|ch| {
+            let next = char_width(ch);
+            if taken + next <= left {
+                taken += next;
+            }
+        });
 
+        let left = taken;
         self.draw_line(
             buffer,
             &source_string,
@@ -2018,48 +2025,81 @@ impl Renderer {
     ) {
         // Tabs are assumed to have been replaced by spaces in calling code.
         debug_assert!(!source_string.contains('\t'));
-        let line_len = source_string.len();
+        let line_len = str_width(source_string);
         // Create the source line we will highlight.
         let left = margin.left(line_len);
         let right = margin.right(line_len);
         // FIXME: The following code looks fishy. See #132860.
         // On long lines, we strip the source line, accounting for unicode.
         let mut taken = 0;
+        let mut skipped = 0;
         let code: String = source_string
             .chars()
-            .skip(left)
+            .skip_while(|ch| {
+                skipped += char_width(*ch);
+                skipped <= left
+            })
             .take_while(|ch| {
                 // Make sure that the trimming on the right will fall within the terminal width.
-                let next = char_width(*ch);
-                if taken + next > right - left {
-                    return false;
-                }
-                taken += next;
-                true
+                taken += char_width(*ch);
+                taken <= (right - left)
             })
             .collect();
 
         buffer.puts(line_offset, code_offset, &code, ElementStyle::Quotation);
         let placeholder = self.margin();
-        if margin.was_cut_left() {
+        let padding = str_width(placeholder);
+        let (width_taken, bytes_taken) = if margin.was_cut_left() {
             // We have stripped some code/whitespace from the beginning, make it clear.
+            let mut bytes_taken = 0;
+            let mut width_taken = 0;
+            for ch in code.chars() {
+                width_taken += char_width(ch);
+                bytes_taken += ch.len_utf8();
+
+                if width_taken >= padding {
+                    break;
+                }
+            }
             buffer.puts(
                 line_offset,
                 code_offset,
-                placeholder,
+                &format!("{placeholder:>width_taken$}"),
                 ElementStyle::LineNumber,
             );
-        }
+            (width_taken, bytes_taken)
+        } else {
+            (0, 0)
+        };
+
+        buffer.puts(
+            line_offset,
+            code_offset + width_taken,
+            &code[bytes_taken..],
+            ElementStyle::Quotation,
+        );
+
         if margin.was_cut_right(line_len) {
-            let padding = str_width(placeholder);
-            // We have stripped some code after the rightmost span end, make it clear we did so.
+            // We have stripped some code after the rightmost span end, make it clear we did so.
+            let mut char_taken = 0;
+            let mut width_taken_inner = 0;
+            for ch in code.chars().rev() {
+                width_taken_inner += char_width(ch);
+                char_taken += 1;
+
+                if width_taken_inner >= padding {
+                    break;
+                }
+            }
+
             buffer.puts(
                 line_offset,
-                code_offset + taken - padding,
+                code_offset + width_taken + code[bytes_taken..].chars().count() - char_taken,
                 placeholder,
                 ElementStyle::LineNumber,
             );
         }
+
         buffer.puts(
             line_offset,
             0,
diff --git a/tests/formatter.rs b/tests/formatter.rs
@@ -2136,7 +2136,7 @@ error: title
 2 |   # Ensure that the spans from toml handle utf-8 correctly
 3 |   authors = [
   |  ___________^
-4 | |     { name = "Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯...A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘", email = 1 }
+4 | |     { name = "Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘", email = 1 }
 5 | | ]
   | |_^ annotation
 "#]];
@@ -2162,8 +2162,8 @@ fn unicode_cut_handling2() {
     let expected = str![[r#"
 error: expected item, found `?`
   |
-1.|....
-  |^ expected item
+1 |  ...的。这是宽的。这是宽的。这是宽的。这是宽的。这是宽的。*/?
+  |                                                             ^ expected item
   = note: for a full list of items that can appear in modules, see <https://doc.rust-lang.org/reference/items.html>
 "#]];
 
@@ -2189,8 +2189,8 @@ fn unicode_cut_handling3() {
     let expected = str![[r#"
 error: expected item, found `?`
   |
-1 | ...的。这是宽的。*/?       ...
-^ | expected item
+1 |  ...。这是宽的。这是宽的。这是宽的...
+  |            ^^ expected item
   = note: for a full list of items that can appear in modules, see <https://doc.rust-lang.org/reference/items.html>
 "#]];
 
@@ -2256,10 +2256,10 @@ fn main() {
 error[E0308]: mismatched types
   --> $DIR/non-whitespace-trimming-unicode.rs:4:415
    |
-LL | ...♰♱♲♳♴♵♶♷♸♹♺♻♼♽♾♿⚀⚁⚂⚃⚄⚅⚆⚈⚉4"; let _: () = 42;  let _: &str = "🦀☀☁☂☃☄★☆☇☈☉☊☋☌☍☎☏☐☑☒☓  ☖☗☘☙☚☛☜☝☞☟☠☡☢☣☤☥☦☧☨☩☪☫☬☭☮☯☰☱☲☳☴☵☶☷☸☹☺☻☼☽☾☿♀♁♂...
-   |                                         --   ^^ expected `()`, found integer
-   |                                         |
-   |                                         expected due to this
+LL | ...♧♨♩♪♫♬♭♮♯♰♱♲♳♴♵♶♷♸♹♺♻♼♽♾♿⚀⚁⚂⚃⚄⚅⚆⚈⚉4"; let _: () = 42;  let _: &str = "🦀☀☁☂☃☄★☆☇☈☉☊☋☌☍☎☏☐☑☒☓  ☖☗☘☙☚☛☜☝☞☟☠☡☢☣☤☥☦☧☨☩☪☫☬☭☮☯☰☱☲☳☴☵☶☷...
+   |                                                  --   ^^ expected `()`, found integer
+   |                                                  |
+   |                                                  expected due to this
 "#]];
 
     let renderer = Renderer::plain().anonymized_line_numbers(true);
@@ -2315,11 +2315,11 @@ fn main() {
 error[E0369]: cannot add `&str` to `&str`
    ╭▸ $DIR/non-1-width-unicode-multiline-label.rs:7:260
    │
-LL │ …ཽཾཿ྄ཱྀྀྂྃ྅྆྇ྈྉྊྋྌྍྎྏྐྑྒྒྷྔྕྖྗ྘ྙྚྛྜྜྷྞྟྠྡྡྷྣྤྥྦྦྷྨྩྪྫྫྷྭྮྯྰྱྲླྴྵྶྷྸྐྵྺྻྼ྽྾྿࿀࿁࿂࿃࿄࿅࿆࿇࿈࿉࿊࿋…࿍࿎࿏࿐࿑࿒࿓࿔࿕࿖࿗࿘࿙࿚"; let _a = unicode_is_fun + " really fun!";
-   │                                                  ┬───────────── ┯ ────────────── &str
-   │                                                  │              │
-   │                                                  │              `+` cannot be used to concatenate two `&str` strings
-   │                                                  &str
+LL │ …࿆࿇࿈࿉࿊࿋࿌࿍࿎࿏࿐࿑࿒࿓࿔࿕࿖࿗࿘࿙࿚"; let _a = unicode_is_fun + " really fun!";
+   │                                  ┬───────────── ┯ ────────────── &str
+   │                                  │              │
+   │                                  │              `+` cannot be used to concatenate two `&str` strings
+   │                                  &str
    │
    ╰ note: string concatenation requires an owned `String` on the left
 help: create an owned `String` from a string reference
@@ -2377,7 +2377,7 @@ LL |     include!("not-utf8.bin");
 note: byte `193` is not valid utf-8
   --> $DIR/not-utf8.bin:1:1
    |
-LL | �|�␂!5�cc␕␂�Ӻi��WWj�ȥ�'�}�␒�J�ȉ��W�␞O�@����␜w�V���LO����␔[ ␃_�'���SQ�~ذ��ų&��-    ��lN~��!@␌ _#���kQ��h�␝�:�...
+LL | �|�␂!5�cc␕␂�Ӻi��WWj�ȥ�'�}�␒�J�ȉ��W�␞O�@����␜w�V���LO����␔[ ␃_�'���SQ�~ذ��ų&��-    ��lN~��!@␌ _#���kQ��h�␝�:�␜␇�
    | ^
    = note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info)
 "#]];