Merge pull request #3 from cto-af/break-last

hildjj · web-flow · commit 790449cfaf97 · 2023-06-17T10:25:44.000-06:00
Mark the last break with a boolean
diff --git a/README.md b/README.md
@@ -19,10 +19,14 @@ const sw = new StringWidth()
 sw.width('foo') // 3
 sw.width('\u{1F4A9}') // 2: Emoji take two cells
 sw.width('#\ufe0f\u20e3') // 2: More complicated emoji
-sw.break('foobar', 3) // [{string: 'foo', cells: 3}, {string: 'bar', cells: 3}]
+sw.break('foobar', 3) // [
+  //   {string: 'foo', cells: 3, last: false},
+  //   {string: 'bar', cells: 3, last: true}
+  // ]
 
 const custom = new StringWidth({
   locale: 'ko-KR',
+  isCJK: true,
   extraWidths: new Map([
     // This example is not actually useful, but demonstrates how to customize
 
@@ -47,13 +51,30 @@ const custom = new StringWidth({
   it's worth a performance shortcut
 - For each grapheme cluster:
   - Get the width of the first code point from extraWidths or the Trie.
-  - If the width is AMBIGUOUS, check the script of the locale to see if we're
-    in an East Asian context.
+  - If the width is AMBIGUOUS, return 2 if we're in a CJK context, otherwise 1.
   - If the width is POTENTIAL_EMOJI, check if the whole grapheme cluster is an
     emoji
 - Since backspace has a negative width, ensure that the total width is never
   less than zero.
 
+## Chinese, Japanese, or Korean (CJK) contexts
+
+Some code points have ambiguous width, which depends upon whether we are
+counting in a CJK context or not.  By default, StringWidth will look at the
+locale that is given (or derived from the environment), and use the default
+script of that locale to decide if this is a Chinese, Japanese, or Korean
+context.  The script identifiers `'Hans'`, `'Hant'`, `'Jpan'`, and `'Kore'`
+signal CJK context.  If desired, this detection can be overridden by passing
+in the `isCJK` field in the constructor options.
+
+## Width breaking
+
+The `break(string, N)` method slices a string into chunks, each of which is at
+most N cells wide, unless a single grapheme cluster is itself wider than N.  It
+lives in this library because it is tightly coupled to the width logic.  It is
+useful for strings longer than N that need a hyphen inserted between segments,
+ensuring that the hyphen doesn't go in the middle of a grapheme cluster.
+
 ## Development
 
 On a new Unicode version being released, delete the `tools/*.txt` files, then
diff --git a/lib/index.js b/lib/index.js
@@ -39,6 +39,8 @@ const NO_EXTRAS = {
  * @prop {ExtraWidths} [extraWidths] A lookup map for code points whose width
  *   you would like to override.  Might be a Map<number, number>, UnicodeTrie,
  *   or anything else that has a `get(codePoint: number) => number` method.
+ * @prop {boolean} [isCJK] If specified, use this value instead of the script
+ *   of the locale to determine whether we are in a CJK context.
  */
 
 export class StringWidth {
@@ -58,15 +60,20 @@ export class StringWidth {
     const options = {
       locale: DEFAULT_LOCALE,
       extraWidths: NO_EXTRAS,
+      isCJK: false,
       ...opts,
     }
     this.#graphemes = new Intl.Segmenter(options.locale, {
       granularity: 'grapheme',
     })
-    const loc = new Intl.Locale(this.locale).maximize()
-    // Script is never undefined after going through Segmenter
-    this.#isCJK = ['Hans', 'Hant', 'Kore', 'Jpan']
-      .includes(/** @type {string} */ (loc.script))
+    if (typeof opts.isCJK === 'boolean') {
+      this.#isCJK = opts.isCJK
+    } else {
+      const loc = new Intl.Locale(this.locale).maximize()
+      // Script is never undefined after going through Segmenter
+      this.#isCJK = ['Hans', 'Hant', 'Kore', 'Jpan']
+        .includes(/** @type {string} */ (loc.script))
+    }
     this.#extraWidths = options.extraWidths
   }
 
@@ -141,6 +148,13 @@ export class StringWidth {
     return ret < 0 ? 0 : ret
   }
 
+  /**
+   * @typedef {object} WidthBreak
+   * @prop {string} string
+   * @prop {number} cells
+   * @prop {boolean} last
+   */
+
   /**
    * Break a string into multiple parts, such that each part has maximum
    * cell length of width, unless a particular grapheme cluster is longer
@@ -149,14 +163,14 @@ export class StringWidth {
    *
    * @param {string} str String to segment
    * @param {number} width Maximum number of display cells for chunks
-   * @returns {{string: string, cells: number}[]}
+   * @returns {WidthBreak[]}
    */
   break(str, width) {
     if (width < 1) {
       throw new RangeError(`Width must be >= 1.  Got ${width}.`)
     }
 
-    /** @type {{string: string, cells: number}[]} */
+    /** @type {WidthBreak[]} */
     const ret = []
     let string = ''
     let cells = 0
@@ -168,32 +182,37 @@ export class StringWidth {
         ret.push({
           string,
           cells: string.length, // Might be less than width for last chunk
+          last: false,
         })
       }
-      return ret
-    }
-
-    for (const segment of this.graphemes(str)) {
-      const w = this.#segmentWidth(segment)
-      if (w > width) {
-        // Skinny width, fat grapheme cluster
-        if (cells > 0) {
-          ret.push({string, cells})
-          string = ''
-          cells = 0
+    } else {
+      for (const segment of this.graphemes(str)) {
+        const w = this.#segmentWidth(segment)
+        if (w > width) {
+          // Skinny width, fat grapheme cluster
+          if (cells > 0) {
+            ret.push({string, cells, last: false})
+            string = ''
+            cells = 0
+          }
+          ret.push({string: segment, cells: w, last: false})
+        } else if (cells + w > width) {
+          ret.push({string, cells, last: false})
+          string = segment
+          cells = w
+        } else {
+          string += segment
+          cells += w
         }
-        ret.push({string: segment, cells: w})
-      } else if (cells + w > width) {
-        ret.push({string, cells})
-        string = segment
-        cells = w
-      } else {
-        string += segment
-        cells += w
+      }
+      if (cells > 0) {
+        ret.push({string, cells, last: true})
       }
     }
-    if (cells > 0) {
-      ret.push({string, cells})
+
+    // Several paths lead here.
+    if (ret.length > 0) {
+      ret[ret.length - 1].last = true
     }
     return ret
   }
diff --git a/lib/widths.js b/lib/widths.js
@@ -3,7 +3,7 @@ import { UnicodeTrie } from '@cto.af/unicode-trie'
 
 export const version = '15.0.0'
 export const inputFileDate = new Date('2022-08-05T22:17:05.000Z')
-export const generatedDate = new Date('2023-06-15T16:15:39.827Z')
+export const generatedDate = new Date('2023-06-17T16:24:28.610Z')
 export const Width = new UnicodeTrie(Buffer.from(
   `AAARAAAAAADdBAAAGx95MJ4Jts2CbayTN4fNM2YRXbY9d4GqY3iCF4gAAHWynzUpqFcLIMD/
    d+1TX5FlY+LXCARlVrgCTwHl6pJCPc1/uZ8maVnYPetMVaWq9NUvn1NEeGIXX6+R8yD4EARB
diff --git a/test/index.test.js b/test/index.test.js
@@ -20,6 +20,10 @@ describe('string widths', () => {
     const ko = new StringWidth({locale: 'ko'})
     assert.equal(ko.width('\xa1'), 2)
   })
+  it('allows overriding CJK', () => {
+    const ko = new StringWidth({locale: 'ko', isCJK: false})
+    assert.equal(ko.isCJK, false)
+  })
   it('handles flags', () => {
     assert.equal(sw.width('\u{1F1F9}\u{1F1FC}b'), 3)
   })
@@ -53,29 +57,29 @@ describe('info', () => {
 
 describe('string breaks', () => {
   it('handles ascii only', () => {
-    assert.deepEqual(sw.break('foo', 10), [{string: 'foo', cells: 3}])
+    assert.deepEqual(sw.break('foo', 10), [{string: 'foo', cells: 3, last: true}])
     assert.deepEqual(sw.break('foobar', 3), [
-      {string: 'foo', cells: 3},
-      {string: 'bar', cells: 3},
+      {string: 'foo', cells: 3, last: false},
+      {string: 'bar', cells: 3, last: true},
     ])
     assert.throws(() => sw.break('foo', 0))
   })
   it('handles non-ascii', () => {
     assert.deepEqual(sw.break('foo\u0308', 10), [
-      {string: 'foo\u0308', cells: 3},
+      {string: 'foo\u0308', cells: 3, last: true},
     ])
     assert.deepEqual(sw.break('foo\u0308ba\u0308r', 3), [
-      {string: 'foo\u0308', cells: 3},
-      {string: 'ba\u0308r', cells: 3},
+      {string: 'foo\u0308', cells: 3, last: false},
+      {string: 'ba\u0308r', cells: 3, last: true},
     ])
     assert.deepEqual(sw.break('\u{1F1F9}\u{1F1FC}b', 1), [
-      {string: '\u{1F1F9}\u{1F1FC}', cells: 2},
-      {string: 'b', cells: 1},
+      {string: '\u{1F1F9}\u{1F1FC}', cells: 2, last: false},
+      {string: 'b', cells: 1, last: true},
     ])
     assert.deepEqual(sw.break('a\u{1F1F9}\u{1F1FC}b', 1), [
-      {string: 'a', cells: 1},
-      {string: '\u{1F1F9}\u{1F1FC}', cells: 2},
-      {string: 'b', cells: 1},
+      {string: 'a', cells: 1, last: false},
+      {string: '\u{1F1F9}\u{1F1FC}', cells: 2, last: false},
+      {string: 'b', cells: 1, last: true},
     ])
   })
 })