Skip to content

Commit 790449c

Browse files
authored
Merge pull request #3 from cto-af/break-last
Mark the last break with a boolean
2 parents c7b9506 + fca478f commit 790449c

File tree

4 files changed

+86
-42
lines changed

4 files changed

+86
-42
lines changed

README.md

+24-3
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@ const sw = new StringWidth()
1919
sw.width('foo') // 3
2020
sw.width('\u{1F4A9}') // 2: Emoji take two cells
2121
sw.width('#\ufe0f\u20e3') // 2: More complicated emoji
22-
sw.break('foobar', 3) // [{string: 'foo', cells: 3}, {string: 'bar', cells: 3}]
22+
sw.break('foobar', 3) // [
23+
// {string: 'foo', cells: 3, last: false},
24+
// {string: 'bar', cells: 3, last: true}
25+
// ]
2326

2427
const custom = new StringWidth({
2528
locale: 'ko-KR',
29+
isCJK: true,
2630
extraWidths: new Map([
2731
// This example is not actually useful, but demonstrates how to customize
2832

@@ -47,13 +51,30 @@ const custom = new StringWidth({
4751
it's worth a performance shortcut
4852
- For each grapheme cluster:
4953
- Get the width of the first code point from extraWidths or the Trie.
50-
- If the width is AMBIGUOUS, check the script of the locale to see if we're
51-
in an East Asian context.
54+
- If the width is AMBIGUOUS, return 2 if we're in a CJK context, otherwise 1.
5255
- If the width is POTENTIAL_EMOJI, check if the whole grapheme cluster is an
5356
emoji
5457
- Since backspace has a negative width, ensure that the total width is never
5558
less than zero.
5659

60+
## Chinese, Japanese, or Korean (CJK) contexts
61+
62+
Some code points have ambiguous width, which depends upon whether we are
63+
counting in a CJK context or not. By default, StringWidth will look at the
64+
locale that is given (or derived from the environment), and use the default
65+
script of that locale to decide if this is a Chinese, Japanese, or Korean
66+
context. The script identifiers `'Hans'`, `'Hant'`, `'Jpan'`, and `'Kore'`
67+
signal CJK context. If desired, this detection can be overridden by passing
68+
in the `isCJK` field in the constructor options.
69+
70+
## Width breaking
71+
72+
The `break(string, N)` method slices a string into chunks, each of which is at
73+
most N cells (a single grapheme cluster wider than N becomes its own chunk). This was so entangled with the width logic that it made sense
74+
to include it in this library. It is useful for strings that are longer than N that
75+
need to have a hyphen inserted between each of the segments, ensuring that the
76+
hyphen doesn't go in the middle of a grapheme cluster.
77+
5778
## Development
5879

5980
On a new Unicode version being released, delete the `tools/*.txt` files, then

lib/index.js

+46-27
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ const NO_EXTRAS = {
3939
* @prop {ExtraWidths} [extraWidths] A lookup map for code points whose width
4040
* you would like to override. Might be a Map<number, number>, UnicodeTrie,
4141
* or anything else that has a `get(codePoint: number) => number` method.
42+
* @prop {boolean} [isCJK] If specified, override using the script of the
43+
* locale to determine whether we are in a CJK context.
4244
*/
4345

4446
export class StringWidth {
@@ -58,15 +60,20 @@ export class StringWidth {
5860
const options = {
5961
locale: DEFAULT_LOCALE,
6062
extraWidths: NO_EXTRAS,
63+
isCJK: false,
6164
...opts,
6265
}
6366
this.#graphemes = new Intl.Segmenter(options.locale, {
6467
granularity: 'grapheme',
6568
})
66-
const loc = new Intl.Locale(this.locale).maximize()
67-
// Script is never undefined after going through Segmenter
68-
this.#isCJK = ['Hans', 'Hant', 'Kore', 'Jpan']
69-
.includes(/** @type {string} */ (loc.script))
69+
if (typeof opts.isCJK === 'boolean') {
70+
this.#isCJK = opts.isCJK
71+
} else {
72+
const loc = new Intl.Locale(this.locale).maximize()
73+
// Script is never undefined after going through Segmenter
74+
this.#isCJK = ['Hans', 'Hant', 'Kore', 'Jpan']
75+
.includes(/** @type {string} */ (loc.script))
76+
}
7077
this.#extraWidths = options.extraWidths
7178
}
7279

@@ -141,6 +148,13 @@ export class StringWidth {
141148
return ret < 0 ? 0 : ret
142149
}
143150

151+
/**
152+
* @typedef {object} WidthBreak
153+
* @prop {string} string
154+
* @prop {number} cells
155+
* @prop {boolean} last
156+
*/
157+
144158
/**
145159
* Break a string into multiple parts, such that each part has maximum
146160
* cell length of width, unless a particular grapheme cluster is longer
@@ -149,14 +163,14 @@ export class StringWidth {
149163
*
150164
* @param {string} str String to segment
151165
* @param {number} width Maximum number of display cells for chunks
152-
* @returns {{string: string, cells: number}[]}
166+
* @returns {WidthBreak[]}
153167
*/
154168
break(str, width) {
155169
if (width < 1) {
156170
throw new RangeError(`Width must be >= 1. Got ${width}.`)
157171
}
158172

159-
/** @type {{string: string, cells: number}[]} */
173+
/** @type {WidthBreak[]} */
160174
const ret = []
161175
let string = ''
162176
let cells = 0
@@ -168,32 +182,37 @@ export class StringWidth {
168182
ret.push({
169183
string,
170184
cells: string.length, // Might be less than width for last chunk
185+
last: false,
171186
})
172187
}
173-
return ret
174-
}
175-
176-
for (const segment of this.graphemes(str)) {
177-
const w = this.#segmentWidth(segment)
178-
if (w > width) {
179-
// Skinny width, fat grapheme cluster
180-
if (cells > 0) {
181-
ret.push({string, cells})
182-
string = ''
183-
cells = 0
188+
} else {
189+
for (const segment of this.graphemes(str)) {
190+
const w = this.#segmentWidth(segment)
191+
if (w > width) {
192+
// Skinny width, fat grapheme cluster
193+
if (cells > 0) {
194+
ret.push({string, cells, last: false})
195+
string = ''
196+
cells = 0
197+
}
198+
ret.push({string: segment, cells: w, last: false})
199+
} else if (cells + w > width) {
200+
ret.push({string, cells, last: false})
201+
string = segment
202+
cells = w
203+
} else {
204+
string += segment
205+
cells += w
184206
}
185-
ret.push({string: segment, cells: w})
186-
} else if (cells + w > width) {
187-
ret.push({string, cells})
188-
string = segment
189-
cells = w
190-
} else {
191-
string += segment
192-
cells += w
207+
}
208+
if (cells > 0) {
209+
ret.push({string, cells, last: true})
193210
}
194211
}
195-
if (cells > 0) {
196-
ret.push({string, cells})
212+
213+
// Several paths lead here.
214+
if (ret.length > 0) {
215+
ret[ret.length - 1].last = true
197216
}
198217
return ret
199218
}

lib/widths.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { UnicodeTrie } from '@cto.af/unicode-trie'
33

44
export const version = '15.0.0'
55
export const inputFileDate = new Date('2022-08-05T22:17:05.000Z')
6-
export const generatedDate = new Date('2023-06-15T16:15:39.827Z')
6+
export const generatedDate = new Date('2023-06-17T16:24:28.610Z')
77
export const Width = new UnicodeTrie(Buffer.from(
88
`AAARAAAAAADdBAAAGx95MJ4Jts2CbayTN4fNM2YRXbY9d4GqY3iCF4gAAHWynzUpqFcLIMD/
99
d+1TX5FlY+LXCARlVrgCTwHl6pJCPc1/uZ8maVnYPetMVaWq9NUvn1NEeGIXX6+R8yD4EARB

test/index.test.js

+15-11
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ describe('string widths', () => {
2020
const ko = new StringWidth({locale: 'ko'})
2121
assert.equal(ko.width('\xa1'), 2)
2222
})
23+
it('allows overriding CJK', () => {
24+
const ko = new StringWidth({locale: 'ko', isCJK: false})
25+
assert.equal(ko.isCJK, false)
26+
})
2327
it('handles flags', () => {
2428
assert.equal(sw.width('\u{1F1F9}\u{1F1FC}b'), 3)
2529
})
@@ -53,29 +57,29 @@ describe('info', () => {
5357

5458
describe('string breaks', () => {
5559
it('handles ascii only', () => {
56-
assert.deepEqual(sw.break('foo', 10), [{string: 'foo', cells: 3}])
60+
assert.deepEqual(sw.break('foo', 10), [{string: 'foo', cells: 3, last: true}])
5761
assert.deepEqual(sw.break('foobar', 3), [
58-
{string: 'foo', cells: 3},
59-
{string: 'bar', cells: 3},
62+
{string: 'foo', cells: 3, last: false},
63+
{string: 'bar', cells: 3, last: true},
6064
])
6165
assert.throws(() => sw.break('foo', 0))
6266
})
6367
it('handles non-ascii', () => {
6468
assert.deepEqual(sw.break('foo\u0308', 10), [
65-
{string: 'foo\u0308', cells: 3},
69+
{string: 'foo\u0308', cells: 3, last: true},
6670
])
6771
assert.deepEqual(sw.break('foo\u0308ba\u0308r', 3), [
68-
{string: 'foo\u0308', cells: 3},
69-
{string: 'ba\u0308r', cells: 3},
72+
{string: 'foo\u0308', cells: 3, last: false},
73+
{string: 'ba\u0308r', cells: 3, last: true},
7074
])
7175
assert.deepEqual(sw.break('\u{1F1F9}\u{1F1FC}b', 1), [
72-
{string: '\u{1F1F9}\u{1F1FC}', cells: 2},
73-
{string: 'b', cells: 1},
76+
{string: '\u{1F1F9}\u{1F1FC}', cells: 2, last: false},
77+
{string: 'b', cells: 1, last: true},
7478
])
7579
assert.deepEqual(sw.break('a\u{1F1F9}\u{1F1FC}b', 1), [
76-
{string: 'a', cells: 1},
77-
{string: '\u{1F1F9}\u{1F1FC}', cells: 2},
78-
{string: 'b', cells: 1},
80+
{string: 'a', cells: 1, last: false},
81+
{string: '\u{1F1F9}\u{1F1FC}', cells: 2, last: false},
82+
{string: 'b', cells: 1, last: true},
7983
])
8084
})
8185
})

0 commit comments

Comments
 (0)