Skip to content

Commit 5623c32

Browse files
feat(*): add support for non-regular spaces
Support em/en, third/quarter, thin/hair, and medium math spaces, as well as regular/narrow nbsp. References winkjs/wink-eng-lite-web-model#15.
1 parent 8baf2dd commit 5623c32

File tree

3 files changed

+51
-2
lines changed

3 files changed

+51
-2
lines changed

src/tokenizer.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ var tokenizer = function ( trex, categories, preserve ) {
197197
// Skip empty (`''`) token.
198198
if ( !t ) continue; // eslint-disable-line no-continue
199199
// Non-empty token:
200-
const hasNBSP = ( /\u00a0/ ).test( t );
200+
const hasNBSP = ( /[\u00a0\u2002-\u2005\u2009\u200a\u202f\u205f]/ ).test( t );
201201
if ( t[ 0 ] === ' ' || hasNBSP ) {
202202
// This indicates spaces: count them.
203203
precedingSpaces = t.length;

test/test-model/languages/cur/models/eng-core-web-model.json

+1-1
Large diffs are not rendered by default.

test/wink-nlp-specs.js

+49
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
144144
expect( nlp.readDoc( nbspTokensArray.join(' \u00a0\u00a0') ).out() ).to.equal( nbspTokensArray.join(' \u00a0\u00a0') );
145145
} );
146146

147+
it( 'should tokenize/detokenize the text with non-regular spaces', function () {
148+
// Reconstruction.
149+
expect( nlp.readDoc( nbspTokensArray.join('\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join('\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
150+
expect( nlp.readDoc( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
151+
expect( nlp.readDoc( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
152+
expect( nlp.readDoc( nbspTokensArray.join('\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join('\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
153+
expect( nlp.readDoc( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
154+
expect( nlp.readDoc( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') ).out() ).to.equal( nbspTokensArray.join(' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f') );
155+
} );
156+
147157
it( 'should tokenize/detokenize a sentence with non-breaking spaces', function () {
148158
var textWith2S = 'I met Mr.\u00a0Gandhi. Mr.\u00a0Gandhi is a nice person.';
149159
var sentences = nlp.readDoc( textWith2S ).sentences();
@@ -154,6 +164,16 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
154164
} );
155165
} );
156166

167+
it( 'should tokenize/detokenize a sentence with non-regular spaces', function () {
168+
var textWith2S = 'I met Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi. Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi is a nice person.';
169+
var sentences = nlp.readDoc( textWith2S ).sentences();
170+
var sentencesText = [ 'I met Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi.', 'Mr.\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fGandhi is a nice person.' ];
171+
// Reconstruction.
172+
sentences.each( ( s, k ) => {
173+
expect( s.out() ).to.equal( sentencesText[ k ] );
174+
} );
175+
} );
176+
157177
it( 'should tokenize/detokenize the entities\' value as text with non-breaking spaces', function () {
158178
var textWith2S = 'I purchased 10 mangoes on March\u00a010th for US$\u00a099.00.';
159179
var entities = nlp.readDoc( textWith2S ).entities();
@@ -165,6 +185,18 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
165185
} );
166186
} );
167187

188+
it( 'should tokenize/detokenize the entities\' value as text with non-regular spaces', function () {
189+
var textWith2S = 'I purchased 10 mangoes on March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th for US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00.';
190+
var entities = nlp.readDoc( textWith2S ).entities();
191+
var entitiesText = [ '10', 'March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th', 'US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00' ];
192+
console.log(666666, entities.out(),nlp.readDoc( textWith2S ).tokens().out());
193+
// Reconstruction.
194+
entities.each( ( e, k ) => {
195+
expect( e.out( ) ).to.equal( entitiesText[ k ] );
196+
expect( e.out( its.value, as.text ) ).to.equal( entitiesText[ k ] );
197+
} );
198+
} );
199+
168200
it( 'should preserve non-breaking spaces with mark up', function () {
169201
var textWith2S = 'I purchased mangoes on March\u00a010th for US$\u00a099.00.';
170202
var doc4mark = nlp.readDoc( textWith2S );
@@ -174,6 +206,15 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
174206
expect( doc4mark.out(its.markedUpText) ).to.equal( markedText );
175207
} );
176208

209+
it( 'should preserve non-regular spaces with mark up', function () {
210+
var textWith2S = 'I purchased mangoes on March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th for US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00.';
211+
var doc4mark = nlp.readDoc( textWith2S );
212+
doc4mark.entities().each((e) => e.markup());
213+
var markedText = 'I purchased mangoes on <mark>March\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f10th</mark> for <mark>US$\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f99.00</mark>.';
214+
// Reconstruction.
215+
expect( doc4mark.out(its.markedUpText) ).to.equal( markedText );
216+
} );
217+
177218
it( 'should correctly reconstruct non-breaking spaces with its.precedingSpaces', function () {
178219
var text = 'U.S.A is my birth place. \u00a0 I was born\u00a0on 06.12.1924.';
179220
var reconstructed = [];
@@ -182,6 +223,14 @@ describe( 'wink-nlp test-coverage and basic behavior', function () {
182223
expect( reconstructed.join( '' ) ).to.equal( ' \u00a0 I was born\u00a0on 06.12.1924.' );
183224
} );
184225

226+
it( 'should correctly reconstruct non-regular spaces with its.precedingSpaces', function () {
227+
var text = 'U.S.A is my birth place. \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f I was born\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fon 06.12.1924.';
228+
var reconstructed = [];
229+
nlp.readDoc( text ).sentences().itemAt(1).tokens().each( ( t ) => reconstructed.push( t.out(its.precedingSpaces), t.out() ));
230+
// Reconstruction.
231+
expect( reconstructed.join( '' ) ).to.equal( ' \u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205f I was born\u2002\u2003\u2004\u2005\u2009\u200a\u202f\u205fon 06.12.1924.' );
232+
} );
233+
185234
it( 'should not contain empty tokens', function () {
186235
var doc = nlp.readDoc( sentence );
187236
expect( findEmptyTokens( doc ) ).deep.equal( [] );

0 commit comments

Comments
 (0)