Skip to content

Commit fe891c6

Browse files
test(*): add accented chars & word joiner test cases
1 parent ab70a32 commit fe891c6

File tree

3 files changed

+14
-1
lines changed

3 files changed

+14
-1
lines changed

Diff for: test/its-specs.js

+11
Original file line number | Diff line number | Diff line change
@@ -85,8 +85,19 @@ describe( 'its functions for .out()', function () {
8585
it( 'its.shape', function () {
8686
expect( nlp.readDoc( 'The' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxx' );
8787
expect( nlp.readDoc( 'TheOne' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
88+
expect( nlp.readDoc( 'The\u2060One' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
8889
expect( nlp.readDoc( 'A1' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xd' );
8990
expect( nlp.readDoc( 'Abcdef123456' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxxdddd' );
91+
expect( nlp.readDoc( 'Poincar\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
92+
expect( nlp.readDoc( 'Poin\u2060car\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
93+
expect( nlp.readDoc( 'Poincare\u0301' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
94+
} );
95+
96+
it( 'its.shape special cases', function () {
97+
expect( nlp.readDoc( 'The\u2060One' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'XxxXxx' );
98+
expect( nlp.readDoc( 'Poincar\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
99+
expect( nlp.readDoc( 'Poin\u2060car\u00e9' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
100+
expect( nlp.readDoc( 'Poincare\u0301' ).tokens().itemAt( 0 ).out( its.shape ) ).to.equal( 'Xxxxx' );
90101
} );
91102

92103
it( 'its.type', function () {

Diff for: test/test-model/feature.js

+2
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
9696
const rgxLC = /^[a-z][a-z\-\–\—\.]*$/;
9797
const rgxUC = /^[A-Z][A-Z\-\–\—\.]*$/;
9898
const rgxTC = /^[A-Z][a-z\-\–\—\.]*$/;
99+
var rgxDiacriticalWordJoiner = /[\u0300-\u036f\u2060]/g;
99100

100101
// The Regex, Category pair goes in to this array for category detection &
101102
// assignment.
@@ -114,6 +115,7 @@ var feature = function ( config, lang, featuresData, isLexicographer ) {
114115
var shape = function ( word ) {
115116
return (
116117
word
118+
.normalize( 'NFD' ).replace( rgxDiacriticalWordJoiner, '' )
117119
.replace( /[A-Z]{4,}/g, 'XXXX' )
118120
// Handle <4 Caps
119121
.replace( /[A-Z]/g, 'X' )

Diff for: test/test-model/languages/cur/models/eng-core-web-model.json

+1 -1
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)