Skip to content

Commit a13fdc4

Browse files
committed
Mop-up for commit 85feb77.
Adjust commentary in regc_pg_locale.c to remove mention of the possibility of not having <wctype.h> functions, since we no longer consider that. Eliminate duplicate code in wparser_def.c by generalizing the p_iswhat macro to take a parameter saying what to return for non-ASCII chars in C locale. (That's not really a consequence of the USE_WIDE_UPPER_LOWER-ectomy, but I noticed it while doing that.)
1 parent d7c4fa3 commit a13fdc4

File tree

3 files changed

+59
-125
lines changed

3 files changed

+59
-125
lines changed

Diff for: expected/pg_tsparser.out

+28
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,31 @@ SELECT to_tsvector('english_ts', 'test2.com');
236236
'com':3 'test2':2 'test2.com':1
237237
(1 row)
238238

239+
-- Test non-ASCII symbols
240+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');
241+
tokid | token
242+
-------+--------
243+
17 | аб_вгд
244+
10 | аб
245+
12 | _
246+
10 | вгд
247+
12 |
248+
15 | 12_абв
249+
9 | 12
250+
12 | _
251+
10 | абв
252+
12 |
253+
15 | 12-абв
254+
9 | 12
255+
12 | -
256+
10 | абв
257+
12 |
258+
2 | абв
259+
12 | .
260+
2 | рф
261+
12 |
262+
3 | абв2
263+
12 | .
264+
2 | рф
265+
(22 rows)
266+

Diff for: sql/pg_tsparser.sql

+3
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ SELECT to_tsvector('english_ts', '12_abc');
2626
SELECT to_tsvector('english_ts', '12-abc');
2727
SELECT to_tsvector('english_ts', 'test.com');
2828
SELECT to_tsvector('english_ts', 'test2.com');
29+
30+
-- Test non-ASCII symbols
31+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');

Diff for: tsparser.c

+28-125
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,9 @@ typedef struct TParser
249249
/* string and position information */
250250
char *str; /* multibyte string */
251251
int lenstr; /* length of mbstring */
252-
#ifdef USE_WIDE_UPPER_LOWER
253252
wchar_t *wstr; /* wide character string */
254253
pg_wchar *pgwstr; /* wide character string for C-locale */
255254
bool usewide;
256-
#endif
257255

258256
/* State of parse */
259257
int charmaxlen;
@@ -302,8 +300,6 @@ TParserInit(char *str, int len)
302300
prs->str = str;
303301
prs->lenstr = len;
304302

305-
#ifdef USE_WIDE_UPPER_LOWER
306-
307303
/*
308304
* Use wide char code only when max encoding length > 1.
309305
*/
@@ -331,7 +327,6 @@ TParserInit(char *str, int len)
331327
}
332328
else
333329
prs->usewide = false;
334-
#endif
335330

336331
prs->state = newTParserPosition(NULL);
337332
prs->state->state = TPS_Base;
@@ -368,15 +363,12 @@ TParserCopyInit(const TParser *orig)
368363
prs->charmaxlen = orig->charmaxlen;
369364
prs->str = orig->str + orig->state->posbyte;
370365
prs->lenstr = orig->lenstr - orig->state->posbyte;
371-
372-
#ifdef USE_WIDE_UPPER_LOWER
373366
prs->usewide = orig->usewide;
374367

375368
if (orig->pgwstr)
376369
prs->pgwstr = orig->pgwstr + orig->state->poschar;
377370
if (orig->wstr)
378371
prs->wstr = orig->wstr + orig->state->poschar;
379-
#endif
380372

381373
prs->state = newTParserPosition(NULL);
382374
prs->state->state = TPS_Base;
@@ -401,12 +393,10 @@ TParserClose(TParser *prs)
401393
prs->state = ptr;
402394
}
403395

404-
#ifdef USE_WIDE_UPPER_LOWER
405396
if (prs->wstr)
406397
pfree(prs->wstr);
407398
if (prs->pgwstr)
408399
pfree(prs->pgwstr);
409-
#endif
410400

411401
#ifdef WPARSER_TRACE
412402
fprintf(stderr, "closing parser\n");
@@ -445,96 +435,45 @@ TParserCopyClose(TParser *prs)
445435
* - if locale is C then we use pgwstr instead of wstr.
446436
*/
447437

448-
#ifdef USE_WIDE_UPPER_LOWER
449-
450-
#define p_iswhat(type) \
438+
#define p_iswhat(type, nonascii) \
439+
\
451440
static int \
452-
p_is##type(TParser *prs) { \
453-
Assert( prs->state ); \
454-
if ( prs->usewide ) \
441+
p_is##type(TParser *prs) \
442+
{ \
443+
Assert(prs->state); \
444+
if (prs->usewide) \
455445
{ \
456-
if ( prs->pgwstr ) \
446+
if (prs->pgwstr) \
457447
{ \
458448
unsigned int c = *(prs->pgwstr + prs->state->poschar); \
459-
if ( c > 0x7f ) \
460-
return 0; \
461-
return is##type( c ); \
449+
if (c > 0x7f) \
450+
return nonascii; \
451+
return is##type(c); \
462452
} \
463-
return isw##type( *( prs->wstr + prs->state->poschar ) ); \
453+
return isw##type(*(prs->wstr + prs->state->poschar)); \
464454
} \
465-
\
466-
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
467-
} \
455+
return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
456+
} \
468457
\
469458
static int \
470-
p_isnot##type(TParser *prs) { \
459+
p_isnot##type(TParser *prs) \
460+
{ \
471461
return !p_is##type(prs); \
472462
}
473463

474-
static int
475-
p_isalnum(TParser *prs)
476-
{
477-
Assert(prs->state);
478-
479-
if (prs->usewide)
480-
{
481-
if (prs->pgwstr)
482-
{
483-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
484-
485-
/*
486-
* any non-ascii symbol with multibyte encoding with C-locale is
487-
* an alpha character
488-
*/
489-
if (c > 0x7f)
490-
return 1;
491-
492-
return isalnum(c);
493-
}
494-
495-
return iswalnum(*(prs->wstr + prs->state->poschar));
496-
}
497-
498-
return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
499-
}
500-
static int
501-
p_isnotalnum(TParser *prs)
502-
{
503-
return !p_isalnum(prs);
504-
}
505-
506-
static int
507-
p_isalpha(TParser *prs)
508-
{
509-
Assert(prs->state);
510-
511-
if (prs->usewide)
512-
{
513-
if (prs->pgwstr)
514-
{
515-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
516-
517-
/*
518-
* any non-ascii symbol with multibyte encoding with C-locale is
519-
* an alpha character
520-
*/
521-
if (c > 0x7f)
522-
return 1;
523-
524-
return isalpha(c);
525-
}
526-
527-
return iswalpha(*(prs->wstr + prs->state->poschar));
528-
}
529-
530-
return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
531-
}
532-
533-
static int
534-
p_isnotalpha(TParser *prs)
535-
{
536-
return !p_isalpha(prs);
537-
}
464+
/*
465+
* In C locale with a multibyte encoding, any non-ASCII symbol is considered
466+
* an alpha character, but not a member of other char classes.
467+
*/
468+
p_iswhat(alnum, 1)
469+
p_iswhat(alpha, 1)
470+
p_iswhat(digit, 0)
471+
p_iswhat(lower, 0)
472+
p_iswhat(print, 0)
473+
p_iswhat(punct, 0)
474+
p_iswhat(space, 0)
475+
p_iswhat(upper, 0)
476+
p_iswhat(xdigit, 0)
538477

539478
/* p_iseq should be used only for ascii symbols */
540479

@@ -544,39 +483,6 @@ p_iseq(TParser *prs, char c)
544483
Assert(prs->state);
545484
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
546485
}
547-
#else /* USE_WIDE_UPPER_LOWER */
548-
549-
#define p_iswhat(type) \
550-
static int \
551-
p_is##type(TParser *prs) { \
552-
Assert( prs->state ); \
553-
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
554-
} \
555-
\
556-
static int \
557-
p_isnot##type(TParser *prs) { \
558-
return !p_is##type(prs); \
559-
}
560-
561-
562-
static int
563-
p_iseq(TParser *prs, char c)
564-
{
565-
Assert(prs->state);
566-
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
567-
}
568-
569-
p_iswhat(alnum)
570-
p_iswhat(alpha)
571-
#endif /* USE_WIDE_UPPER_LOWER */
572-
573-
p_iswhat(digit)
574-
p_iswhat(lower)
575-
p_iswhat(print)
576-
p_iswhat(punct)
577-
p_iswhat(space)
578-
p_iswhat(upper)
579-
p_iswhat(xdigit)
580486

581487
static int
582488
p_isEOF(TParser *prs)
@@ -793,8 +699,6 @@ p_isspecial(TParser *prs)
793699
if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
794700
return 1;
795701

796-
#ifdef USE_WIDE_UPPER_LOWER
797-
798702
/*
799703
* Unicode characters in the 'Mark, Spacing Combining' category. These
800704
* characters are not alpha, but they do not break words either.
@@ -1058,7 +962,6 @@ p_isspecial(TParser *prs)
1058962
StopHigh = StopMiddle;
1059963
}
1060964
}
1061-
#endif
1062965

1063966
return 0;
1064967
}

0 commit comments

Comments
 (0)