Skip to content

Commit 9b20e54

Browse files
committed
Merge branch 'master' into stable
2 parents c98c15f + a13fdc4 commit 9b20e54

File tree

5 files changed

+69
-126
lines changed

5 files changed

+69
-126
lines changed

Diff for: LICENSE

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
pg_tsparser is released under the PostgreSQL License, a liberal Open Source license, similar to the BSD or MIT licenses.
2+
3+
Copyright (c) 2016-2018, Postgres Professional
4+
5+
Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies.
6+
7+
IN NO EVENT SHALL POSTGRES PROFESSIONAL BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRES PROFESSIONAL HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8+
9+
POSTGRES PROFESSIONAL SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND POSTGRES PROFESSIONAL HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

Diff for: README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ SELECT to_tsvector('english', 'rel-3.2-A') as def_parser,
3434

3535
## License
3636

37-
This module available under the same license as
37+
This module is available under the [license](LICENSE) similar to
3838
[PostgreSQL](http://www.postgresql.org/about/licence/).
3939

4040
## Installation

Diff for: expected/pg_tsparser.out

+28
Original file line numberDiff line numberDiff line change
@@ -236,3 +236,31 @@ SELECT to_tsvector('english_ts', 'test2.com');
236236
'com':3 'test2':2 'test2.com':1
237237
(1 row)
238238

239+
-- Test non-ASCII symbols
240+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');
241+
tokid | token
242+
-------+--------
243+
17 | аб_вгд
244+
10 | аб
245+
12 | _
246+
10 | вгд
247+
12 |
248+
15 | 12_абв
249+
9 | 12
250+
12 | _
251+
10 | абв
252+
12 |
253+
15 | 12-абв
254+
9 | 12
255+
12 | -
256+
10 | абв
257+
12 |
258+
2 | абв
259+
12 | .
260+
2 | рф
261+
12 |
262+
3 | абв2
263+
12 | .
264+
2 | рф
265+
(22 rows)
266+

Diff for: sql/pg_tsparser.sql

+3
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,6 @@ SELECT to_tsvector('english_ts', '12_abc');
2626
SELECT to_tsvector('english_ts', '12-abc');
2727
SELECT to_tsvector('english_ts', 'test.com');
2828
SELECT to_tsvector('english_ts', 'test2.com');
29+
30+
-- Test non-ASCII symbols
31+
SELECT * from ts_parse('tsparser', 'аб_вгд 12_абв 12-абв абв.рф абв2.рф');

Diff for: tsparser.c

+28-125
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,9 @@ typedef struct TParser
249249
/* string and position information */
250250
char *str; /* multibyte string */
251251
int lenstr; /* length of mbstring */
252-
#ifdef USE_WIDE_UPPER_LOWER
253252
wchar_t *wstr; /* wide character string */
254253
pg_wchar *pgwstr; /* wide character string for C-locale */
255254
bool usewide;
256-
#endif
257255

258256
/* State of parse */
259257
int charmaxlen;
@@ -302,8 +300,6 @@ TParserInit(char *str, int len)
302300
prs->str = str;
303301
prs->lenstr = len;
304302

305-
#ifdef USE_WIDE_UPPER_LOWER
306-
307303
/*
308304
* Use wide char code only when max encoding length > 1.
309305
*/
@@ -331,7 +327,6 @@ TParserInit(char *str, int len)
331327
}
332328
else
333329
prs->usewide = false;
334-
#endif
335330

336331
prs->state = newTParserPosition(NULL);
337332
prs->state->state = TPS_Base;
@@ -368,15 +363,12 @@ TParserCopyInit(const TParser *orig)
368363
prs->charmaxlen = orig->charmaxlen;
369364
prs->str = orig->str + orig->state->posbyte;
370365
prs->lenstr = orig->lenstr - orig->state->posbyte;
371-
372-
#ifdef USE_WIDE_UPPER_LOWER
373366
prs->usewide = orig->usewide;
374367

375368
if (orig->pgwstr)
376369
prs->pgwstr = orig->pgwstr + orig->state->poschar;
377370
if (orig->wstr)
378371
prs->wstr = orig->wstr + orig->state->poschar;
379-
#endif
380372

381373
prs->state = newTParserPosition(NULL);
382374
prs->state->state = TPS_Base;
@@ -401,12 +393,10 @@ TParserClose(TParser *prs)
401393
prs->state = ptr;
402394
}
403395

404-
#ifdef USE_WIDE_UPPER_LOWER
405396
if (prs->wstr)
406397
pfree(prs->wstr);
407398
if (prs->pgwstr)
408399
pfree(prs->pgwstr);
409-
#endif
410400

411401
#ifdef WPARSER_TRACE
412402
fprintf(stderr, "closing parser\n");
@@ -445,96 +435,45 @@ TParserCopyClose(TParser *prs)
445435
* - if locale is C then we use pgwstr instead of wstr.
446436
*/
447437

448-
#ifdef USE_WIDE_UPPER_LOWER
449-
450-
#define p_iswhat(type) \
438+
#define p_iswhat(type, nonascii) \
439+
\
451440
static int \
452-
p_is##type(TParser *prs) { \
453-
Assert( prs->state ); \
454-
if ( prs->usewide ) \
441+
p_is##type(TParser *prs) \
442+
{ \
443+
Assert(prs->state); \
444+
if (prs->usewide) \
455445
{ \
456-
if ( prs->pgwstr ) \
446+
if (prs->pgwstr) \
457447
{ \
458448
unsigned int c = *(prs->pgwstr + prs->state->poschar); \
459-
if ( c > 0x7f ) \
460-
return 0; \
461-
return is##type( c ); \
449+
if (c > 0x7f) \
450+
return nonascii; \
451+
return is##type(c); \
462452
} \
463-
return isw##type( *( prs->wstr + prs->state->poschar ) ); \
453+
return isw##type(*(prs->wstr + prs->state->poschar)); \
464454
} \
465-
\
466-
return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
467-
} \
455+
return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
456+
} \
468457
\
469458
static int \
470-
p_isnot##type(TParser *prs) { \
459+
p_isnot##type(TParser *prs) \
460+
{ \
471461
return !p_is##type(prs); \
472462
}
473463

474-
static int
475-
p_isalnum(TParser *prs)
476-
{
477-
Assert(prs->state);
478-
479-
if (prs->usewide)
480-
{
481-
if (prs->pgwstr)
482-
{
483-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
484-
485-
/*
486-
* any non-ascii symbol with multibyte encoding with C-locale is
487-
* an alpha character
488-
*/
489-
if (c > 0x7f)
490-
return 1;
491-
492-
return isalnum(c);
493-
}
494-
495-
return iswalnum(*(prs->wstr + prs->state->poschar));
496-
}
497-
498-
return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
499-
}
500-
static int
501-
p_isnotalnum(TParser *prs)
502-
{
503-
return !p_isalnum(prs);
504-
}
505-
506-
static int
507-
p_isalpha(TParser *prs)
508-
{
509-
Assert(prs->state);
510-
511-
if (prs->usewide)
512-
{
513-
if (prs->pgwstr)
514-
{
515-
unsigned int c = *(prs->pgwstr + prs->state->poschar);
516-
517-
/*
518-
* any non-ascii symbol with multibyte encoding with C-locale is
519-
* an alpha character
520-
*/
521-
if (c > 0x7f)
522-
return 1;
523-
524-
return isalpha(c);
525-
}
526-
527-
return iswalpha(*(prs->wstr + prs->state->poschar));
528-
}
529-
530-
return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
531-
}
532-
533-
static int
534-
p_isnotalpha(TParser *prs)
535-
{
536-
return !p_isalpha(prs);
537-
}
464+
/*
465+
* In C locale with a multibyte encoding, any non-ASCII symbol is considered
466+
* an alpha character, but not a member of other char classes.
467+
*/
468+
p_iswhat(alnum, 1)
469+
p_iswhat(alpha, 1)
470+
p_iswhat(digit, 0)
471+
p_iswhat(lower, 0)
472+
p_iswhat(print, 0)
473+
p_iswhat(punct, 0)
474+
p_iswhat(space, 0)
475+
p_iswhat(upper, 0)
476+
p_iswhat(xdigit, 0)
538477

539478
/* p_iseq should be used only for ascii symbols */
540479

@@ -544,39 +483,6 @@ p_iseq(TParser *prs, char c)
544483
Assert(prs->state);
545484
return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
546485
}
547-
#else /* USE_WIDE_UPPER_LOWER */
548-
549-
#define p_iswhat(type) \
550-
static int \
551-
p_is##type(TParser *prs) { \
552-
Assert( prs->state ); \
553-
return is##type( (unsigned char)*( prs->str + prs->state->posbyte ) ); \
554-
} \
555-
\
556-
static int \
557-
p_isnot##type(TParser *prs) { \
558-
return !p_is##type(prs); \
559-
}
560-
561-
562-
static int
563-
p_iseq(TParser *prs, char c)
564-
{
565-
Assert(prs->state);
566-
return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
567-
}
568-
569-
p_iswhat(alnum)
570-
p_iswhat(alpha)
571-
#endif /* USE_WIDE_UPPER_LOWER */
572-
573-
p_iswhat(digit)
574-
p_iswhat(lower)
575-
p_iswhat(print)
576-
p_iswhat(punct)
577-
p_iswhat(space)
578-
p_iswhat(upper)
579-
p_iswhat(xdigit)
580486

581487
static int
582488
p_isEOF(TParser *prs)
@@ -793,8 +699,6 @@ p_isspecial(TParser *prs)
793699
if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
794700
return 1;
795701

796-
#ifdef USE_WIDE_UPPER_LOWER
797-
798702
/*
799703
* Unicode characters in the 'Mark, Spacing Combining' category. These
800704
* characters are not alphabetic, but they do not break words either.
@@ -1058,7 +962,6 @@ p_isspecial(TParser *prs)
1058962
StopHigh = StopMiddle;
1059963
}
1060964
}
1061-
#endif
1062965

1063966
return 0;
1064967
}

0 commit comments

Comments
 (0)