Skip to content

Commit 36ac726

Browse files
committed
Fix embedded newline scanning
Fixes NLnetLabs#109.
1 parent 398454b commit 36ac726

File tree

6 files changed

+44
-50
lines changed

6 files changed

+44
-50
lines changed

include/zone.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ struct zone_file {
255255
// enough to hold every token for a single read + terminators
256256
struct { const char **head, **tail, *tape[ZONE_TAPE_SIZE + 2]; } fields;
257257
struct { const char **head, **tail, *tape[ZONE_TAPE_SIZE + 1]; } delimiters;
258-
struct { uint16_t *head, *tail, tape[ZONE_TAPE_SIZE + 1]; } lines;
258+
struct { uint16_t *head, *tail, tape[ZONE_TAPE_SIZE + 1]; } newlines;
259259
};
260260

261261
typedef struct zone_parser zone_parser_t;

src/fallback/scanner.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,14 @@ static really_inline const char *scan_quoted(
4242
if ((parser->file->state.is_escaped = (++start == end)))
4343
break;
4444
assert(start < end);
45-
*parser->file->lines.tail += (*start == '\n');
45+
*parser->file->newlines.tail += (*start == '\n');
4646
start++;
4747
} else if (*start == '\"') {
4848
parser->file->state.in_quoted = 0;
4949
*parser->file->delimiters.tail++ = start;
5050
return ++start;
5151
} else {
52-
*parser->file->lines.tail += (*start == '\n');
52+
*parser->file->newlines.tail += (*start == '\n');
5353
start++;
5454
}
5555
}
@@ -72,7 +72,7 @@ static really_inline const char *scan_contiguous(
7272
if ((parser->file->state.is_escaped = (++start == end)))
7373
break;
7474
assert(start < end);
75-
parser->file->lines.tail[0] += (*start == '\n');
75+
parser->file->newlines.tail[0] += (*start == '\n');
7676
}
7777
start++;
7878
} else {
@@ -105,9 +105,9 @@ static really_inline void scan(
105105
*parser->file->fields.tail++ = start;
106106
start = scan_contiguous(parser, start, end);
107107
} else if (code == LINE_FEED) {
108-
if (*parser->file->lines.tail) {
108+
if (*parser->file->newlines.tail) {
109109
*parser->file->fields.tail++ = line_feed;
110-
parser->file->lines.tail++;
110+
parser->file->newlines.tail++;
111111
} else {
112112
*parser->file->fields.tail++ = start;
113113
}

src/generic/parser.h

+9-9
Original file line numberDiff line numberDiff line change
@@ -320,9 +320,9 @@ static really_inline int32_t advance(parser_t *parser)
320320
int32_t code;
321321

322322
// save embedded line count (quoted or escaped newlines)
323-
parser->file->lines.tape[0] = parser->file->lines.tail[0];
324-
parser->file->lines.head = parser->file->lines.tape;
325-
parser->file->lines.tail = parser->file->lines.tape;
323+
parser->file->newlines.tape[0] = parser->file->newlines.tail[0];
324+
parser->file->newlines.head = parser->file->newlines.tape;
325+
parser->file->newlines.tail = parser->file->newlines.tape;
326326
// restore non-terminated token (partial quoted or contiguous)
327327
parser->file->fields.tape[0] = parser->file->fields.tail[1];
328328
parser->file->fields.head = parser->file->fields.tape;
@@ -436,7 +436,7 @@ static never_inline void maybe_take(parser_t *parser, token_t *token)
436436
return;
437437
} else if (token->code == LINE_FEED) {
438438
if (unlikely(token->data == line_feed))
439-
parser->file->span += *parser->file->lines.head++;
439+
parser->file->span += *parser->file->newlines.head++;
440440
parser->file->span++;
441441
parser->file->fields.head++;
442442
if (unlikely(parser->file->grouped))
@@ -493,7 +493,7 @@ static really_inline void take(parser_t *parser, token_t *token)
493493
return;
494494
} else if (token->code == LINE_FEED) {
495495
if (unlikely(token->data == line_feed))
496-
parser->file->span += *parser->file->lines.head++;
496+
parser->file->span += *parser->file->newlines.head++;
497497
parser->file->span++;
498498
parser->file->fields.head++;
499499
if (unlikely(parser->file->grouped))
@@ -606,7 +606,7 @@ static never_inline int32_t maybe_take_contiguous(
606606
parser->file->fields.head++;
607607
} else if (token->code == LINE_FEED) {
608608
if (token->data == line_feed)
609-
parser->file->span += *parser->file->lines.head++;
609+
parser->file->span += *parser->file->newlines.head++;
610610
parser->file->span++;
611611
if (!parser->file->grouped)
612612
SYNTAX_ERROR(parser, token, "Missing %s in %s", NAME(field), NAME(type));
@@ -707,7 +707,7 @@ static never_inline int32_t maybe_take_quoted(
707707
parser->file->fields.head++;
708708
} else if (token->code == LINE_FEED) {
709709
if (token->data == line_feed)
710-
parser->file->span += *parser->file->lines.head++;
710+
parser->file->span += *parser->file->newlines.head++;
711711
parser->file->span++;
712712
if (!parser->file->grouped)
713713
SYNTAX_ERROR(parser, token, "Missing %s in %s", NAME(field), NAME(type));
@@ -812,7 +812,7 @@ static never_inline int32_t maybe_take_contiguous_or_quoted(
812812
parser->file->fields.head++;
813813
} else if (token->code == LINE_FEED) {
814814
if (token->data == line_feed)
815-
parser->file->span += *parser->file->lines.head++;
815+
parser->file->span += *parser->file->newlines.head++;
816816
parser->file->span++;
817817
if (!parser->file->grouped)
818818
SYNTAX_ERROR(parser, token, "Missing %s in %s", NAME(field), NAME(type));
@@ -905,7 +905,7 @@ static never_inline int32_t maybe_take_delimiter(
905905
for (;;) {
906906
if (likely(token->code == LINE_FEED)) {
907907
if (unlikely(token->data == line_feed))
908-
parser->file->span += *parser->file->lines.head++;
908+
parser->file->span += *parser->file->newlines.head++;
909909
if (unlikely(parser->file->grouped)) {
910910
parser->file->span++;
911911
parser->file->fields.head++;

src/generic/scanner.h

+22-26
Original file line numberDiff line numberDiff line change
@@ -209,44 +209,40 @@ static really_inline void write_indexes(parser_t *parser, const block_t *block,
209209
uint64_t delimiter_count = count_ones(delimiters);
210210
// bulk of the data are contiguous and quoted character strings. field and
211211
// delimiter counts are therefore (mostly) equal. select the greater number
212-
// and write out indexes using a single loop, (hopefully) leveraging
213-
// superscalar properties of modern CPUs
212+
// and write out indexes in a single loop leveraging superscalar properties
213+
// of modern CPUs
214214
uint64_t count = field_count;
215215
if (delimiter_count > field_count)
216216
count = delimiter_count;
217217

218-
uint64_t newline = block->newline;
219-
const uint64_t in_string = block->contiguous | block->in_quoted;
220-
221218
// take slow path if (escaped) newlines appear in contiguous or quoted
222219
// character strings. edge case, but must be supported and handled in the
223220
// scanner for ease of use and to accommodate parallel processing in the
224221
// parser. escaped newlines may have been present in the last block
225-
if (unlikely(parser->file->lines.tail[0] || (newline & in_string))) {
226-
// FIXME: test logic properly, likely eligible for simplification
227-
for (count=0; count < field_count; count++) {
228-
const uint64_t field = -fields & fields;
229-
if (field & newline) {
230-
parser->file->lines.tail++;
231-
parser->file->fields.tail[count] = line_feed;
232-
newline &= -field;
222+
uint64_t newlines = block->newline & (block->contiguous | block->in_quoted);
223+
224+
if (unlikely(*parser->file->newlines.tail || newlines)) {
225+
for (uint64_t i=0; i < count; i++) {
226+
const uint64_t field = fields & -fields;
227+
const uint64_t delimiter = delimiters & -delimiters;
228+
if (field & block->newline) {
229+
*parser->file->newlines.tail += count_ones(newlines & (field - 1));
230+
if (*parser->file->newlines.tail) {
231+
parser->file->fields.tail[i] = line_feed;
232+
parser->file->newlines.tail++;
233+
} else {
234+
parser->file->fields.tail[i] = base + trailing_zeroes(field);
235+
}
236+
newlines &= -field;
233237
} else {
234-
// count newlines here so number of newlines remains correct if last
235-
// token is start of contiguous or quoted and index must be reset
236-
*parser->file->lines.tail += count_ones(newline & ~(-field));
237-
parser->file->fields.tail[count] = base + trailing_zeroes(field);
238-
newline &= -field;
238+
parser->file->fields.tail[i] = base + trailing_zeroes(field);
239239
}
240-
parser->file->delimiters.tail[count] = base + trailing_zeroes(delimiters);
241-
fields = clear_lowest_bit(fields);
242-
delimiters = clear_lowest_bit(delimiters);
243-
}
244-
245-
for (; count < delimiter_count; count++) {
246-
parser->file->delimiters.tail[count] = base + trailing_zeroes(delimiters);
247-
delimiters = clear_lowest_bit(delimiters);
240+
parser->file->delimiters.tail[i] = base + trailing_zeroes(delimiter);
241+
fields &= ~field;
242+
delimiters &= ~delimiter;
248243
}
249244

245+
*parser->file->newlines.tail += count_ones(newlines);
250246
parser->file->fields.tail += field_count;
251247
parser->file->delimiters.tail += delimiter_count;
252248
} else {

src/zone.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ static void initialize_file(
260260
file->fields.head = file->fields.tail = file->fields.tape;
261261
file->delimiters.tape[0] = NULL;
262262
file->delimiters.head = file->delimiters.tail = file->delimiters.tape;
263-
file->lines.tape[0] = 0;
264-
file->lines.head = file->lines.tail = file->lines.tape;
263+
file->newlines.tape[0] = 0;
264+
file->newlines.head = file->newlines.tail = file->newlines.tape;
265265
}
266266

267267
nonnull_all

tests/syntax.c

+5-7
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,10 @@ static int32_t newline_test_accept_rr(
7171
/*!cmocka */
7272
void newlines(void **state)
7373
{
74-
#if 0
75-
static const char embedded_lf_text[] =
74+
static const char quoted_lf_text[] =
7675
PAD("1. TXT \"foo\nbar\n\"\n2. TXT \"foobar\"");
77-
// >> do the same thing for contiguous
78-
#endif
76+
static const char escaped_lf_text[] =
77+
PAD("1. TXT foo\\\nbar\\\n\n2. TXT \"foobar\"");
7978
static const char grouped_lf_text[] =
8079
PAD("1. TXT (\nfoo\nbar\n)\n2. TXT \"foobar\"");
8180
static const char plain_lf_text[] =
@@ -88,9 +87,8 @@ void newlines(void **state)
8887
static const uint8_t origin[] = { 0 };
8988

9089
static const struct newline_test tests[] = {
91-
#if 0
92-
{ embedded_lf_text, { 1, 4 } },
93-
#endif
90+
{ quoted_lf_text, { 1, 4 } },
91+
{ escaped_lf_text, { 1, 4 } },
9492
{ grouped_lf_text, { 1, 5 } },
9593
{ plain_lf_text, { 1, 2 } },
9694
{ control_lf_text, { 2, 3 } },

0 commit comments

Comments
 (0)