From 78ce36f1d9242eea32b06b972fa79f9f34f4363a Mon Sep 17 00:00:00 2001
From: zhangjipeng
Date: Mon, 17 Feb 2025 21:53:00 +0800
Subject: [PATCH] add xml tokenizer

---
 ext/common/psx_array.h    |   3 +
 ext/svg/psx_xml_token.cpp | 384 +++++++++++++++++++++++++++++++++++++-
 ext/svg/psx_xml_token.h   |   2 +-
 3 files changed, 386 insertions(+), 3 deletions(-)

diff --git a/ext/common/psx_array.h b/ext/common/psx_array.h
index e1b860b..87ac484 100644
--- a/ext/common/psx_array.h
+++ b/ext/common/psx_array.h
@@ -159,6 +159,9 @@ static INLINE bool psx_array_append(psx_array_t* array, const void* value)
 #define psx_array_get(array, i, type) \
     (type*)psx_array_at((array), (i))

+#define psx_array_get_last(array, type) \
+    psx_array_get((array), psx_array_size((array)) - 1, type)
+
 #define psx_array_push_back(array, value) \
     psx_array_append((array), (void*)&(value))

diff --git a/ext/svg/psx_xml_token.cpp b/ext/svg/psx_xml_token.cpp
index c0578ca..b7ed520 100644
--- a/ext/svg/psx_xml_token.cpp
+++ b/ext/svg/psx_xml_token.cpp
@@ -55,7 +55,7 @@ enum {
 enum {
     NO_QUOTE = 0,
     SINGLE_QUOTE = 1,
-    SDOUBLE_QUOTE = 2,
+    DOUBLE_QUOTE = 2,
 };

 enum {
@@ -85,8 +85,388 @@ typedef struct {
     const char* end;
 } xml_token_state_t;

-bool psx_xml_tokenizer(const char* xml_data, uint32_t data_len, xml_token_process, void* data)
+// Bring a token to its empty state (content type, no span, no current attribute) and set up its attribute array.
+static INLINE void _psx_token_init(psx_xml_token_t* token)
 {
+    token->flags = 0;
+    token->start = NULL;
+    token->end = NULL;
+    token->type = PSX_XML_CONTENT;
+    token->cur_attr = NULL;
+    psx_array_init_type(&token->attrs, psx_xml_token_attr_t);
+}
+
+// Reset a token to the same empty state for reuse; the attribute array is cleared, not re-initialized.
+static INLINE void _psx_token_clear(psx_xml_token_t* token)
+{
+    token->flags = 0;
+    token->start = NULL;
+    token->end = NULL;
+    token->type = PSX_XML_CONTENT;
+    token->cur_attr = NULL;
+    psx_array_clear(&token->attrs);
+}
+
+// Append one zero-filled attribute slot to token->attrs and return it; NULL when psx_array_append reports failure.
+static INLINE psx_xml_token_attr_t* _create_xml_attr(psx_xml_token_t* token)
+{
+    if (!psx_array_append(&token->attrs, NULL)) {
+        return NULL;
+    }
+
+    psx_xml_token_attr_t* attr = psx_array_get_last(&token->attrs, psx_xml_token_attr_t);
+    memset(attr, 0, sizeof(psx_xml_token_attr_t));
+    return attr;
+}
+
+// Deliver a non-empty token to cb and clear it; a token with no start or zero length is skipped and reports true.
+static INLINE bool _xml_token_process(psx_xml_token_t* token, xml_token_process cb, void* data)
+{
+    if (!token->start || TOKEN_LEN(token) == 0) {
+        return true;
+    }
+
+    bool ret = cb(data, token);
+    _psx_token_clear(token);
+    return ret;
+}
+
+// Set the given flag bit(s) in the parser state.
+static INLINE void _psx_set_state(xml_token_state_t* state, uint32_t bit)
+{
+    state->flags |= bit;
+}
+
+// Clear the given flag bit(s) in the parser state.
+static INLINE void _psx_clear_state(xml_token_state_t* state, uint32_t bit)
+{
+    state->flags &= ~bit;
+}
+
+// True when any of the given flag bit(s) is set.
+static INLINE bool _psx_is_state(xml_token_state_t* state, uint32_t bit)
+{
+    return (state->flags & bit) != 0;
+}
+
+// Replace the tag sub-state stored in the IN_TAG_MASK field with bit.
+static INLINE void _psx_set_tag_state(xml_token_state_t* state, uint32_t bit)
+{
+    state->flags = (state->flags & ~IN_TAG_MASK) | bit;
+}
+
+// Store the quote kind in the IN_QUOTE_MASK field. The shift must agree with the ">> 3" QUOTE_VALUE reads it with.
+// NOTE(review): assumes IN_QUOTE_MASK is (3 << 3) - confirm against the enum definition.
+static INLINE void _psx_set_quote_state(xml_token_state_t* state, uint32_t bit)
+{
+    state->flags = (state->flags & ~IN_QUOTE_MASK) | (bit << 3);
+}
+
+// True when any of the listed special-handling flags (or a tag/entity sub-state) is set.
+static INLINE bool _psx_special_handles(xml_token_state_t* state)
+{
+    return state->flags & (IN_START_TAG | IN_SEARCH | IN_TAG_MASK | IN_ENTITY_MASK | IN_COMMENT | IN_SERVER_SIDE | IN_DOCTYPE | IN_XMLINST | IN_SCRIPT);
+}
+
+// Skip an XML processing instruction through its closing "?>"; never reads at state->end (one past the buffer).
+static INLINE void _psx_proc_xml_inst(xml_token_state_t* state, psx_xml_token_t* token)
+{
+    // ignore xml inst
+    while (state->cur < state->end) {
+        char ch = *(state->cur);
+        if (ch == '>' && (*(state->cur - 1)) == '?') {
+            _psx_clear_state(state, IN_XMLINST);
+            state->cur++;
+            break;
+        }
+        state->cur++;
+    }
+}
+
+// Skip a comment through its closing "-->"; never reads at state->end (one past the buffer).
+static INLINE void _psx_proc_comment(xml_token_state_t* state, psx_xml_token_t* token)
+{
+    // ignore comment
+    while (state->cur < state->end) {
+        char ch = *(state->cur);
+        if (ch == '>' && (*(state->cur - 1)) == '-' && (*(state->cur - 2)) == '-') {
+            _psx_clear_state(state, IN_COMMENT);
+            state->cur++;
+            break;
+        }
+        state->cur++;
+    }
+}
+
+// Skip a doctype declaration through the next '>'; never reads at state->end (one past the buffer).
+static INLINE void _psx_proc_doctype(xml_token_state_t* state, psx_xml_token_t* token)
+{
+    // ignore doctype
+    while (state->cur < state->end) {
+        char ch = *(state->cur);
+        if (ch == '>') {
+            _psx_clear_state(state, IN_DOCTYPE);
+            state->cur++;
+            break;
+        }
+        state->cur++;
+    }
+}
+
+// Tag state machine: scans [cur, end) filling token with the tag name and its attributes, and delivers the token
+// once '>' is reached. Returns false when cb returns false or an attribute slot cannot be allocated.
+static INLINE bool _psx_proc_tag(xml_token_state_t* state, psx_xml_token_t* token, xml_token_process cb, void* data)
+{
+    while (state->cur < state->end) {
+        switch (state->flags & IN_TAG_MASK) {
+            case NO_TAG: {
+                if (!_xml_token_process(token, cb, data)) {
+                    return false;
+                }
+                state->cur++;
+            }
+            return true;
+            case TAG_NAME: {
+                char ch = *(state->cur);
+                if (ch == '/') {
+                    token->type = PSX_XML_END;
+                    state->cur++;
+                    if (!token->start) {
+                        token->start = state->cur;
+                    }
+                    continue;
+                } else if (ch == '>' || isspace((unsigned char)ch)) {
+                    token->end = state->cur;
+                    _psx_set_tag_state(state, ATTR_START);
+                    continue;
+                } else {
+                    if (!token->start) {
+                        token->type = PSX_XML_BEGIN;
+                        token->start = state->cur;
+                    }
+                    state->cur++;
+                    continue;
+                }
+            }
+            break;
+            case ATTR_START: {
+                char ch = *(state->cur);
+                if (!isspace((unsigned char)ch) && ch != '\'' && ch != '\"') {
+                    if (ch == '/') {
+                        token->flags |= PSX_XML_TOKEN_FLAT;
+                        state->cur++;
+                        continue;
+                    }
+                    if (ch == '>') {
+                        _psx_set_tag_state(state, NO_TAG);
+                    } else {
+                        token->cur_attr = NULL;
+                        _psx_set_tag_state(state, ATTR_NAME);
+                    }
+                    continue;
+                }
+            }
+            break;
+            case ATTR_NAME: {
+                if (!token->cur_attr) {
+                    token->cur_attr = _create_xml_attr(token);
+                    if (!token->cur_attr) {
+                        return false; // attribute allocation failed
+                    }
+                }
+                char ch = *(state->cur);
+                if (isspace((unsigned char)ch) || ch == '=' || ch == '/' || ch == '>') {
+                    token->cur_attr->name_end = state->cur;
+                    _psx_set_tag_state(state, SEARCH_EQUAL);
+                    continue;
+                } else {
+                    if (!token->cur_attr->name_start) {
+                        token->cur_attr->name_start = state->cur;
+                    }
+                    state->cur++;
+                    continue;
+                }
+            }
+            break;
+            case SEARCH_EQUAL: {
+                char ch = *(state->cur);
+                if (!isspace((unsigned char)ch) && ch != '/' && ch != '\'' && ch != '\"') {
+                    if (ch == '=') {
+                        _psx_set_tag_state(state, SEARCH_VALUE);
+                    } else {
+                        // attr name has empty value
+                        token->cur_attr = NULL;
+                        _psx_set_tag_state(state, ATTR_START);
+                        continue;
+                    }
+                }
+            }
+            break;
+            case SEARCH_VALUE: {
+                char ch = *(state->cur);
+                if (!isspace((unsigned char)ch)) {
+                    if (ch == '\'' || ch == '\"') {
+                        if (ch == '\'') {
+                            _psx_set_quote_state(state, SINGLE_QUOTE);
+                        } else {
+                            _psx_set_quote_state(state, DOUBLE_QUOTE);
+                        }
+                        _psx_set_tag_state(state, QUOTE_VALUE);
+                    } else {
+                        _psx_set_tag_state(state, VALUE);
+                        continue;
+                    }
+                }
+            }
+            break;
+            case QUOTE_VALUE: {
+                char ch = *(state->cur);
+                if ((ch == '\'' && ((state->flags & IN_QUOTE_MASK) >> 3) == SINGLE_QUOTE)
+                    || (ch == '\"' && ((state->flags & IN_QUOTE_MASK) >> 3) == DOUBLE_QUOTE)) {
+                    if (!token->cur_attr->value_start) {
+                        token->cur_attr->value_start = state->cur;
+                    }
+                    token->cur_attr->value_end = state->cur;
+                    _psx_set_quote_state(state, NO_QUOTE);
+                    _psx_set_tag_state(state, ATTR_START);
+                    continue;
+                } else {
+                    if (!token->cur_attr->value_start) {
+                        token->cur_attr->value_start = state->cur;
+                    }
+                    state->cur++;
+                    continue;
+                }
+            }
+            break;
+            case VALUE: {
+                char ch = *(state->cur);
+                if (isspace((unsigned char)ch) || ch == '>' || ch == '/') {
+                    if (!token->cur_attr->value_start) {
+                        token->cur_attr->value_start = state->cur;
+                    }
+                    token->cur_attr->value_end = state->cur;
+                    _psx_set_quote_state(state, NO_QUOTE);
+                    _psx_set_tag_state(state, ATTR_START);
+                    continue;
+                } else {
+                    if (!token->cur_attr->value_start) {
+                        token->cur_attr->value_start = state->cur;
+                    }
+                    state->cur++;
+                    continue;
+                }
+            }
+            break;
+        }
+        state->cur++;
+    }
+    return true;
+}
+
+bool psx_xml_tokenizer(const char* xml_data, uint32_t data_len, xml_token_process cb, void* data)
+{
+    psx_xml_token_t token;
+    _psx_token_init(&token);
+    xml_token_state_t state = { .flags = 0, .cur = xml_data, .end = xml_data + data_len };
+
+    while (state.cur < state.end) {
+        char ch = *(state.cur);
+        if (ch == '\r' || ch == '\n') { // skip LR character
+            state.cur++;
+            continue;
+        } else if (_psx_special_handles(&state)) {
+            if (_psx_is_state(&state, IN_START_TAG)) {
+                _psx_clear_state(&state, IN_START_TAG);
+
+                switch (ch) {
+                    case '/': // end tag
+                        _psx_set_tag_state(&state, TAG_NAME);
+                        break;
+                    case '!': {
+                        //