123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296 |
- #include <rtgui/rtgui_xml.h>
/*
 * Parser states.
 *
 * These values fill the `state` and `next_state` columns of the transition
 * table and are stored as the parser's current state. STAT_START has to stay
 * 0: rtgui_xml_create() zero-fills a new parser, so 0 is the state every
 * parser begins in. STAT_ERROR doubles as the end-of-table marker.
 */
enum {
    STAT_START             = 0,   /* base state; also the initial state */
    STAT_TEXT              = 1,   /* inside character data */
    STAT_START_TAG         = 2,   /* just after '<' */
    STAT_START_TAGNAME     = 3,   /* inside the name of a start tag */
    STAT_START_TAGNAME_END = 4,   /* after the start tag name, before '>' */
    STAT_END_TAG           = 5,   /* just after "</" */
    STAT_END_TAGNAME       = 6,   /* inside the name of an end tag */
    STAT_END_TAGNAME_END   = 7,   /* after the end tag name, before '>' */
    STAT_EMPTY_TAG         = 8,   /* after the '/' of a self-closing tag */
    STAT_SPACE             = 9,   /* run of whitespace */
    STAT_ATTR_NAME         = 10,  /* inside an attribute name */
    STAT_ATTR_NAME_END     = 11,  /* after an attribute name, before '=' */
    STAT_ATTR_VAL          = 12,  /* after '=', waiting for the opening quote */
    STAT_ATTR_VAL2         = 13,  /* inside a quoted attribute value */
    STAT_ERROR             = 14   /* error state / end-of-table sentinel */
};
/*
 * Character classes a transition-table row can require of the current input
 * character. The set is deliberately small; extend it only if the grammar
 * handled by the table really needs more.
 */
enum {
    CLASS_TYPE_NONE        = 0,  /* matches nothing; used by the sentinel row */
    CLASS_TYPE_LEFT_ANGLE  = 1,  /* '<' */
    CLASS_TYPE_SLASH       = 2,  /* '/' */
    CLASS_TYPE_RIGHT_ANGLE = 3,  /* '>' */
    CLASS_TYPE_EQUALS      = 4,  /* '=' */
    CLASS_TYPE_QUOTE       = 5,  /* '"' (double quote only) */
    CLASS_TYPE_LETTERS     = 6,  /* a-z, A-Z, 0-9 and '.', as classified by
                                    rtgui_xml_parse() */
    CLASS_TYPE_SPACE       = 7,  /* whitespace */
    CLASS_TYPE_ANY         = 8   /* any character, including every class
                                    above */
};
- /* xml state transition table */
- struct rtgui_xml_state
- {
- rt_uint8_t state;
- rt_uint8_t class_type;
- rt_uint8_t next_state;
- rt_uint8_t event;
- };
- /* Note: States must be grouped in match order AND grouped together! */
- static const struct rtgui_xml_state RTGUI_XML_STATES [] = {
- /* [0-2] starting state, which also serves as the default state in case
- of error */
- { STAT_START, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
- { STAT_START, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_NONE },
- { STAT_START, CLASS_TYPE_ANY, STAT_TEXT, EVENT_COPY },
- /* [3-5] space state handles linear white space */
- { STAT_SPACE, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
- { STAT_SPACE, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_TEXT },
- { STAT_SPACE, CLASS_TYPE_ANY, STAT_TEXT, EVENT_COPY },
- /* [6-8] handle start tag */
- { STAT_START_TAG, CLASS_TYPE_LETTERS, STAT_START_TAGNAME, EVENT_COPY },
- { STAT_START_TAG, CLASS_TYPE_SLASH, STAT_END_TAG, EVENT_COPY },
- /* below added since some individuals get a little carried away with
- spacing around tag names, e.g. < tag > */
- { STAT_START_TAG, CLASS_TYPE_SPACE, STAT_START_TAG, EVENT_NONE },
- /* [9-12] handle start tag name */
- { STAT_START_TAGNAME, CLASS_TYPE_LETTERS, STAT_START_TAGNAME, EVENT_NONE },
- { STAT_START_TAGNAME, CLASS_TYPE_SPACE, STAT_START_TAGNAME_END, EVENT_START },
- /* below added for tags without any space between tag and ending
- slash, e.g., <br/> */
- { STAT_START_TAGNAME, CLASS_TYPE_SLASH, STAT_EMPTY_TAG, EVENT_END },
- { STAT_START_TAGNAME, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_START },
- /* [13-16] handle start tag name end */
- { STAT_START_TAGNAME_END, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
- /* below added to handle additional space in between attribute value
- pairs in start tags, e.g., <tag attr="2" attr2="test" > */
- { STAT_START_TAGNAME_END, CLASS_TYPE_SPACE, STAT_START_TAGNAME_END, EVENT_NONE },
- { STAT_START_TAGNAME_END, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_START },
- /* below supports tags that are self-closing, e.g., <br /> */
- { STAT_START_TAGNAME_END, CLASS_TYPE_SLASH, STAT_EMPTY_TAG, EVENT_COPY },
- /* [17] handle empty tags, e.g., <br /> */
- { STAT_EMPTY_TAG, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_END },
- /* [18] handle end tag, e.g., <tag /> */
- { STAT_END_TAG, CLASS_TYPE_LETTERS, STAT_END_TAGNAME, EVENT_NONE },
- /* [19-21] handle end tag name */
- { STAT_END_TAGNAME, CLASS_TYPE_LETTERS, STAT_END_TAGNAME, EVENT_NONE },
- { STAT_END_TAGNAME, CLASS_TYPE_RIGHT_ANGLE, STAT_START, EVENT_END },
- /* below adds support for spaces at the end of an end tag (before
- closing bracket) */
- { STAT_END_TAGNAME, CLASS_TYPE_SPACE, STAT_END_TAGNAME_END, EVENT_END },
- /* [22] handle ending of end tag name */
- { STAT_END_TAGNAME_END, CLASS_TYPE_SPACE, STAT_END_TAGNAME_END, EVENT_NONE },
- { STAT_END_TAGNAME_END, CLASS_TYPE_RIGHT_ANGLE,STAT_START, EVENT_NONE },
- /* [23-25] handle text */
- { STAT_TEXT, CLASS_TYPE_SPACE, STAT_SPACE, EVENT_NONE },
- { STAT_TEXT, CLASS_TYPE_LEFT_ANGLE, STAT_START_TAG, EVENT_TEXT },
- { STAT_TEXT, CLASS_TYPE_ANY, STAT_TEXT, EVENT_NONE },
- /* [26-30] handle attribute names */
- { STAT_ATTR_NAME, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
- /* below add support for space before the equals sign, e.g, <tag
- attr ="2"> */
- { STAT_ATTR_NAME, CLASS_TYPE_SPACE, STAT_ATTR_NAME_END, EVENT_NAME },
- { STAT_ATTR_NAME, CLASS_TYPE_EQUALS, STAT_ATTR_VAL, EVENT_NAME },
- /* [31-33] attribute name end */
- { STAT_ATTR_NAME_END, CLASS_TYPE_SPACE, STAT_ATTR_NAME_END, EVENT_NONE },
- { STAT_ATTR_NAME_END, CLASS_TYPE_LETTERS, STAT_ATTR_NAME, EVENT_COPY },
- { STAT_ATTR_NAME_END, CLASS_TYPE_EQUALS, STAT_ATTR_VAL, EVENT_NONE },
- /* [34-35] handle attribute values, initial quote and spaces */
- { STAT_ATTR_VAL, CLASS_TYPE_QUOTE, STAT_ATTR_VAL2, EVENT_NONE },
- /* below handles initial spaces before quoted attribute value */
- { STAT_ATTR_VAL, CLASS_TYPE_SPACE, STAT_ATTR_VAL, EVENT_NONE },
- /* [36-37] handle actual attribute values */
- { STAT_ATTR_VAL2, CLASS_TYPE_QUOTE, STAT_START_TAGNAME_END, EVENT_VAL },
- { STAT_ATTR_VAL2, CLASS_TYPE_LETTERS, STAT_ATTR_VAL2, EVENT_COPY },
- { STAT_ATTR_VAL2, CLASS_TYPE_SLASH, STAT_ATTR_VAL2, EVENT_NONE },
- /* End of table marker */
- { STAT_ERROR, CLASS_TYPE_NONE, STAT_ERROR, EVENT_NONE }
- };
- struct rtgui_xml
- {
- /* event handler */
- rtgui_xml_event_handler_t event_handler;
- void* user;
- char* buffer; /* xml buffer */
- rt_size_t buffer_size; /* buffer size */
- rt_size_t position; /* current position in buffer */
- rt_uint16_t state, event; /* current state and event */
- rt_bool_t copy; /* copy text into tmp buffer */
- rt_bool_t halt; /* halt parsing of document */
- };
- rtgui_xml_t* rtgui_xml_create(rt_size_t buffer_size, rtgui_xml_event_handler_t handler,
- void* user)
- {
- rtgui_xml_t* xml = (rtgui_xml_t*) rtgui_malloc(sizeof(struct rtgui_xml));
- rt_memset(xml, 0, sizeof(rtgui_xml_t));
- xml->event_handler = handler;
- xml->user = user;
- /* create buffer */
- xml->buffer_size = buffer_size;
- xml->buffer = (char*)rtgui_malloc(xml->buffer_size);
- return xml;
- }
- void rtgui_xml_destroy(rtgui_xml_t* xml)
- {
- if(xml)
- {
- rtgui_free(xml->buffer);
- rtgui_free(xml);
- }
- }
- const char* rtgui_xml_event_str(rt_uint8_t event)
- {
- switch(event)
- {
- case EVENT_START:
- return "start tag";
- break;
- case EVENT_END:
- return "end tag";
- break;
- case EVENT_TEXT:
- return "text";
- break;
- case EVENT_NAME:
- return "attr name";
- break;
- case EVENT_VAL:
- return "attr val";
- break;
- case EVENT_END_DOC:
- return "end document";
- break;
- default:
- break;
- }
- return "err";
- }
- int rtgui_xml_parse(rtgui_xml_t* xml, const char* buf, rt_size_t len)
- {
- int i, j, c, match;
- #define is_space(ch) \
- ((rt_uint32_t)(ch - 9) < 5u || ch == ' ')
- #define is_alpha(ch) \
- ((rt_uint32_t)((ch | 0x20) - 'a') < 26u)
- #define is_digit(ch) \
- ((rt_uint32_t)(ch - '0') < 10u)
- #define is_letters(ch) \
- (is_alpha(ch) || is_digit(ch) || (ch == '.'))
- for(i=0; i<len; i++)
- {
- if(xml->halt) break;
- c = buf[i] & 0xff;
- /* search in state table */
- for(j=0, match = 0; RTGUI_XML_STATES[j].state != STAT_ERROR; j++)
- {
- if(RTGUI_XML_STATES[j].state != xml->state)
- continue;
- switch(RTGUI_XML_STATES[j].class_type)
- {
- case CLASS_TYPE_LETTERS:
- match = is_letters(c);
- break;
- case CLASS_TYPE_LEFT_ANGLE:
- match = (c == '<');
- break;
- case CLASS_TYPE_SLASH:
- match = (c == '/');
- break;
- case CLASS_TYPE_RIGHT_ANGLE:
- match = (c == '>');
- break;
- case CLASS_TYPE_EQUALS:
- match = (c == '=');
- break;
- case CLASS_TYPE_QUOTE:
- match = (c == '"');
- break;
- case CLASS_TYPE_SPACE:
- match = is_space(c);
- break;
- case CLASS_TYPE_ANY:
- match = 1;
- break;
- default:
- break;
- }
- /* we matched a character class */
- if(match)
- {
- if(RTGUI_XML_STATES[j].event == EVENT_COPY)
- {
- xml->copy = RT_TRUE;
- }
- else if(RTGUI_XML_STATES[j].event != EVENT_NONE)
- {
- if(xml->copy == RT_TRUE)
- {
- /* basically we are guaranteed never to have an event of
- type EVENT_COPY or EVENT_NONE here. */
- xml->event = RTGUI_XML_STATES[j].event;
- xml->buffer[xml->position] = 0; /* make a string */
- if(!xml->event_handler(RTGUI_XML_STATES[j].event,
- xml->buffer, xml->position ,
- xml->user))
- {
- xml->halt = 1; /* stop parsing from here out */
- }
- xml->position = 0;
- xml->copy = RT_FALSE;
- }
- }
- if(xml->copy == RT_TRUE)
- {
- /* check to see if we have room; one less for trailing
- nul */
- if(xml->position < xml->buffer_size-1)
- {
- xml->buffer[xml->position] = buf[i];
- xml->position++;
- }
- }
- xml->state = RTGUI_XML_STATES[j].next_state; /* change state */
- break; /* break out of loop though state search */
- }
- }
- }
- return !xml->halt;
- }
|