4 * A simple XML 1.0 processor. This handles all XML features that are used in
5 * WIM files, plus a bit more for futureproofing. It omits problematic
6 * features, such as expansion of entities other than simple escape sequences.
10 * Copyright 2023 Eric Biggers
12 * This file is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU Lesser General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your option) any
17 * This file is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
22 * You should have received a copy of the GNU Lesser General Public License
23 * along with this file; if not, see https://www.gnu.org/licenses/.
32 #include "wimlib/error.h"
33 #include "wimlib/test_support.h"
34 #include "wimlib/util.h"
35 #include "wimlib/xmlproc.h"
37 /*----------------------------------------------------------------------------*
38 * XML node utility functions *
39 *----------------------------------------------------------------------------*/
/*
 * Copy the first @len characters of @str into a freshly allocated buffer of
 * @len + 1 elements.  NOTE(review): the return-type line, the check of the
 * allocation result and the return statement are elided from this excerpt.
 */
42 tstrdupz(const tchar *str, size_t len)
/* The extra element leaves room for a terminator (presumably zeroed by CALLOC - confirm). */
44 tchar *new_str = CALLOC(len + 1, sizeof(str[0]));
47 tmemcpy(new_str, str, len);
/*
 * Allocate a node of type @type, give it private copies of the first
 * @name_len characters of @name and the first @value_len characters of @value,
 * and link it under @parent.  Callers in this file pass NULL for @parent,
 * @name or @value, so the elided lines presumably guard each of those steps
 * and handle allocation failure - confirm against the full source.
 */
51 static struct xml_node *
52 xml_new_node(struct xml_node *parent, enum xml_node_type type,
53 const tchar *name, size_t name_len,
54 const tchar *value, size_t value_len)
56 struct xml_node *node = CALLOC(1, sizeof(*node));
/* The children list must be a valid empty list before the node is used or freed. */
61 INIT_LIST_HEAD(&node->children);
63 node->name = tstrdupz(name, name_len);
68 node->value = tstrdupz(value, value_len);
73 xml_add_child(parent, node);
82 * Create a new ELEMENT node, and if @parent is non-NULL add the new node under
83 * @parent which should be another ELEMENT.
86 xml_new_element(struct xml_node *parent, const tchar *name)
/*
 * Thin wrapper around xml_new_node() for ELEMENT nodes; @name must be a
 * NUL-terminated string because its length is taken with tstrlen().  The
 * value arguments of the call are on an elided line.
 */
88 return xml_new_node(parent, XML_ELEMENT_NODE, name, tstrlen(name),
93 * Create a new ELEMENT node with an attached TEXT node, and if @parent is
94 * non-NULL add the new ELEMENT under @parent which should be another ELEMENT.
97 xml_new_element_with_text(struct xml_node *parent, const tchar *name,
100 struct xml_node *element = xml_new_element(parent, name);
/*
 * If the element was created but its text could not be attached, free the
 * element again (xml_free_node() also unlinks it from @parent) so the caller
 * never sees a half-built node.
 */
102 if (element && xml_element_set_text(element, text) != 0) {
103 xml_free_node(element);
109 /* Append @child to the children list of @parent. */
111 xml_add_child(struct xml_node *parent, struct xml_node *child)
/* Detach @child from any previous parent so it is never on two lists at once. */
113 xml_unlink_node(child); /* Shouldn't be needed, but be safe. */
/* Record the new parent, then append so @child becomes the last sibling. */
114 child->parent = parent;
115 list_add_tail(&child->sibling_link, &parent->children);
118 /* Unlink @node from its parent, if it has one. */
120 xml_unlink_node(struct xml_node *node)
/*
 * Remove @node from the sibling list it is linked on.  NOTE(review): the
 * "has a parent" guard mentioned in the comment above this function, and any
 * reset of node->parent, are on lines elided from this excerpt.
 */
123 list_del(&node->sibling_link);
/* Free every child of @parent, including each child's own subtree. */
129 xml_free_children(struct xml_node *parent)
131 struct xml_node *child, *tmp;
/*
 * The _safe iterator is required: xml_free_node() unlinks and frees the
 * current entry, so the next entry has to be fetched beforehand.
 */
133 list_for_each_entry_safe(child, tmp, &parent->children, sibling_link)
134 xml_free_node(child);
137 /* Recursively free @node, first unlinking it if needed. @node may be NULL. */
139 xml_free_node(struct xml_node *node)
/*
 * Detach the node from its parent first, then release all descendants.  The
 * NULL check promised by the comment above and the release of the node's own
 * storage are on lines elided from this excerpt.
 */
142 xml_unlink_node(node);
143 xml_free_children(node);
151 * Return the text from the first TEXT child node of @element, or NULL if no
152 * such node exists. @element may be NULL.
155 xml_element_get_text(const struct xml_node *element)
157 const struct xml_node *child;
/*
 * Walk the children in list order and stop at the first TEXT node; any
 * ATTRIBUTE or ELEMENT children before it are skipped.  The statement that
 * returns the match is on an elided line.
 */
159 xml_node_for_each_child(element, child)
160 if (child->type == XML_TEXT_NODE)
166 * Set the contents of the given @element to the given @text, replacing the
167 * entire existing contents if any.
170 xml_element_set_text(struct xml_node *element, const tchar *text)
/*
 * Build the replacement TEXT node before touching @element, so that an
 * allocation failure (WIMLIB_ERR_NOMEM) leaves the existing contents intact.
 */
172 struct xml_node *text_node = xml_new_node(NULL, XML_TEXT_NODE, NULL, 0,
173 text, tstrlen(text));
175 return WIMLIB_ERR_NOMEM;
/*
 * NOTE(review): xml_free_children() drops every child; attributes are stored
 * as children too (see xml_get_attrib()), so they are removed as well.
 */
176 xml_free_children(element);
177 xml_add_child(element, text_node);
/*
 * Append @text_len characters of @text to the contents of @element, either by
 * extending a trailing TEXT child or by adding a new TEXT child.  Returns
 * WIMLIB_ERR_NOMEM if memory cannot be allocated; the success return is on an
 * elided line.
 */
182 xml_element_append_text(struct xml_node *element,
183 const tchar *text, size_t text_len)
185 struct xml_node *last_child;
/* Only the LAST child matters: merging is valid only for directly adjacent text. */
187 if (!list_empty(&element->children) &&
189 list_last_entry(&element->children, struct xml_node,
190 sibling_link))->type == XML_TEXT_NODE) {
192 * The new TEXT would directly follow another TEXT, so simplify
193 * the tree by just appending to the existing TEXT. (This case
194 * can theoretically be reached via the use of CDATA...)
196 size_t old_len = tstrlen(last_child->value);
/* Room for the old text, the new text and one terminating element. */
197 tchar *new_value = CALLOC(old_len + text_len + 1,
198 sizeof(new_value[0]));
200 return WIMLIB_ERR_NOMEM;
201 tmemcpy(new_value, last_child->value, old_len);
202 tmemcpy(&new_value[old_len], text, text_len);
/* The old buffer is released only after the concatenated copy is complete. */
203 FREE(last_child->value);
204 last_child->value = new_value;
/* xml_new_node() links the new TEXT node under @element itself. */
207 if (!xml_new_node(element, XML_TEXT_NODE, NULL, 0, text, text_len))
208 return WIMLIB_ERR_NOMEM;
212 /* Find the attribute with the given @name on @element. */
214 xml_get_attrib(const struct xml_node *element, const tchar *name)
216 struct xml_node *child;
/*
 * Attributes are kept in the same children list as TEXT and ELEMENT nodes,
 * so filter on the node type before comparing names.  The comparison is an
 * exact tstrcmp() match.  The return statements are on elided lines.
 */
218 xml_node_for_each_child(element, child) {
219 if (child->type == XML_ATTRIBUTE_NODE &&
220 !tstrcmp(child->name, name))
226 /* Set the attribute @name=@value on the given @element. */
228 xml_set_attrib(struct xml_node *element, const tchar *name, const tchar *value)
/*
 * Create the attribute detached (parent NULL) so that a failed allocation
 * returns WIMLIB_ERR_NOMEM without modifying @element.
 */
230 struct xml_node *attrib = xml_new_node(NULL, XML_ATTRIBUTE_NODE,
232 value, tstrlen(value));
234 return WIMLIB_ERR_NOMEM;
/* Replaces an existing attribute of the same name, or appends a new one. */
235 xml_replace_child(element, attrib);
240 * Add the ELEMENT or ATTRIBUTE node @replacement under the ELEMENT @parent,
241 * replacing any node with the same type and name that already exists.
244 xml_replace_child(struct xml_node *parent, struct xml_node *replacement)
246 struct xml_node *child;
248 xml_unlink_node(replacement); /* Shouldn't be needed, but be safe. */
/* Look for an existing child with the same node type and exactly the same name. */
250 xml_node_for_each_child(parent, child) {
251 if (child->type == replacement->type &&
252 !tstrcmp(child->name, replacement->name)) {
/* Swap in place so @replacement takes over the old child's list position. */
253 list_replace(&child->sibling_link,
254 &replacement->sibling_link);
255 replacement->parent = parent;
/*
 * The old child's link has already been replaced, so mark it parentless
 * before freeing it.  NOTE(review): this presumably keeps xml_free_node()
 * from unlinking it a second time - confirm in xml_unlink_node().
 */
256 child->parent = NULL;
257 xml_free_node(child);
/* No match found: add @replacement as a new last child. */
261 xml_add_child(parent, replacement);
/*
 * Recursively duplicate @orig and all of its descendants.  The clone is
 * created without a parent.  Failure handling is only partly visible here:
 * the partially built clone is freed on the error path.
 */
265 xml_clone_tree(struct xml_node *orig)
267 struct xml_node *clone, *orig_child, *clone_child;
/* name and value may each be NULL, so tstrlen() is only called when present. */
269 clone = xml_new_node(NULL, orig->type,
270 orig->name, orig->name ? tstrlen(orig->name) : 0,
271 orig->value, orig->value ? tstrlen(orig->value) : 0);
/* Children are cloned and appended in their original order. */
274 xml_node_for_each_child(orig, orig_child) {
275 clone_child = xml_clone_tree(orig_child);
278 xml_add_child(clone, clone_child);
/* Freeing the clone also frees every child cloned so far. */
283 xml_free_node(clone);
287 /*----------------------------------------------------------------------------*
288 * XML string validation *
289 *----------------------------------------------------------------------------*/
292 * Functions that check for legal names and values in XML 1.0. These are
293 * currently slightly over-lenient, as they allow everything non-ASCII. These
294 * are also not currently used by the XML parser to reject non-well-formed
295 * documents, but rather just by the user of the XML processor (xml.c) in order
296 * to avoid introducing illegal names and values into the document.
/* Return whether @c is one of the four characters tested below: space, LF, CR or tab. */
300 is_whitespace(tchar c)
302 return c == ' ' || c == '\n' || c == '\r' || c == '\t';
/* Return whether @c may begin a name: an ASCII letter, ':' or '_', or any non-ASCII value. */
306 is_name_start_char(tchar c)
/*
 * (c & 0x7f) != c holds for every value outside 0..0x7f.  That includes
 * negative values if tchar is a signed type, so the test does not depend on
 * the signedness of tchar.
 */
308 return (c & 0x7f) != c /* overly lenient for now */ ||
309 (c >= 'A' && c <= 'Z') ||
310 (c >= 'a' && c <= 'z') ||
311 c == ':' || c == '_';
/* Return whether @c may appear after the first character of a name: a name-start character, a digit, '-' or '.'. */
315 is_name_char(tchar c)
317 return is_name_start_char(c) ||
318 (c >= '0' && c <= '9') || c == '-' || c == '.';
321 /* Allow characters used in element "paths"; see do_xml_path_walk() */
323 is_path_char(tchar c)
/* '/', '[' and ']' are the only non-name characters accepted in a path. */
325 return c == '/' || c == '[' || c == ']';
/*
 * Check that @p consists only of name characters and the path characters
 * accepted by is_path_char().  The first character is held to the stricter
 * name-start rule, which also rejects the empty string (NUL is neither a
 * name-start nor a path character).  The return statements are on elided lines.
 */
329 xml_legal_path(const tchar *p)
331 if (!is_name_start_char(*p) && !is_path_char(*p))
333 for (p = p + 1; *p; p++) {
334 if (!is_name_char(*p) && !is_path_char(*p))
341 xml_legal_value(const tchar *p)
344 if (*p < 0x20 && !is_whitespace(*p))
/*
 * Byte order mark as a NUL-terminated tchar string.  Two alternative
 * definitions follow: U+FEFF as a single wide code unit, and its UTF-8
 * encoding EF BB BF.  The preprocessor conditional that selects between them
 * is on lines elided from this excerpt.
 */
351 #define BYTE_ORDER_MARK (tchar[]){ 0xfeff, 0 }
353 #define BYTE_ORDER_MARK "\xEF\xBB\xBF"
356 /*----------------------------------------------------------------------------*
358 *----------------------------------------------------------------------------*/
/*
 * Jump to the enclosing function's 'bad' label unless @cond holds.  Wrapped in
 * do { } while (0) so the macro expands to exactly one statement and a
 * following 'else' cannot bind to the macro's internal 'if'.
 */
#define CHECK(cond) do { if (!(cond)) goto bad; } while (0)
/*
 * Advance the cursor *pp past any run of whitespace as defined by
 * is_whitespace().  The loop stops at NUL, which is not whitespace.  The loop
 * body and the write-back to *pp are on elided lines.
 */
363 skip_whitespace(const tchar **pp)
365 const tchar *p = *pp;
367 while (is_whitespace(*p))
/*
 * Test whether the text at *pp begins with the NUL-terminated string @str.
 * Callers use the result as a boolean "matched and consumed".  The return
 * statements and the cursor update are on elided lines.
 */
373 skip_string(const tchar **pp, const tchar *str)
375 const tchar *p = *pp;
376 size_t len = tstrlen(str);
/* tstrncmp() returns nonzero on mismatch, so this branch is the "no match" case. */
378 if (tstrncmp(p, str, len))
/*
 * Search forward from *pp for @str and, when it is found, leave *pp just past
 * the match.  Callers treat a false result as "terminator missing".  The
 * search itself is on elided lines.
 */
385 find_and_skip(const tchar **pp, const tchar *str)
387 const tchar *p = *pp;
/* At this point p refers to the start of the match; step over it. */
392 *pp = p + tstrlen(str);
/*
 * Skip any sequence of top-level constructs that carry no element content:
 * XML declaration and processing instructions, DOCTYPE, and comments.  Callers
 * treat a false result as a malformed document.
 */
397 skip_misc(const tchar **pp)
399 const tchar *p = *pp, *prev_p;
/*
 * In each test below, an opener that matches without its terminator being
 * found is the failure case.
 */
404 /* Discard XML declaration and top-level PIs for now. */
405 if (skip_string(&p, T("<?")) && !find_and_skip(&p, T("?>")))
407 /* Discard DOCTYPE declaration for now. */
408 if (skip_string(&p, T("<!DOCTYPE")) && !find_and_skip(&p, T(">")))
410 /* Discard top-level comments for now. */
411 if (skip_string(&p, T("<!--")) && !find_and_skip(&p, T("-->")))
/* Repeat until a full pass consumes nothing. */
413 } while (p != prev_p);
/*
 * Map a character to the escape sequence that must replace it on output.
 * xml_escape_and_puts() treats a NULL result as "no escaping needed".  The
 * body of this function is elided from this excerpt.
 */
418 static inline const tchar *
419 get_escape_seq(tchar c)
436 /* Note: 'str' must be NUL-terminated, but only 'len' chars are used. */
438 unescape_string(const tchar *str, size_t len, tchar **unescaped_ret)
440 const tchar *in_p = str;
441 tchar *unescaped, *out_p;
443 unescaped = CALLOC(len + 1, sizeof(str[0]));
445 return WIMLIB_ERR_NOMEM;
447 while (in_p < &str[len]) {
450 else if (skip_string(&in_p, T("<")))
452 else if (skip_string(&in_p, T(">")))
454 else if (skip_string(&in_p, T("&")))
456 else if (skip_string(&in_p, T("'")))
458 else if (skip_string(&in_p, T(""")))
463 if (in_p > &str[len])
465 *unescaped_ret = unescaped;
469 ERROR("Error unescaping string '%.*"TS"'", (int)len, str);
471 return WIMLIB_ERR_XML;
/* Forward declaration: parse_contents() and parse_element() call each other recursively. */
475 parse_element(const tchar **pp, struct xml_node *parent, int depth,
476 struct xml_node **node_ret);
/*
 * Parse the contents of @element (character data, CDATA sections, child
 * elements, and ignored comments and processing instructions), starting at
 * *pp and stopping at the element's end tag.  Returns WIMLIB_ERR_XML on
 * malformed input.  Other return paths are partly on elided lines.
 */
479 parse_contents(const tchar **pp, struct xml_node *element, int depth)
481 const tchar *p = *pp;
485 const tchar *raw_text = p;
/* Collect the run of character data up to the next '<'. */
488 for (; *p != '<'; p++) {
490 return WIMLIB_ERR_XML;
/* Character data may contain entities, so it is unescaped before being stored. */
493 ret = unescape_string(raw_text, p - raw_text, &text);
496 ret = xml_element_append_text(element, text,
503 break; /* Reached the end tag of @element */
504 } else if (p[1] == '?') {
505 /* Discard processing instructions for now. */
507 if (!find_and_skip(&p, T("?>")))
508 return WIMLIB_ERR_XML;
510 } else if (p[1] == '!') {
511 if (skip_string(&p, T("<![CDATA["))) {
513 if (!find_and_skip(&p, T("]]>")))
514 return WIMLIB_ERR_XML;
/* CDATA text is appended verbatim, without unescaping. */
515 ret = xml_element_append_text(element, raw_text,
520 } else if (skip_string(&p, T("<!--"))) {
521 /* Discard comments for now. */
522 if (!find_and_skip(&p, T("-->")))
523 return WIMLIB_ERR_XML;
/* Any other "<!" construct is rejected. */
526 return WIMLIB_ERR_XML;
/* Otherwise it is a child element: recurse one level deeper. */
528 ret = parse_element(&p, element, depth + 1, NULL);
/*
 * Parse one element (start tag with attributes, contents, end tag) at *pp and
 * add it under @parent.  The new node is stored in *@element_ret.  On
 * failure the partially built element is freed; the result is
 * WIMLIB_ERR_NOMEM or WIMLIB_ERR_XML.  NOTE(review): how @depth is used
 * beyond being passed to parse_contents() is on elided lines.
 */
537 parse_element(const tchar **pp, struct xml_node *parent, int depth,
538 struct xml_node **element_ret)
540 const tchar *p = *pp;
541 struct xml_node *element = NULL;
542 const tchar *name_start;
546 /* Parse the start tag. */
/* The name runs until whitespace, '>', '/' or the end of input. */
551 while (!is_whitespace(*p) && *p != '>' && *p != '/' && *p != '\0')
553 name_len = p - name_start;
555 element = xml_new_node(parent, XML_ELEMENT_NODE, name_start, name_len,
558 ret = WIMLIB_ERR_NOMEM;
561 /* Parse the attributes list within the start tag. */
562 while (is_whitespace(*p)) {
563 const tchar *attr_name_start, *attr_value_start;
564 size_t attr_name_len, attr_value_len;
/* '/' or '>' means the whitespace was trailing and no attribute follows. */
569 if (*p == '/' || *p == '>')
572 while (*p != '=' && !is_whitespace(*p) && *p != '\0')
574 attr_name_len = p - attr_name_start;
/* '=' must directly follow a non-empty attribute name. */
576 CHECK(attr_name_len > 0 && *p == '=');
580 CHECK(quote == '\'' || quote == '"');
581 attr_value_start = ++p;
/* The value extends to the matching quote character. */
582 while (*p != quote && *p != '\0')
585 attr_value_len = p - attr_value_start;
587 ret = unescape_string(attr_value_start, attr_value_len,
/* Attributes are stored as ATTRIBUTE children of the element. */
591 ret = xml_new_node(element, XML_ATTRIBUTE_NODE,
592 attr_name_start, attr_name_len,
593 attr_value, tstrlen(attr_value))
594 ? 0 : WIMLIB_ERR_NOMEM;
600 /* Closing an empty element tag */
603 /* Closing the start tag */
606 /* Parse the contents, then the end tag. */
607 ret = parse_contents(&p, element, depth);
/* The end tag must repeat the start tag's name exactly. */
614 CHECK(!tstrncmp(p, name_start, name_len));
622 *element_ret = element;
/* Error path: freeing the element also unlinks it from @parent. */
626 xml_free_node(element);
630 ret = WIMLIB_ERR_XML;
635 * Deserialize an XML document and return its root node in @doc_ret. The
636 * document must be given as a NUL-terminated string of 'tchar', i.e. UTF-16LE
637 * in Windows builds and UTF-8 everywhere else.
640 xml_parse_document(const tchar *p, struct xml_node **doc_ret)
643 struct xml_node *doc;
/* The byte order mark is optional, so the result of this call is ignored. */
645 skip_string(&p, BYTE_ORDER_MARK);
647 return WIMLIB_ERR_XML;
/* The root element is parsed without a parent, at depth 0. */
648 ret = parse_element(&p, NULL, 0, &doc);
/* Only ignorable constructs may follow the root; anything else is an error. */
651 if (!skip_misc(&p) || *p) {
653 return WIMLIB_ERR_XML;
659 /*----------------------------------------------------------------------------*
661 *----------------------------------------------------------------------------*/
664 xml_write(struct xml_out_buf *buf, const tchar *str, size_t len)
666 if (buf->count + len + 1 > buf->capacity) {
667 size_t new_capacity = max(buf->capacity * 2, 4096);
668 tchar *new_buf = REALLOC(buf->buf,
669 new_capacity * sizeof(str[0]));
675 buf->capacity = new_capacity;
677 tmemcpy(&buf->buf[buf->count], str, len);
/* Append the NUL-terminated string @str to @buf; the terminator itself is not written. */
682 xml_puts(struct xml_out_buf *buf, const tchar *str)
684 xml_write(buf, str, tstrlen(str));
/*
 * Append @str to @buf, substituting escape sequences for characters that
 * get_escape_seq() reports as needing one.  The loop around the lines below
 * and the emission of the escape sequence are on elided lines.
 */
688 xml_escape_and_puts(struct xml_out_buf *buf, const tchar *str)
690 const tchar *p = str, *saved, *seq = NULL;
/* Scan the longest run that needs no escaping, then write it in a single call. */
693 for (saved = p; *p && (seq = get_escape_seq(*p)) == NULL; p++)
695 xml_write(buf, saved, p - saved);
/*
 * Serialize @element and its subtree into @buf.  No whitespace or indentation
 * is added, and an element without contents is written as a start tag
 * immediately followed by an end tag.
 */
703 xml_write_element(struct xml_node *element, struct xml_out_buf *buf)
705 struct xml_node *child;
707 /* Write the start tag. */
708 xml_puts(buf, T("<"));
709 xml_puts(buf, element->name);
/* First pass over the children: emit only attributes, inside the start tag. */
710 xml_node_for_each_child(element, child) {
711 if (child->type == XML_ATTRIBUTE_NODE) {
712 xml_puts(buf, T(" "));
713 xml_puts(buf, child->name);
/* Attribute values are always written in double quotes, with escaping. */
714 xml_puts(buf, T("=\""));
715 xml_escape_and_puts(buf, child->value);
716 xml_puts(buf, T("\""));
719 xml_puts(buf, T(">"));
721 /* Write the contents. */
/* Second pass: TEXT and ELEMENT children in list order; attributes are skipped. */
722 xml_node_for_each_child(element, child) {
723 if (child->type == XML_TEXT_NODE)
724 xml_escape_and_puts(buf, child->value);
725 else if (child->type == XML_ELEMENT_NODE)
726 xml_write_element(child, buf);
729 /* Write the end tag. */
730 xml_puts(buf, T("</"));
731 xml_puts(buf, element->name);
732 xml_puts(buf, T(">"));
736 * Serialize the document @doc into @buf as a NUL-terminated string of 'tchar',
737 * i.e. UTF-16LE in Windows builds and UTF-8 everywhere else. A byte order mark
738 * (BOM) is included, as this is needed for compatibility with WIMGAPI.
741 xml_write_document(struct xml_node *doc, struct xml_out_buf *buf)
/* The byte order mark is written first, followed by the whole tree. */
743 xml_puts(buf, BYTE_ORDER_MARK);
744 xml_write_element(doc, buf);
/*
 * The write helpers return nothing, so failure is reported once here.  The
 * condition guarding this return is on an elided line.
 */
746 return WIMLIB_ERR_NOMEM;
/* xml_write() always keeps one spare element, so this store is in bounds. */
747 buf->buf[buf->count] = '\0';
751 /*----------------------------------------------------------------------------*
753 *----------------------------------------------------------------------------*/
755 #ifdef ENABLE_TEST_SUPPORT
/*
 * Test helper: parse the document @in and serialize it again.  The result is
 * presumably returned through @out_ret - the assignment, the error checks and
 * the cleanup are on lines elided from this excerpt.
 */
757 wimlib_parse_and_write_xml_doc(const tchar *in, tchar **out_ret)
759 struct xml_node *doc;
/* The output buffer starts out empty (null buffer, zero count and capacity). */
760 struct xml_out_buf buf = {};
763 ret = xml_parse_document(in, &doc);
766 ret = xml_write_document(doc, &buf);
771 #endif /* ENABLE_TEST_SUPPORT */