4 * A simple XML 1.0 processor. This handles all XML features that are used in
5 * WIM files, plus a bit more for futureproofing. It omits problematic
6 * features, such as expansion of entities other than simple escape sequences.
10 * Copyright 2023 Eric Biggers
12 * This file is free software; you can redistribute it and/or modify it under
13 * the terms of the GNU Lesser General Public License as published by the Free
14 * Software Foundation; either version 3 of the License, or (at your option) any
17 * This file is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
19 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
22 * You should have received a copy of the GNU Lesser General Public License
23 * along with this file; if not, see https://www.gnu.org/licenses/.
32 #include "wimlib/error.h"
33 #include "wimlib/test_support.h"
34 #include "wimlib/util.h"
35 #include "wimlib/xmlproc.h"
37 /*----------------------------------------------------------------------------*
38 * XML node utility functions *
39 *----------------------------------------------------------------------------*/
/*
 * Duplicate the first @len characters of @str into a newly allocated buffer
 * of @len + 1 tchars.  Only @len characters are copied, so @str does not need
 * to be NUL-terminated at that position.  The extra slot is left for the
 * terminator (assuming CALLOC zero-fills like calloc() - TODO confirm).
 */
42 tstrdupz(const tchar *str, size_t len)
44 tchar *new_str = CALLOC(len + 1, sizeof(str[0]));
47 tmemcpy(new_str, str, len);
/*
 * Allocate a node and give it private copies of @name (first @name_len
 * characters) and @value (first @value_len characters), made with tstrdupz().
 * The node's children list is initialised empty, and the node is attached to
 * @parent with xml_add_child().
 *
 * Callers in this file pass NULL for @parent and @name when building detached
 * TEXT nodes, and treat a NULL return as an out-of-memory condition.
 */
51 static struct xml_node *
52 xml_new_node(struct xml_node *parent, enum xml_node_type type,
53 const tchar *name, size_t name_len,
54 const tchar *value, size_t value_len)
56 struct xml_node *node = CALLOC(1, sizeof(*node));
61 INIT_LIST_HEAD(&node->children);
63 node->name = tstrdupz(name, name_len);
68 node->value = tstrdupz(value, value_len);
73 xml_add_child(parent, node);
82 * Create a new ELEMENT node, and if @parent is non-NULL add the new node under
83 * @parent which should be another ELEMENT.
/*
 * Thin wrapper around xml_new_node() for XML_ELEMENT_NODE.  @name must be a
 * NUL-terminated string because its length is taken with tstrlen().
 */
86 xml_new_element(struct xml_node *parent, const tchar *name)
88 return xml_new_node(parent, XML_ELEMENT_NODE, name, tstrlen(name),
93 * Create a new ELEMENT node with an attached TEXT node, and if @parent is
94 * non-NULL add the new ELEMENT under @parent which should be another ELEMENT.
/*
 * Build the element with xml_new_element(), then fill it through
 * xml_element_set_text().  If setting the text fails, the freshly created
 * element is freed again (xml_free_node() also unlinks it), so no half-built
 * element is left behind under @parent.
 */
97 xml_new_element_with_text(struct xml_node *parent, const tchar *name,
100 struct xml_node *element = xml_new_element(parent, name);
102 if (element && xml_element_set_text(element, text) != 0) {
103 xml_free_node(element);
109 /* Append @child to the children list of @parent. */
/*
 * @child is first detached from wherever it currently is, then its parent
 * pointer is set and it is linked at the tail of @parent's children list, so
 * children stay in insertion order.
 */
111 xml_add_child(struct xml_node *parent, struct xml_node *child)
113 xml_unlink_node(child); /* Shouldn't be needed, but be safe. */
114 child->parent = parent;
115 list_add_tail(&child->sibling_link, &parent->children);
118 /* Unlink @node from its parent, if it has one. */
/*
 * Removes @node's sibling_link from the list it is on.  The condition guarding
 * the list_del() and any reset of the parent pointer are not visible in this
 * view.
 */
120 xml_unlink_node(struct xml_node *node)
123 list_del(&node->sibling_link);
/*
 * Free every child of @parent.  The _safe iterator is required because
 * xml_free_node() unlinks each child from the list while the list is being
 * walked.
 */
129 xml_free_children(struct xml_node *parent)
131 struct xml_node *child, *tmp;
133 list_for_each_entry_safe(child, tmp, &parent->children, sibling_link)
134 xml_free_node(child);
137 /* Recursively free @node, first unlinking it if needed. @node may be NULL. */
/*
 * Detach @node from its parent first, then free its subtree through
 * xml_free_children(), which calls back into this function for each child.
 * The release of the node's own storage is not visible in this view.
 */
139 xml_free_node(struct xml_node *node)
142 xml_unlink_node(node);
143 xml_free_children(node);
151 * Return the text from the first TEXT child node of @element, or NULL if no
152 * such node exists. @element may be NULL.
/*
 * Walk @element's children in list order and stop at the first one whose type
 * is XML_TEXT_NODE; later TEXT children are not considered.
 */
155 xml_element_get_text(const struct xml_node *element)
157 const struct xml_node *child;
159 xml_node_for_each_child(element, child)
160 if (child->type == XML_TEXT_NODE)
166 * Set the contents of the given @element to the given @text, replacing the
167 * entire existing contents if any.
/*
 * The replacement TEXT node is allocated first, detached (NULL parent).  Only
 * afterwards are the existing children freed and the new node attached, so
 * the WIMLIB_ERR_NOMEM path leaves @element untouched.  @text must be
 * NUL-terminated (its length is taken with tstrlen()).
 *
 * NOTE(review): xml_free_children() frees every child.  Attributes are stored
 * as children too (see xml_get_attrib()), so they are discarded along with
 * the old text and child elements - confirm callers expect that.
 */
170 xml_element_set_text(struct xml_node *element, const tchar *text)
172 struct xml_node *text_node = xml_new_node(NULL, XML_TEXT_NODE, NULL, 0,
173 text, tstrlen(text));
175 return WIMLIB_ERR_NOMEM;
176 xml_free_children(element);
177 xml_add_child(element, text_node);
/*
 * Append @text_len characters of @text to the contents of @element.
 *
 * If the last child of @element is already a TEXT node, its value is rebuilt
 * in a new buffer of old_len + @text_len + 1 tchars (old text followed by the
 * new text) and the old buffer is freed.  Otherwise a new TEXT child is
 * created with xml_new_node().
 *
 * Returns WIMLIB_ERR_NOMEM when either allocation fails; on the merge path
 * the old value is only freed after the new buffer has been filled.
 */
182 xml_element_append_text(struct xml_node *element,
183 const tchar *text, size_t text_len)
185 struct xml_node *last_child;
187 if (!list_empty(&element->children) &&
189 list_last_entry(&element->children, struct xml_node,
190 sibling_link))->type == XML_TEXT_NODE) {
192 * The new TEXT would directly follow another TEXT, so simplify
193 * the tree by just appending to the existing TEXT. (This case
194 * can theoretically be reached via the use of CDATA...)
196 size_t old_len = tstrlen(last_child->value);
197 tchar *new_value = CALLOC(old_len + text_len + 1,
198 sizeof(new_value[0]));
200 return WIMLIB_ERR_NOMEM;
201 tmemcpy(new_value, last_child->value, old_len);
202 tmemcpy(&new_value[old_len], text, text_len);
203 FREE(last_child->value);
204 last_child->value = new_value;
207 if (!xml_new_node(element, XML_TEXT_NODE, NULL, 0, text, text_len))
208 return WIMLIB_ERR_NOMEM;
212 /* Find the attribute with the given @name on @element. */
/*
 * Linear scan of @element's children for an XML_ATTRIBUTE_NODE whose name
 * compares equal to @name under tstrcmp().  Attributes live in the same
 * children list as TEXT and ELEMENT nodes, hence the type check.
 */
214 xml_get_attrib(const struct xml_node *element, const tchar *name)
216 struct xml_node *child;
218 xml_node_for_each_child(element, child) {
219 if (child->type == XML_ATTRIBUTE_NODE &&
220 !tstrcmp(child->name, name))
226 /* Set the attribute @name=@value on the given @element. */
/*
 * Builds a detached ATTRIBUTE node and hands it to xml_replace_child(), which
 * either takes the place of an existing attribute with the same name or
 * appends the new one.  @value must be NUL-terminated.  Returns
 * WIMLIB_ERR_NOMEM if the node cannot be allocated.
 */
228 xml_set_attrib(struct xml_node *element, const tchar *name, const tchar *value)
230 struct xml_node *attrib = xml_new_node(NULL, XML_ATTRIBUTE_NODE,
232 value, tstrlen(value));
234 return WIMLIB_ERR_NOMEM;
235 xml_replace_child(element, attrib);
240 * Add the ELEMENT or ATTRIBUTE node @replacement under the ELEMENT @parent,
241 * replacing any node with the same type and name that already exists.
/*
 * On a match (same type, same name under tstrcmp()), @replacement takes over
 * the matched child's slot in the sibling list via list_replace(), so its
 * position among the siblings is preserved.  The old child's parent pointer
 * is cleared before it is freed, presumably so that xml_free_node() does not
 * try to unlink it from the list a second time.
 *
 * Without a match, @replacement is appended with xml_add_child().
 */
244 xml_replace_child(struct xml_node *parent, struct xml_node *replacement)
246 struct xml_node *child;
248 xml_unlink_node(replacement); /* Shouldn't be needed, but be safe. */
250 xml_node_for_each_child(parent, child) {
251 if (child->type == replacement->type &&
252 !tstrcmp(child->name, replacement->name)) {
253 list_replace(&child->sibling_link,
254 &replacement->sibling_link);
255 replacement->parent = parent;
256 child->parent = NULL;
257 xml_free_node(child);
261 xml_add_child(parent, replacement);
/*
 * Recursively deep-copy @orig and all of its descendants.  The clone is
 * created detached (NULL parent).  name and value may each be NULL, which is
 * why tstrlen() is only called on them when they are set.  Each child is
 * cloned by a recursive call and appended, so the child order is preserved.
 * The partially built clone is released with xml_free_node() on the failure
 * path.
 */
265 xml_clone_tree(struct xml_node *orig)
267 struct xml_node *clone, *orig_child, *clone_child;
269 clone = xml_new_node(NULL, orig->type,
270 orig->name, orig->name ? tstrlen(orig->name) : 0,
271 orig->value, orig->value ? tstrlen(orig->value) : 0);
274 xml_node_for_each_child(orig, orig_child) {
275 clone_child = xml_clone_tree(orig_child);
278 xml_add_child(clone, clone_child);
283 xml_free_node(clone);
287 /*----------------------------------------------------------------------------*
288 * XML string validation *
289 *----------------------------------------------------------------------------*/
292 * Functions that check for legal names and values in XML 1.0. These are
293 * currently slightly over-lenient, as they allow everything non-ASCII. These
294 * are also not currently used by the XML parser to reject non-well-formed
295 * documents, but rather just by the user of the XML processor (xml.c) in order
296 * to avoid introducing illegal names and values into the document.
/*
 * True for space, line feed, carriage return and tab, which are the four
 * characters of the XML 1.0 white space production.
 */
300 is_whitespace(tchar c)
302 return c == ' ' || c == '\n' || c == '\r' || c == '\t';
/*
 * True if @c may begin an XML name.  (c & 0x7f) != c holds for every value
 * outside 0..0x7f, so all non-ASCII code units (and negative values when
 * tchar is signed) are accepted without further checks.  Within ASCII only
 * letters, ':' and '_' qualify.
 */
306 is_name_start_char(tchar c)
308 return (c & 0x7f) != c /* overly lenient for now */ ||
309 (c >= 'A' && c <= 'Z') ||
310 (c >= 'a' && c <= 'z') ||
311 c == ':' || c == '_';
/*
 * True if @c may appear after the first character of an XML name: everything
 * is_name_start_char() accepts, plus ASCII digits, '-' and '.'.
 */
315 is_name_char(tchar c)
317 return is_name_start_char(c) ||
318 (c >= '0' && c <= '9') || c == '-' || c == '.';
321 /* Allow characters used in element "paths"; see do_xml_path_walk() */
/* True for the separator '/' and the index brackets '[' and ']'. */
323 is_path_char(tchar c)
325 return c == '/' || c == '[' || c == ']';
/*
 * Check a NUL-terminated element path character by character.  The first
 * character must be a name-start or path character; every following one must
 * be a name or path character.  Only the character classes are checked; the
 * structure of the path (balanced brackets and so on) is not examined in the
 * lines visible here.
 */
329 xml_legal_path(const tchar *p)
331 if (!is_name_start_char(*p) && !is_path_char(*p))
333 for (p = p + 1; *p; p++) {
334 if (!is_name_char(*p) && !is_path_char(*p))
/*
 * Check a value for characters that are illegal in XML 1.0 text.  The visible
 * test flags code units in the range 0x01..0x1f other than tab, LF and CR.
 * The `*p > 0` half of the test keeps negative values (signed tchar holding a
 * non-ASCII code unit) from being mistaken for control characters.
 */
341 xml_legal_value(const tchar *p)
344 /* Careful: tchar can be signed. */
345 if (*p > 0 && *p < 0x20 && !is_whitespace(*p))
/*
 * Byte order mark (U+FEFF) as a NUL-terminated tchar string.  Two alternative
 * definitions follow; the preprocessor condition that selects between them is
 * not visible in this view.  The first is a compound literal holding the
 * single code unit 0xfeff, the second is the three-byte UTF-8 encoding.
 */
352 #define BYTE_ORDER_MARK (tchar[]){ 0xfeff, 0 }
354 #define BYTE_ORDER_MARK "\xEF\xBB\xBF"
357 /*----------------------------------------------------------------------------*
359 *----------------------------------------------------------------------------*/
/*
 * Jump to the enclosing function's `bad` label when @cond is false.
 * NOTE(review): this expands to a bare `if` with no do/while(0) wrapper, so
 * it must not be used as the body of an if that has an else branch.
 */
361 #define CHECK(cond) if (!(cond)) goto bad
/*
 * Advance over a run of white space starting at *pp, as classified by
 * is_whitespace().  The NUL terminator is not white space, so the scan cannot
 * run past the end of the string.
 */
364 skip_whitespace(const tchar **pp)
366 const tchar *p = *pp;
368 while (is_whitespace(*p))
/*
 * Test whether the text at *pp starts with the NUL-terminated string @str,
 * comparing tstrlen(@str) characters with tstrncmp().  Callers use the result
 * as a boolean "matched" flag and expect *pp to have moved past @str on a
 * match; the lines doing that are not visible in this view.
 */
374 skip_string(const tchar **pp, const tchar *str)
376 const tchar *p = *pp;
377 size_t len = tstrlen(str);
379 if (tstrncmp(p, str, len))
/*
 * Search forward from *pp for @str and leave *pp just past the occurrence.
 * Callers treat a false result as "terminator not found" and report
 * WIMLIB_ERR_XML.  The search itself is not visible in this view.
 */
386 find_and_skip(const tchar **pp, const tchar *str)
388 const tchar *p = *pp;
393 *pp = p + tstrlen(str);
/*
 * Skip everything that may surround the root element: the XML declaration and
 * processing instructions, a DOCTYPE declaration, and comments.  The loop
 * repeats until a full pass makes no progress (p == prev_p).  A construct
 * whose opener is found but whose terminator is missing is an error.
 *
 * NOTE(review): the DOCTYPE declaration is skipped up to the first '>' only.
 * A DOCTYPE with an internal subset containing '>' would be cut short -
 * confirm this is acceptable for the inputs expected here.
 */
398 skip_misc(const tchar **pp)
400 const tchar *p = *pp, *prev_p;
405 /* Discard XML declaration and top-level PIs for now. */
406 if (skip_string(&p, T("<?")) && !find_and_skip(&p, T("?>")))
408 /* Discard DOCTYPE declaration for now. */
409 if (skip_string(&p, T("<!DOCTYPE")) && !find_and_skip(&p, T(">")))
411 /* Discard top-level comments for now. */
412 if (skip_string(&p, T("<!--")) && !find_and_skip(&p, T("-->")))
414 } while (p != prev_p);
/*
 * Map a character to its escape sequence.  xml_escape_and_puts() treats a
 * NULL result as "no escaping needed for this character".  The body of this
 * function is not visible in this view.
 */
419 static inline const tchar *
420 get_escape_seq(tchar c)
437 /* Note: 'str' must be NUL-terminated, but only 'len' chars are used. */
/*
 * Produce an unescaped copy of the first @len characters of @str in a newly
 * allocated buffer of @len + 1 tchars, returned through @unescaped_ret.
 *
 * skip_string() compares against the NUL-terminated @str and knows nothing
 * about @len, so a sequence may match across the @len boundary.  The
 * `in_p > &str[len]` test after the loop catches that case and treats it as
 * an error.
 *
 * Returns WIMLIB_ERR_NOMEM if the buffer cannot be allocated, and
 * WIMLIB_ERR_XML (after logging the offending string) on a malformed input.
 *
 * NOTE(review): in this copy the literals passed to skip_string() read "<",
 * ">", "&", "'" and a bare double quote between quotes.  The last one is not
 * a valid C string literal, and all five look like the XML predefined entity
 * references after an entity-decoding pass by some tool.  Confirm against a
 * pristine copy of the file before building.
 */
439 unescape_string(const tchar *str, size_t len, tchar **unescaped_ret)
441 const tchar *in_p = str;
442 tchar *unescaped, *out_p;
444 unescaped = CALLOC(len + 1, sizeof(str[0]));
446 return WIMLIB_ERR_NOMEM;
448 while (in_p < &str[len]) {
451 else if (skip_string(&in_p, T("<")))
453 else if (skip_string(&in_p, T(">")))
455 else if (skip_string(&in_p, T("&")))
457 else if (skip_string(&in_p, T("'")))
459 else if (skip_string(&in_p, T(""")))
464 if (in_p > &str[len])
466 *unescaped_ret = unescaped;
470 ERROR("Error unescaping string '%.*"TS"'", (int)len, str);
472 return WIMLIB_ERR_XML;
/*
 * Forward declaration: parse_contents() and parse_element() call each other
 * recursively, so one of them has to be declared before its definition.
 */
476 parse_element(const tchar **pp, struct xml_node *parent, int depth,
477 struct xml_node **node_ret);
/*
 * Parse the contents of @element, starting at *pp, up to its end tag.
 *
 *  - Character data up to the next '<' is unescaped with unescape_string()
 *    and appended with xml_element_append_text().
 *  - Processing instructions and comments are skipped.
 *  - CDATA sections are appended as raw text, without unescaping.
 *  - Anything else starting with '<' is parsed as a child element by
 *    parse_element() with @depth + 1.
 *
 * An unterminated PI, CDATA section or comment, or an unrecognised "<!"
 * construct, yields WIMLIB_ERR_XML.
 */
480 parse_contents(const tchar **pp, struct xml_node *element, int depth)
482 const tchar *p = *pp;
486 const tchar *raw_text = p;
489 for (; *p != '<'; p++) {
491 return WIMLIB_ERR_XML;
494 ret = unescape_string(raw_text, p - raw_text, &text);
497 ret = xml_element_append_text(element, text,
504 break; /* Reached the end tag of @element */
505 } else if (p[1] == '?') {
506 /* Discard processing instructions for now. */
508 if (!find_and_skip(&p, T("?>")))
509 return WIMLIB_ERR_XML;
511 } else if (p[1] == '!') {
512 if (skip_string(&p, T("<![CDATA["))) {
514 if (!find_and_skip(&p, T("]]>")))
515 return WIMLIB_ERR_XML;
516 ret = xml_element_append_text(element, raw_text,
521 } else if (skip_string(&p, T("<!--"))) {
522 /* Discard comments for now. */
523 if (!find_and_skip(&p, T("-->")))
524 return WIMLIB_ERR_XML;
527 return WIMLIB_ERR_XML;
529 ret = parse_element(&p, element, depth + 1, NULL);
/*
 * Parse one element starting at *pp: start tag, attribute list, contents and
 * end tag.  The new ELEMENT node is created directly under @parent and is
 * also returned through @element_ret.  parse_contents() calls this with a
 * NULL @element_ret; how that is guarded is not visible in this view.
 *
 * The element name runs up to the first white space, '>', '/' or NUL.  Each
 * attribute is name '=' quoted-value, where the quote may be ' or "; the
 * value is unescaped with unescape_string() and stored as an ATTRIBUTE child.
 * The end tag is matched by comparing name_len characters against the start
 * tag's name.
 *
 * On failure the partially built element is freed with xml_free_node().
 * Syntax errors detected through CHECK() end up as WIMLIB_ERR_XML, and
 * allocation failures as WIMLIB_ERR_NOMEM.
 *
 * NOTE(review): attributes are appended with xml_new_node() as they are
 * parsed, so a repeated attribute name is not detected in the lines visible
 * here (xml_set_attrib() by contrast replaces duplicates) - confirm.
 */
538 parse_element(const tchar **pp, struct xml_node *parent, int depth,
539 struct xml_node **element_ret)
541 const tchar *p = *pp;
542 struct xml_node *element = NULL;
543 const tchar *name_start;
547 /* Parse the start tag. */
552 while (!is_whitespace(*p) && *p != '>' && *p != '/' && *p != '\0')
554 name_len = p - name_start;
556 element = xml_new_node(parent, XML_ELEMENT_NODE, name_start, name_len,
559 ret = WIMLIB_ERR_NOMEM;
562 /* Parse the attributes list within the start tag. */
563 while (is_whitespace(*p)) {
564 const tchar *attr_name_start, *attr_value_start;
565 size_t attr_name_len, attr_value_len;
570 if (*p == '/' || *p == '>')
573 while (*p != '=' && !is_whitespace(*p) && *p != '\0')
575 attr_name_len = p - attr_name_start;
577 CHECK(attr_name_len > 0 && *p == '=');
581 CHECK(quote == '\'' || quote == '"');
582 attr_value_start = ++p;
583 while (*p != quote && *p != '\0')
586 attr_value_len = p - attr_value_start;
588 ret = unescape_string(attr_value_start, attr_value_len,
592 ret = xml_new_node(element, XML_ATTRIBUTE_NODE,
593 attr_name_start, attr_name_len,
594 attr_value, tstrlen(attr_value))
595 ? 0 : WIMLIB_ERR_NOMEM;
601 /* Closing an empty element tag */
604 /* Closing the start tag */
607 /* Parse the contents, then the end tag. */
608 ret = parse_contents(&p, element, depth);
615 CHECK(!tstrncmp(p, name_start, name_len));
623 *element_ret = element;
627 xml_free_node(element);
631 ret = WIMLIB_ERR_XML;
636 * Deserialize an XML document and return its root node in @doc_ret. The
637 * document must be given as a NUL-terminated string of 'tchar', i.e. UTF-16LE
638 * in Windows builds and UTF-8 everywhere else.
/*
 * Parsing steps visible here: an optional byte order mark is skipped (the
 * result of skip_string() is deliberately ignored), the root element is
 * parsed with no parent at depth 0, and afterwards only skippable
 * miscellaneous items may remain before the NUL terminator.  Anything else is
 * reported as WIMLIB_ERR_XML.
 */
641 xml_parse_document(const tchar *p, struct xml_node **doc_ret)
644 struct xml_node *doc;
646 skip_string(&p, BYTE_ORDER_MARK);
648 return WIMLIB_ERR_XML;
649 ret = parse_element(&p, NULL, 0, &doc);
652 if (!skip_misc(&p) || *p) {
654 return WIMLIB_ERR_XML;
660 /*----------------------------------------------------------------------------*
662 *----------------------------------------------------------------------------*/
/*
 * Append @len characters of @str to @buf.  The capacity test reserves one
 * extra slot (count + len + 1) so that xml_write_document() can always store
 * the NUL terminator at buf->buf[buf->count].  The REALLOC result goes into a
 * temporary first, so the old buffer pointer is not lost if it fails.
 *
 * NOTE(review): the grown capacity is max(capacity * 2, 4096) and is not
 * derived from @len.  From the lines visible here, a single write where
 * count + len + 1 exceeds that value would still overflow the tmemcpy()
 * below.  Confirm, and consider growing to at least count + len + 1.
 */
665 xml_write(struct xml_out_buf *buf, const tchar *str, size_t len)
667 if (buf->count + len + 1 > buf->capacity) {
668 size_t new_capacity = max(buf->capacity * 2, 4096);
669 tchar *new_buf = REALLOC(buf->buf,
670 new_capacity * sizeof(str[0]));
676 buf->capacity = new_capacity;
678 tmemcpy(&buf->buf[buf->count], str, len);
/* Append the whole NUL-terminated string @str to @buf, without escaping. */
683 xml_puts(struct xml_out_buf *buf, const tchar *str)
685 xml_write(buf, str, tstrlen(str));
/*
 * Append @str to @buf with escaping.  The visible loop advances over a run of
 * characters for which get_escape_seq() returns NULL and writes that whole
 * run with a single xml_write() call.  How the pending escape sequence in
 * `seq` is emitted is not visible in this view.
 */
689 xml_escape_and_puts(struct xml_out_buf *buf, const tchar *str)
691 const tchar *p = str, *saved, *seq = NULL;
694 for (saved = p; *p && (seq = get_escape_seq(*p)) == NULL; p++)
696 xml_write(buf, saved, p - saved);
/*
 * Serialize @element and its subtree into @buf.
 *
 * The children list is walked twice.  The first pass emits ATTRIBUTE children
 * inside the start tag as name="escaped value", always with double quotes.
 * The second pass emits TEXT children (escaped) and ELEMENT children
 * (recursively), in list order.  An explicit end tag is always written, even
 * for elements without contents.
 *
 * No status is checked here; xml_write_document() reports WIMLIB_ERR_NOMEM
 * after the whole tree has been written.
 */
704 xml_write_element(struct xml_node *element, struct xml_out_buf *buf)
706 struct xml_node *child;
708 /* Write the start tag. */
709 xml_puts(buf, T("<"));
710 xml_puts(buf, element->name);
711 xml_node_for_each_child(element, child) {
712 if (child->type == XML_ATTRIBUTE_NODE) {
713 xml_puts(buf, T(" "));
714 xml_puts(buf, child->name);
715 xml_puts(buf, T("=\""));
716 xml_escape_and_puts(buf, child->value);
717 xml_puts(buf, T("\""));
720 xml_puts(buf, T(">"));
722 /* Write the contents. */
723 xml_node_for_each_child(element, child) {
724 if (child->type == XML_TEXT_NODE)
725 xml_escape_and_puts(buf, child->value);
726 else if (child->type == XML_ELEMENT_NODE)
727 xml_write_element(child, buf);
730 /* Write the end tag. */
731 xml_puts(buf, T("</"));
732 xml_puts(buf, element->name);
733 xml_puts(buf, T(">"));
737 * Serialize the document @doc into @buf as a NUL-terminated string of 'tchar',
738 * i.e. UTF-16LE in Windows builds and UTF-8 everywhere else. A byte order mark
739 * (BOM) is included, as this is needed for compatibility with WIMGAPI.
/*
 * Writes the byte order mark, then the tree rooted at @doc.  The terminator
 * is stored at buf->buf[buf->count] without a capacity check of its own,
 * relying on xml_write() always reserving one extra slot.  The condition that
 * leads to WIMLIB_ERR_NOMEM is not visible in this view.
 */
742 xml_write_document(struct xml_node *doc, struct xml_out_buf *buf)
744 xml_puts(buf, BYTE_ORDER_MARK);
745 xml_write_element(doc, buf);
747 return WIMLIB_ERR_NOMEM;
748 buf->buf[buf->count] = '\0';
752 /*----------------------------------------------------------------------------*
754 *----------------------------------------------------------------------------*/
/*
 * Test-only round trip, compiled when ENABLE_TEST_SUPPORT is defined: parse
 * the document @in with xml_parse_document() and serialize the resulting tree
 * again with xml_write_document().  How the output is handed back through
 * @out_ret and how the tree is released are not visible in this view.
 *
 * NOTE(review): `= {}` is an empty initializer, which is a GNU extension
 * before C23 - confirm the supported compilers accept it.
 */
756 #ifdef ENABLE_TEST_SUPPORT
758 wimlib_parse_and_write_xml_doc(const tchar *in, tchar **out_ret)
760 struct xml_node *doc;
761 struct xml_out_buf buf = {};
764 ret = xml_parse_document(in, &doc);
767 ret = xml_write_document(doc, &buf);
772 #endif /* ENABLE_TEST_SUPPORT */