From 75e5efe798c0d0e83c709ff3a802fa48096fc776 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers3@gmail.com>
Date: Sat, 8 Jul 2023 12:12:08 -0700
Subject: [PATCH] Improve encoding detection of wimupdate command files

Interpret wimupdate command files (read from stdin) and wimcapture
source list files as UTF-8 or UTF-16LE (autodetected) on all platforms.
Previously, on Windows these files were interpreted as being in either
the current Windows code page or UTF-16LE (autodetected), and on
non-Windows platforms they were interpreted as UTF-8.

This makes these files be consistent with wimextract pathlist files and
wimcapture config files, which already had the behavior of "autodetect
UTF-8 or UTF-16LE on all platforms".

Resolves https://wimlib.net/forums/viewtopic.php?p=1545
---
 doc/man1/wimupdate.1                 |   9 +-
 include/wimlib.h                     |  18 +++
 programs/imagex-win32.c              |  46 --------
 programs/imagex-win32.h              |   5 -
 programs/imagex.c                    | 159 +--------------------------
 src/textfile.c                       |  20 ++++
 tests/test-imagex-update_and_extract |  20 ++++
 7 files changed, 67 insertions(+), 210 deletions(-)

diff --git a/doc/man1/wimupdate.1 b/doc/man1/wimupdate.1
index 40f5268b..f5e7c77d 100644
--- a/doc/man1/wimupdate.1
+++ b/doc/man1/wimupdate.1
@@ -14,10 +14,11 @@ contains only one image.  You can use \fBwiminfo\fR(1) to list the images
 contained in \fIWIMFILE\fR.
 .PP
 The modifications to perform on the WIM image are specified as a sequence of
-commands, one per line, read in a text file from standard input.  It is
-recommended that standard input be redirected from a file (\fICMDFILE\fR), as
-shown above, rather than typing in commands interactively.  Alternatively, to
-specify a command directly on the command line, see the \fB--command\fR option.
+commands, one per line, read in a text file (UTF-8 or UTF-16LE encoded; plain
+ASCII is also fine) from standard input.  It is recommended that standard input
+be redirected from a file (\fICMDFILE\fR), as shown above, rather than typing in
+commands interactively.  Alternatively, to specify a command directly on the
+command line, see the \fB--command\fR option.
 .SH AVAILABLE COMMANDS
 This section documents the commands that may appear in the \fICMDFILE\fR
 described above.
diff --git a/include/wimlib.h b/include/wimlib.h
index ef08e96e..54d6cb58 100644
--- a/include/wimlib.h
+++ b/include/wimlib.h
@@ -3564,6 +3564,24 @@ wimlib_join_with_progress(const wimlib_tchar * const *swms,
 			  wimlib_progress_func_t progfunc,
 			  void *progctx);
 
+/**
+ * @ingroup G_general
+ *
+ * Load a text file into memory, autodetecting UTF-8 or UTF-16LE encoding.
+ *
+ * @param path
+ *	The path to the file, or NULL or "-" to use standard input.
+ * @param tstr_ret
+ *	On success, a buffer containing the file's text as a "wimlib_tchar"
+ *	string is returned here.  The buffer must be freed using free().
+ * @param tstr_nchars_ret
+ *	On success, the length of the text in "wimlib_tchar"s is returned here.
+ *
+ * @return 0 on success; a ::wimlib_error_code value on failure.
+ */
+WIMLIBAPI int
+wimlib_load_text_file(const wimlib_tchar *path,
+		      wimlib_tchar **tstr_ret, size_t *tstr_nchars_ret);
 
 /**
  * @ingroup G_mounting_wim_images
diff --git a/programs/imagex-win32.c b/programs/imagex-win32.c
index b72ae354..01507f0e 100644
--- a/programs/imagex-win32.c
+++ b/programs/imagex-win32.c
@@ -10,52 +10,6 @@
 #include <stdio.h>
 #include <windows.h>
 
-/* Convert a string from the "current Windows codepage" to UTF-16LE.  */
-wchar_t *
-win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret)
-{
-	if (mbs_nbytes > INT_MAX) {
-		fwprintf(stderr, L"ERROR: too much data (%zu bytes)!\n",
-			 mbs_nbytes);
-		return NULL;
-	}
-	if (mbs_nbytes == 0) {
-		*num_wchars_ret = 0;
-		return (wchar_t*)mbs;
-	}
-	int len = MultiByteToWideChar(CP_ACP,
-				      MB_ERR_INVALID_CHARS,
-				      mbs,
-				      mbs_nbytes,
-				      NULL,
-				      0);
-	if (len <= 0)
-		goto out_invalid;
-	wchar_t *wcs = malloc(len * sizeof(wchar_t));
-	if (!wcs) {
-		fwprintf(stderr, L"ERROR: out of memory!\n");
-		return NULL;
-	}
-	int len2 = MultiByteToWideChar(CP_ACP,
-				       MB_ERR_INVALID_CHARS,
-				       mbs,
-				       mbs_nbytes,
-				       wcs,
-				       len);
-	if (len2 != len) {
-		free(wcs);
-		goto out_invalid;
-	}
-	*num_wchars_ret = len;
-	return wcs;
-out_invalid:
-	fwprintf(stderr,
-L"ERROR: Invalid multi-byte string in the text file you provided as input!\n"
-L"       Maybe try converting your text file to UTF-16LE?\n"
-	);
-	return NULL;
-}
-
 /* Set a file descriptor to binary mode.  */
 void set_fd_to_binary_mode(int fd)
 {
diff --git a/programs/imagex-win32.h b/programs/imagex-win32.h
index 2a694c0e..f7a751bd 100644
--- a/programs/imagex-win32.h
+++ b/programs/imagex-win32.h
@@ -2,12 +2,7 @@
 #define _IMAGEX_WIN32_H
 
 #include <stddef.h>
-#include <stdbool.h>
 #include <inttypes.h>
-#include <wchar.h>
-
-wchar_t *
-win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret);
 
 void
 win32_print_security_descriptor(const uint8_t *sd, size_t size);
diff --git a/programs/imagex.c b/programs/imagex.c
index f053cd74..54e50bfc 100644
--- a/programs/imagex.c
+++ b/programs/imagex.c
@@ -968,152 +968,6 @@ parse_source_list(tchar **source_list_contents_p, size_t source_list_nchars,
 	return sources;
 }
 
-/* Reads the contents of a file into memory. */
-static char *
-file_get_contents(const tchar *filename, size_t *len_ret)
-{
-	struct stat stbuf;
-	void *buf = NULL;
-	size_t len;
-	FILE *fp;
-
-	if (tstat(filename, &stbuf) != 0) {
-		imagex_error_with_errno(T("Failed to stat the file \"%"TS"\""), filename);
-		goto out;
-	}
-	len = stbuf.st_size;
-
-	fp = tfopen(filename, T("rb"));
-	if (!fp) {
-		imagex_error_with_errno(T("Failed to open the file \"%"TS"\""), filename);
-		goto out;
-	}
-
-	buf = malloc(len ? len : 1);
-	if (!buf) {
-		imagex_error(T("Failed to allocate buffer of %zu bytes to hold "
-			       "contents of file \"%"TS"\""), len, filename);
-		goto out_fclose;
-	}
-	if (fread(buf, 1, len, fp) != len) {
-		imagex_error_with_errno(T("Failed to read %zu bytes from the "
-					  "file \"%"TS"\""), len, filename);
-		goto out_free_buf;
-	}
-	*len_ret = len;
-	goto out_fclose;
-out_free_buf:
-	free(buf);
-	buf = NULL;
-out_fclose:
-	fclose(fp);
-out:
-	return buf;
-}
-
-/* Read standard input until EOF and return the full contents in a malloc()ed
- * buffer and the number of bytes of data in @len_ret.  Returns NULL on read
- * error. */
-static char *
-stdin_get_contents(size_t *len_ret)
-{
-	/* stdin can, of course, be a pipe or other non-seekable file, so the
-	 * total length of the data cannot be pre-determined */
-	char *buf = NULL;
-	size_t newlen = 1024;
-	size_t pos = 0;
-	size_t inc = 1024;
-	for (;;) {
-		char *p = realloc(buf, newlen);
-		size_t bytes_read, bytes_to_read;
-		if (!p) {
-			imagex_error(T("out of memory while reading stdin"));
-			break;
-		}
-		buf = p;
-		bytes_to_read = newlen - pos;
-		bytes_read = fread(&buf[pos], 1, bytes_to_read, stdin);
-		pos += bytes_read;
-		if (bytes_read != bytes_to_read) {
-			if (feof(stdin)) {
-				*len_ret = pos;
-				return buf;
-			} else {
-				imagex_error_with_errno(T("error reading stdin"));
-				break;
-			}
-		}
-		newlen += inc;
-		inc *= 3;
-		inc /= 2;
-	}
-	free(buf);
-	return NULL;
-}
-
-
-static tchar *
-translate_text_to_tstr(char *text, size_t num_bytes, size_t *num_tchars_ret)
-{
-#ifndef _WIN32
-	/* On non-Windows, assume an ASCII-compatible encoding, such as UTF-8.
-	 * */
-	*num_tchars_ret = num_bytes;
-	return text;
-#else /* !_WIN32 */
-	/* On Windows, translate the text to UTF-16LE */
-	wchar_t *text_wstr;
-	size_t num_wchars;
-
-	if (num_bytes >= 2 &&
-	    (((unsigned char)text[0] == 0xff && (unsigned char)text[1] == 0xfe) ||
-	     ((unsigned char)text[0] <= 0x7f && (unsigned char)text[1] == 0x00)))
-	{
-		/* File begins with 0xfeff, the BOM for UTF-16LE, or it begins
-		 * with something that looks like an ASCII character encoded as
-		 * a UTF-16LE code unit.  Assume the file is encoded as
-		 * UTF-16LE.  This is not a 100% reliable check. */
-		num_wchars = num_bytes / 2;
-		text_wstr = (wchar_t*)text;
-	} else {
-		/* File does not look like UTF-16LE.  Assume it is encoded in
-		 * the current Windows code page.  I think these are always
-		 * ASCII-compatible, so any so-called "plain-text" (ASCII) files
-		 * should work as expected. */
-		text_wstr = win32_mbs_to_wcs(text,
-					     num_bytes,
-					     &num_wchars);
-		free(text);
-	}
-	*num_tchars_ret = num_wchars;
-	return text_wstr;
-#endif /* _WIN32 */
-}
-
-static tchar *
-file_get_text_contents(const tchar *filename, size_t *num_tchars_ret)
-{
-	char *contents;
-	size_t num_bytes;
-
-	contents = file_get_contents(filename, &num_bytes);
-	if (!contents)
-		return NULL;
-	return translate_text_to_tstr(contents, num_bytes, num_tchars_ret);
-}
-
-static tchar *
-stdin_get_text_contents(size_t *num_tchars_ret)
-{
-	char *contents;
-	size_t num_bytes;
-
-	contents = stdin_get_contents(&num_bytes);
-	if (!contents)
-		return NULL;
-	return translate_text_to_tstr(contents, num_bytes, num_tchars_ret);
-}
-
 #define TO_PERCENT(numerator, denominator) \
 	(((denominator) == 0) ? 0 : ((numerator) * 100 / (denominator)))
 
@@ -2202,13 +2056,8 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd)
 
 	if (source_list) {
 		/* Set up capture sources in source list mode */
-		if (source[0] == T('-') && source[1] == T('\0')) {
-			source_list_contents = stdin_get_text_contents(&source_list_nchars);
-		} else {
-			source_list_contents = file_get_text_contents(source,
-								      &source_list_nchars);
-		}
-		if (!source_list_contents)
+		if (wimlib_load_text_file(source, &source_list_contents,
+					  &source_list_nchars) != 0)
 			goto out_err;
 
 		capture_sources = parse_source_list(&source_list_contents,
@@ -4300,8 +4149,8 @@ imagex_update(int argc, tchar **argv, int cmd)
 			tputs(T("Reading update commands from standard input..."));
 			recommend_man_page(CMD_UPDATE, stdout);
 		}
-		cmd_file_contents = stdin_get_text_contents(&cmd_file_nchars);
-		if (!cmd_file_contents) {
+		if (wimlib_load_text_file(NULL, &cmd_file_contents,
+					  &cmd_file_nchars) != 0) {
 			ret = -1;
 			goto out_wimlib_free;
 		}
diff --git a/src/textfile.c b/src/textfile.c
index 42c99539..d41464cd 100644
--- a/src/textfile.c
+++ b/src/textfile.c
@@ -382,3 +382,23 @@ load_text_file(const tchar *path, const void *buf, size_t bufsize,
 	*mem_ret = tstr;
 	return 0;
 }
+
+/* API function documented in wimlib.h; a NULL or "-" path reads stdin. */
+WIMLIBAPI int
+wimlib_load_text_file(const tchar *path,
+		      tchar **tstr_ret, size_t *tstr_nchars_ret)
+{
+	void *buf;		/* raw data from the file or stdin */
+	size_t bufsize;		/* size of the data in 'buf' */
+	int ret;
+
+	if (path == NULL || (path[0] == '-' && path[1] == '\0'))
+		ret = stdin_get_contents(&buf, &bufsize);
+	else
+		ret = read_file_contents(path, &buf, &bufsize);
+	if (ret)
+		return ret;
+	ret = translate_text_buffer(buf, bufsize, tstr_ret, tstr_nchars_ret);
+	FREE(buf);	/* released on success and on failure */
+	return ret;
+}
diff --git a/tests/test-imagex-update_and_extract b/tests/test-imagex-update_and_extract
index 6c877166..721954df 100755
--- a/tests/test-imagex-update_and_extract
+++ b/tests/test-imagex-update_and_extract
@@ -119,6 +119,26 @@ do_apply
 ../tree-cmp file out.dir/newname
 [ ! -e out.dir/file ]
 
+prepare_empty_wim
+msg "Testing UTF-16LE-NOBOM command update file"
+echo -ne 'a\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \
+	| wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
+prepare_empty_wim
+msg "Testing UTF-16LE-BOM command update file"
+echo -ne '\xff\xfea\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \
+	| wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
+prepare_empty_wim
+msg "Testing UTF-8-BOM command update file"
+echo -ne '\xef\xbb\xbfadd file /file' | wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
 prepare_empty_wim
 msg "Testing adding, then renaming file in WIM image in one command"
 wimupdate test.wim << EOF
-- 
2.43.0