From 75e5efe798c0d0e83c709ff3a802fa48096fc776 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 8 Jul 2023 12:12:08 -0700 Subject: [PATCH] Improve encoding detection of wimupdate command files Make wimupdate command files (read from stdin) and wimcapture source list files be interpreted as UTF-8 or UTF-16LE (autodetected). Previously these files were interpreted as the Windows codepage or UTF-16LE on Windows (autodetected), and UTF-8 on non-Windows. This makes these files be consistent with wimextract pathlist files and wimcapture config files, which already had the behavior of "autodetect UTF-8 or UTF-16LE on all platforms". Resolves https://wimlib.net/forums/viewtopic.php?p=1545 --- doc/man1/wimupdate.1 | 9 +- include/wimlib.h | 18 +++ programs/imagex-win32.c | 46 -------- programs/imagex-win32.h | 5 - programs/imagex.c | 159 +-------------------------- src/textfile.c | 20 ++++ tests/test-imagex-update_and_extract | 20 ++++ 7 files changed, 67 insertions(+), 210 deletions(-) diff --git a/doc/man1/wimupdate.1 b/doc/man1/wimupdate.1 index 40f5268b..f5e7c77d 100644 --- a/doc/man1/wimupdate.1 +++ b/doc/man1/wimupdate.1 @@ -14,10 +14,11 @@ contains only one image. You can use \fBwiminfo\fR(1) to list the images contained in \fIWIMFILE\fR. .PP The modifications to perform on the WIM image are specified as a sequence of -commands, one per line, read in a text file from standard input. It is -recommended that standard input be redirected from a file (\fICMDFILE\fR), as -shown above, rather than typing in commands interactively. Alternatively, to -specify a command directly on the command line, see the \fB--command\fR option. +commands, one per line, read in a text file (UTF-8 or UTF-16LE encoded; plain +ASCII is also fine) from standard input. It is recommended that standard input +be redirected from a file (\fICMDFILE\fR), as shown above, rather than typing in +commands interactively. 
Alternatively, to specify a command directly on the +command line, see the \fB--command\fR option. .SH AVAILABLE COMMANDS This section documents the commands that may appear in the \fICMDFILE\fR described above. diff --git a/include/wimlib.h b/include/wimlib.h index ef08e96e..54d6cb58 100644 --- a/include/wimlib.h +++ b/include/wimlib.h @@ -3564,6 +3564,24 @@ wimlib_join_with_progress(const wimlib_tchar * const *swms, wimlib_progress_func_t progfunc, void *progctx); +/** + * @ingroup G_general + * + * Load a UTF-8 or UTF-16LE encoded text file into memory. + * + * @param path + * The path to the file, or NULL or "-" to use standard input. + * @param tstr_ret + * On success, a buffer containing the file's text as a "wimlib_tchar" + * string is returned here. The buffer must be freed using free(). + * @param tstr_nchars_ret + * On success, the length of the text in "wimlib_tchar"s is returned here. + * + * @return 0 on success; a ::wimlib_error_code value on failure. + */ +WIMLIBAPI int +wimlib_load_text_file(const wimlib_tchar *path, + wimlib_tchar **tstr_ret, size_t *tstr_nchars_ret); /** * @ingroup G_mounting_wim_images diff --git a/programs/imagex-win32.c b/programs/imagex-win32.c index b72ae354..01507f0e 100644 --- a/programs/imagex-win32.c +++ b/programs/imagex-win32.c @@ -10,52 +10,6 @@ #include #include -/* Convert a string from the "current Windows codepage" to UTF-16LE. 
*/ -wchar_t * -win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret) -{ - if (mbs_nbytes > INT_MAX) { - fwprintf(stderr, L"ERROR: too much data (%zu bytes)!\n", - mbs_nbytes); - return NULL; - } - if (mbs_nbytes == 0) { - *num_wchars_ret = 0; - return (wchar_t*)mbs; - } - int len = MultiByteToWideChar(CP_ACP, - MB_ERR_INVALID_CHARS, - mbs, - mbs_nbytes, - NULL, - 0); - if (len <= 0) - goto out_invalid; - wchar_t *wcs = malloc(len * sizeof(wchar_t)); - if (!wcs) { - fwprintf(stderr, L"ERROR: out of memory!\n"); - return NULL; - } - int len2 = MultiByteToWideChar(CP_ACP, - MB_ERR_INVALID_CHARS, - mbs, - mbs_nbytes, - wcs, - len); - if (len2 != len) { - free(wcs); - goto out_invalid; - } - *num_wchars_ret = len; - return wcs; -out_invalid: - fwprintf(stderr, -L"ERROR: Invalid multi-byte string in the text file you provided as input!\n" -L" Maybe try converting your text file to UTF-16LE?\n" - ); - return NULL; -} - /* Set a file descriptor to binary mode. */ void set_fd_to_binary_mode(int fd) { diff --git a/programs/imagex-win32.h b/programs/imagex-win32.h index 2a694c0e..f7a751bd 100644 --- a/programs/imagex-win32.h +++ b/programs/imagex-win32.h @@ -2,12 +2,7 @@ #define _IMAGEX_WIN32_H #include -#include #include -#include - -wchar_t * -win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret); void win32_print_security_descriptor(const uint8_t *sd, size_t size); diff --git a/programs/imagex.c b/programs/imagex.c index f053cd74..54e50bfc 100644 --- a/programs/imagex.c +++ b/programs/imagex.c @@ -968,152 +968,6 @@ parse_source_list(tchar **source_list_contents_p, size_t source_list_nchars, return sources; } -/* Reads the contents of a file into memory. 
*/ -static char * -file_get_contents(const tchar *filename, size_t *len_ret) -{ - struct stat stbuf; - void *buf = NULL; - size_t len; - FILE *fp; - - if (tstat(filename, &stbuf) != 0) { - imagex_error_with_errno(T("Failed to stat the file \"%"TS"\""), filename); - goto out; - } - len = stbuf.st_size; - - fp = tfopen(filename, T("rb")); - if (!fp) { - imagex_error_with_errno(T("Failed to open the file \"%"TS"\""), filename); - goto out; - } - - buf = malloc(len ? len : 1); - if (!buf) { - imagex_error(T("Failed to allocate buffer of %zu bytes to hold " - "contents of file \"%"TS"\""), len, filename); - goto out_fclose; - } - if (fread(buf, 1, len, fp) != len) { - imagex_error_with_errno(T("Failed to read %zu bytes from the " - "file \"%"TS"\""), len, filename); - goto out_free_buf; - } - *len_ret = len; - goto out_fclose; -out_free_buf: - free(buf); - buf = NULL; -out_fclose: - fclose(fp); -out: - return buf; -} - -/* Read standard input until EOF and return the full contents in a malloc()ed - * buffer and the number of bytes of data in @len_ret. Returns NULL on read - * error. 
*/ -static char * -stdin_get_contents(size_t *len_ret) -{ - /* stdin can, of course, be a pipe or other non-seekable file, so the - * total length of the data cannot be pre-determined */ - char *buf = NULL; - size_t newlen = 1024; - size_t pos = 0; - size_t inc = 1024; - for (;;) { - char *p = realloc(buf, newlen); - size_t bytes_read, bytes_to_read; - if (!p) { - imagex_error(T("out of memory while reading stdin")); - break; - } - buf = p; - bytes_to_read = newlen - pos; - bytes_read = fread(&buf[pos], 1, bytes_to_read, stdin); - pos += bytes_read; - if (bytes_read != bytes_to_read) { - if (feof(stdin)) { - *len_ret = pos; - return buf; - } else { - imagex_error_with_errno(T("error reading stdin")); - break; - } - } - newlen += inc; - inc *= 3; - inc /= 2; - } - free(buf); - return NULL; -} - - -static tchar * -translate_text_to_tstr(char *text, size_t num_bytes, size_t *num_tchars_ret) -{ -#ifndef _WIN32 - /* On non-Windows, assume an ASCII-compatible encoding, such as UTF-8. - * */ - *num_tchars_ret = num_bytes; - return text; -#else /* !_WIN32 */ - /* On Windows, translate the text to UTF-16LE */ - wchar_t *text_wstr; - size_t num_wchars; - - if (num_bytes >= 2 && - (((unsigned char)text[0] == 0xff && (unsigned char)text[1] == 0xfe) || - ((unsigned char)text[0] <= 0x7f && (unsigned char)text[1] == 0x00))) - { - /* File begins with 0xfeff, the BOM for UTF-16LE, or it begins - * with something that looks like an ASCII character encoded as - * a UTF-16LE code unit. Assume the file is encoded as - * UTF-16LE. This is not a 100% reliable check. */ - num_wchars = num_bytes / 2; - text_wstr = (wchar_t*)text; - } else { - /* File does not look like UTF-16LE. Assume it is encoded in - * the current Windows code page. I think these are always - * ASCII-compatible, so any so-called "plain-text" (ASCII) files - * should work as expected. 
*/ - text_wstr = win32_mbs_to_wcs(text, - num_bytes, - &num_wchars); - free(text); - } - *num_tchars_ret = num_wchars; - return text_wstr; -#endif /* _WIN32 */ -} - -static tchar * -file_get_text_contents(const tchar *filename, size_t *num_tchars_ret) -{ - char *contents; - size_t num_bytes; - - contents = file_get_contents(filename, &num_bytes); - if (!contents) - return NULL; - return translate_text_to_tstr(contents, num_bytes, num_tchars_ret); -} - -static tchar * -stdin_get_text_contents(size_t *num_tchars_ret) -{ - char *contents; - size_t num_bytes; - - contents = stdin_get_contents(&num_bytes); - if (!contents) - return NULL; - return translate_text_to_tstr(contents, num_bytes, num_tchars_ret); -} - #define TO_PERCENT(numerator, denominator) \ (((denominator) == 0) ? 0 : ((numerator) * 100 / (denominator))) @@ -2202,13 +2056,8 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd) if (source_list) { /* Set up capture sources in source list mode */ - if (source[0] == T('-') && source[1] == T('\0')) { - source_list_contents = stdin_get_text_contents(&source_list_nchars); - } else { - source_list_contents = file_get_text_contents(source, - &source_list_nchars); - } - if (!source_list_contents) + if (wimlib_load_text_file(source, &source_list_contents, + &source_list_nchars) != 0) goto out_err; capture_sources = parse_source_list(&source_list_contents, @@ -4300,8 +4149,8 @@ imagex_update(int argc, tchar **argv, int cmd) tputs(T("Reading update commands from standard input...")); recommend_man_page(CMD_UPDATE, stdout); } - cmd_file_contents = stdin_get_text_contents(&cmd_file_nchars); - if (!cmd_file_contents) { + if (wimlib_load_text_file(NULL, &cmd_file_contents, + &cmd_file_nchars) != 0) { ret = -1; goto out_wimlib_free; } diff --git a/src/textfile.c b/src/textfile.c index 42c99539..d41464cd 100644 --- a/src/textfile.c +++ b/src/textfile.c @@ -382,3 +382,23 @@ load_text_file(const tchar *path, const void *buf, size_t bufsize, *mem_ret = tstr; return 0; } 
+ +/* API function documented in wimlib.h. Reads the raw bytes of the named file, or of standard input, and converts them to a tchar string; the status of the first step that fails is returned. */ +WIMLIBAPI int +wimlib_load_text_file(const tchar *path, + tchar **tstr_ret, size_t *tstr_nchars_ret) +{ + void *buf; /* raw, untranslated input bytes */ + size_t bufsize; /* filled in by the reader alongside buf */ + int ret; + + if (path == NULL || (path[0] == '-' && path[1] == '\0')) /* NULL or a lone dash selects standard input */ + ret = stdin_get_contents(&buf, &bufsize); + else + ret = read_file_contents(path, &buf, &bufsize); + if (ret) + return ret; /* read failed: pass the reader's status through */ + ret = translate_text_buffer(buf, bufsize, tstr_ret, tstr_nchars_ret); /* presumably where the UTF-8 / UTF-16LE detection happens; helper not visible here */ + FREE(buf); /* raw bytes are released whether or not translation succeeded */ + return ret; +} diff --git a/tests/test-imagex-update_and_extract b/tests/test-imagex-update_and_extract index 6c877166..721954df 100755 --- a/tests/test-imagex-update_and_extract +++ b/tests/test-imagex-update_and_extract @@ -119,6 +119,26 @@ do_apply ../tree-cmp file out.dir/newname [ ! -e out.dir/file ] +prepare_empty_wim +msg "Testing UTF-16LE-NOBOM command update file" +echo -ne 'a\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \ + | wimupdate test.wim +do_apply +../tree-cmp file out.dir/file + +prepare_empty_wim +msg "Testing UTF-16LE-BOM command update file" +echo -ne '\xff\xfea\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \ + | wimupdate test.wim +do_apply +../tree-cmp file out.dir/file + +prepare_empty_wim +msg "Testing UTF-8-BOM command update file" +echo -ne '\xef\xbb\xbfadd file /file' | wimupdate test.wim +do_apply +../tree-cmp file out.dir/file + prepare_empty_wim msg "Testing adding, then renaming file in WIM image in one command" wimupdate test.wim << EOF -- 2.43.0