Improve encoding detection of wimupdate command files

author Eric Biggers <ebiggers3@gmail.com>
Sat, 8 Jul 2023 19:12:08 +0000 (12:12 -0700)

committer Eric Biggers <ebiggers3@gmail.com>
Sat, 8 Jul 2023 19:12:08 +0000 (12:12 -0700)

diff --git a/doc/man1/wimupdate.1 b/doc/man1/wimupdate.1

index 40f5268b301753cfbe8befbf269428386e23596a..f5e7c77d7bec20a92108a7c6c101526a746c3258 100644 (file)
--- a/doc/man1/wimupdate.1
+++ b/doc/man1/wimupdate.1
@@ -14,10 +14,11 @@ contains only one image.  You can use \fBwiminfo\fR(1) to list the images
  contained in \fIWIMFILE\fR.
  .PP
  The modifications to perform on the WIM image are specified as a sequence of
-commands, one per line, read in a text file from standard input.  It is
-recommended that standard input be redirected from a file (\fICMDFILE\fR), as
-shown above, rather than typing in commands interactively.  Alternatively, to
-specify a command directly on the command line, see the \fB--command\fR option.
+commands, one per line, read in a text file (UTF-8 or UTF-16LE encoded; plain
+ASCII is also fine) from standard input.  It is recommended that standard input
+be redirected from a file (\fICMDFILE\fR), as shown above, rather than typing in
+commands interactively.  Alternatively, to specify a command directly on the
+command line, see the \fB--command\fR option.
  .SH AVAILABLE COMMANDS
  This section documents the commands that may appear in the \fICMDFILE\fR
  described above.
diff --git a/include/wimlib.h b/include/wimlib.h

index ef08e96e5ac1952e5ad6fce0f53973bce7362bf0..54d6cb5832763afa73d36416a3a88c061d89428e 100644 (file)
--- a/include/wimlib.h
+++ b/include/wimlib.h
@@ -3564,6 +3564,24 @@ wimlib_join_with_progress(const wimlib_tchar * const *swms,
                           wimlib_progress_func_t progfunc,
                           void *progctx);
  
+/**
+ * @ingroup G_general
+ *
+ * Load a UTF-8 or UTF-16LE encoded text file into memory.
+ *
+ * @param path
+ *     The path to the file, or NULL or "-" to use standard input.
+ * @param tstr_ret
+ *     On success, a buffer containing the file's text as a "wimlib_tchar"
+ *     string is returned here.  The buffer must be freed using free().
+ * @param tstr_nchars_ret
+ *     On success, the length of the text in "wimlib_tchar"s is returned here.
+ *
+ * @return 0 on success; a ::wimlib_error_code value on failure.
+ */
+WIMLIBAPI int
+wimlib_load_text_file(const wimlib_tchar *path,
+                     wimlib_tchar **tstr_ret, size_t *tstr_nchars_ret);
  
  /**
   * @ingroup G_mounting_wim_images
diff --git a/programs/imagex-win32.c b/programs/imagex-win32.c

index b72ae354f7401ef1124ef65b09be629f9ac99c12..01507f0ea82099fb2555927459b38aa89f3fb1d8 100644 (file)
--- a/programs/imagex-win32.c
+++ b/programs/imagex-win32.c
@@ -10,52 +10,6 @@
  #include <stdio.h>
  #include <windows.h>
  
-/* Convert a string from the "current Windows codepage" to UTF-16LE.  */
-wchar_t *
-win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret)
-{
-       if (mbs_nbytes > INT_MAX) {
-               fwprintf(stderr, L"ERROR: too much data (%zu bytes)!\n",
-                        mbs_nbytes);
-               return NULL;
-       }
-       if (mbs_nbytes == 0) {
-               *num_wchars_ret = 0;
-               return (wchar_t*)mbs;
-       }
-       int len = MultiByteToWideChar(CP_ACP,
-                                     MB_ERR_INVALID_CHARS,
-                                     mbs,
-                                     mbs_nbytes,
-                                     NULL,
-                                     0);
-       if (len <= 0)
-               goto out_invalid;
-       wchar_t *wcs = malloc(len * sizeof(wchar_t));
-       if (!wcs) {
-               fwprintf(stderr, L"ERROR: out of memory!\n");
-               return NULL;
-       }
-       int len2 = MultiByteToWideChar(CP_ACP,
-                                      MB_ERR_INVALID_CHARS,
-                                      mbs,
-                                      mbs_nbytes,
-                                      wcs,
-                                      len);
-       if (len2 != len) {
-               free(wcs);
-               goto out_invalid;
-       }
-       *num_wchars_ret = len;
-       return wcs;
-out_invalid:
-       fwprintf(stderr,
-L"ERROR: Invalid multi-byte string in the text file you provided as input!\n"
-L"       Maybe try converting your text file to UTF-16LE?\n"
-       );
-       return NULL;
-}
-
  /* Set a file descriptor to binary mode.  */
  void set_fd_to_binary_mode(int fd)
  {
diff --git a/programs/imagex-win32.h b/programs/imagex-win32.h

index 2a694c0ede91bbbdec4cd0853e2604973eaca6f3..f7a751bd0256460754484ff68febc89927cb0358 100644 (file)
--- a/programs/imagex-win32.h
+++ b/programs/imagex-win32.h
@@ -2,12 +2,7 @@
  #define _IMAGEX_WIN32_H
  
  #include <stddef.h>
-#include <stdbool.h>
  #include <inttypes.h>
-#include <wchar.h>
-
-wchar_t *
-win32_mbs_to_wcs(const char *mbs, size_t mbs_nbytes, size_t *num_wchars_ret);
  
  void
  win32_print_security_descriptor(const uint8_t *sd, size_t size);
diff --git a/programs/imagex.c b/programs/imagex.c

index f053cd74bcddd7019c9541c98c48596e903cc67d..54e50bfcdd06fe68015b8a23c2ac81c36be80097 100644 (file)
--- a/programs/imagex.c
+++ b/programs/imagex.c
@@ -968,152 +968,6 @@ parse_source_list(tchar **source_list_contents_p, size_t source_list_nchars,
         return sources;
  }
  
-/* Reads the contents of a file into memory. */
-static char *
-file_get_contents(const tchar *filename, size_t *len_ret)
-{
-       struct stat stbuf;
-       void *buf = NULL;
-       size_t len;
-       FILE *fp;
-
-       if (tstat(filename, &stbuf) != 0) {
-               imagex_error_with_errno(T("Failed to stat the file \"%"TS"\""), filename);
-               goto out;
-       }
-       len = stbuf.st_size;
-
-       fp = tfopen(filename, T("rb"));
-       if (!fp) {
-               imagex_error_with_errno(T("Failed to open the file \"%"TS"\""), filename);
-               goto out;
-       }
-
-       buf = malloc(len ? len : 1);
-       if (!buf) {
-               imagex_error(T("Failed to allocate buffer of %zu bytes to hold "
-                              "contents of file \"%"TS"\""), len, filename);
-               goto out_fclose;
-       }
-       if (fread(buf, 1, len, fp) != len) {
-               imagex_error_with_errno(T("Failed to read %zu bytes from the "
-                                         "file \"%"TS"\""), len, filename);
-               goto out_free_buf;
-       }
-       *len_ret = len;
-       goto out_fclose;
-out_free_buf:
-       free(buf);
-       buf = NULL;
-out_fclose:
-       fclose(fp);
-out:
-       return buf;
-}
-
-/* Read standard input until EOF and return the full contents in a malloc()ed
- * buffer and the number of bytes of data in @len_ret.  Returns NULL on read
- * error. */
-static char *
-stdin_get_contents(size_t *len_ret)
-{
-       /* stdin can, of course, be a pipe or other non-seekable file, so the
-        * total length of the data cannot be pre-determined */
-       char *buf = NULL;
-       size_t newlen = 1024;
-       size_t pos = 0;
-       size_t inc = 1024;
-       for (;;) {
-               char *p = realloc(buf, newlen);
-               size_t bytes_read, bytes_to_read;
-               if (!p) {
-                       imagex_error(T("out of memory while reading stdin"));
-                       break;
-               }
-               buf = p;
-               bytes_to_read = newlen - pos;
-               bytes_read = fread(&buf[pos], 1, bytes_to_read, stdin);
-               pos += bytes_read;
-               if (bytes_read != bytes_to_read) {
-                       if (feof(stdin)) {
-                               *len_ret = pos;
-                               return buf;
-                       } else {
-                               imagex_error_with_errno(T("error reading stdin"));
-                               break;
-                       }
-               }
-               newlen += inc;
-               inc *= 3;
-               inc /= 2;
-       }
-       free(buf);
-       return NULL;
-}
-
-
-static tchar *
-translate_text_to_tstr(char *text, size_t num_bytes, size_t *num_tchars_ret)
-{
-#ifndef _WIN32
-       /* On non-Windows, assume an ASCII-compatible encoding, such as UTF-8.
-        * */
-       *num_tchars_ret = num_bytes;
-       return text;
-#else /* !_WIN32 */
-       /* On Windows, translate the text to UTF-16LE */
-       wchar_t *text_wstr;
-       size_t num_wchars;
-
-       if (num_bytes >= 2 &&
-           (((unsigned char)text[0] == 0xff && (unsigned char)text[1] == 0xfe) ||
-            ((unsigned char)text[0] <= 0x7f && (unsigned char)text[1] == 0x00)))
-       {
-               /* File begins with 0xfeff, the BOM for UTF-16LE, or it begins
-                * with something that looks like an ASCII character encoded as
-                * a UTF-16LE code unit.  Assume the file is encoded as
-                * UTF-16LE.  This is not a 100% reliable check. */
-               num_wchars = num_bytes / 2;
-               text_wstr = (wchar_t*)text;
-       } else {
-               /* File does not look like UTF-16LE.  Assume it is encoded in
-                * the current Windows code page.  I think these are always
-                * ASCII-compatible, so any so-called "plain-text" (ASCII) files
-                * should work as expected. */
-               text_wstr = win32_mbs_to_wcs(text,
-                                            num_bytes,
-                                            &num_wchars);
-               free(text);
-       }
-       *num_tchars_ret = num_wchars;
-       return text_wstr;
-#endif /* _WIN32 */
-}
-
-static tchar *
-file_get_text_contents(const tchar *filename, size_t *num_tchars_ret)
-{
-       char *contents;
-       size_t num_bytes;
-
-       contents = file_get_contents(filename, &num_bytes);
-       if (!contents)
-               return NULL;
-       return translate_text_to_tstr(contents, num_bytes, num_tchars_ret);
-}
-
-static tchar *
-stdin_get_text_contents(size_t *num_tchars_ret)
-{
-       char *contents;
-       size_t num_bytes;
-
-       contents = stdin_get_contents(&num_bytes);
-       if (!contents)
-               return NULL;
-       return translate_text_to_tstr(contents, num_bytes, num_tchars_ret);
-}
-
  #define TO_PERCENT(numerator, denominator) \
         (((denominator) == 0) ? 0 : ((numerator) * 100 / (denominator)))
  
@@ -2202,13 +2056,8 @@ imagex_capture_or_append(int argc, tchar **argv, int cmd)
  
         if (source_list) {
                 /* Set up capture sources in source list mode */
-               if (source[0] == T('-') && source[1] == T('\0')) {
-                       source_list_contents = stdin_get_text_contents(&source_list_nchars);
-               } else {
-                       source_list_contents = file_get_text_contents(source,
-                                                                     &source_list_nchars);
-               }
-               if (!source_list_contents)
+               if (wimlib_load_text_file(source, &source_list_contents,
+                                         &source_list_nchars) != 0)
                         goto out_err;
  
                 capture_sources = parse_source_list(&source_list_contents,
@@ -4300,8 +4149,8 @@ imagex_update(int argc, tchar **argv, int cmd)
                         tputs(T("Reading update commands from standard input..."));
                         recommend_man_page(CMD_UPDATE, stdout);
                 }
-               cmd_file_contents = stdin_get_text_contents(&cmd_file_nchars);
-               if (!cmd_file_contents) {
+               if (wimlib_load_text_file(NULL, &cmd_file_contents,
+                                         &cmd_file_nchars) != 0) {
                         ret = -1;
                         goto out_wimlib_free;
                 }
diff --git a/src/textfile.c b/src/textfile.c

index 42c995396381fd20a40623a31d434582ce4782df..d41464cd8cbd17016cdb2b22e9cf2539e9a13623 100644 (file)
--- a/src/textfile.c
+++ b/src/textfile.c
@@ -382,3 +382,23 @@ load_text_file(const tchar *path, const void *buf, size_t bufsize,
         *mem_ret = tstr;
         return 0;
  }
+
+/* API function documented in wimlib.h */
+WIMLIBAPI int
+wimlib_load_text_file(const tchar *path,
+                     tchar **tstr_ret, size_t *tstr_nchars_ret)
+{
+       void *buf;
+       size_t bufsize;
+       int ret;
+
+       if (path == NULL || (path[0] == '-' && path[1] == '\0'))
+               ret = stdin_get_contents(&buf, &bufsize);
+       else
+               ret = read_file_contents(path, &buf, &bufsize);
+       if (ret)
+               return ret;
+       ret = translate_text_buffer(buf, bufsize, tstr_ret, tstr_nchars_ret);
+       FREE(buf);
+       return ret;
+}
diff --git a/tests/test-imagex-update_and_extract b/tests/test-imagex-update_and_extract

index 6c8771669c8c57c3b8fde78f74c1c19aec5240b0..721954df10584d1fd057af21505516431e8d8186 100755 (executable)
--- a/tests/test-imagex-update_and_extract
+++ b/tests/test-imagex-update_and_extract
@@ -119,6 +119,26 @@ do_apply
  ../tree-cmp file out.dir/newname
  [ ! -e out.dir/file ]
  
+prepare_empty_wim
+msg "Testing UTF-16LE-NOBOM command update file"
+echo -ne 'a\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \
+       | wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
+prepare_empty_wim
+msg "Testing UTF-16LE-BOM command update file"
+echo -ne '\xff\xfea\0d\0d\0 \0f\0i\0l\0e\0 \0/\0f\0i\0l\0e\0\n\0' \
+       | wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
+prepare_empty_wim
+msg "Testing UTF-8-BOM command update file"
+echo -ne '\xef\xbb\xbfadd file /file' | wimupdate test.wim
+do_apply
+../tree-cmp file out.dir/file
+
  prepare_empty_wim
  msg "Testing adding, then renaming file in WIM image in one command"
  wimupdate test.wim << EOF
author	Eric Biggers <ebiggers3@gmail.com>
	Sat, 8 Jul 2023 19:12:08 +0000 (12:12 -0700)
committer	Eric Biggers <ebiggers3@gmail.com>
	Sat, 8 Jul 2023 19:12:08 +0000 (12:12 -0700)
doc/man1/wimupdate.1		patch | blob | history
include/wimlib.h		patch | blob | history
programs/imagex-win32.c		patch | blob | history
programs/imagex-win32.h		patch | blob | history
programs/imagex.c		patch | blob | history
src/textfile.c		patch | blob | history
tests/test-imagex-update_and_extract		patch | blob | history