From e303f13530913d434239e9c1b4b9c2b3868a443c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 11 Jun 2016 13:28:18 -0500 Subject: [PATCH] Rewrite string encoding conversion code - Drop support for multibyte encodings other than UTF-8. It is probably not worth the effort to support such encodings. Interestingly, the support was entirely broken before v1.9.1, yet no one ever complained... - Implement UTF-8 and UTF-16LE codecs and drop the iconv requirement. This provides better performance and actually results in *fewer* lines of code and a slightly smaller binary. --- .gitignore | 1 - Makefile.am | 1 - configure.ac | 10 - doc/man1/wimlib-imagex.1 | 11 - include/wimlib.h | 17 +- include/wimlib/encoding.h | 129 ++++----- include/wimlib/ntfs_3g.h | 3 - include/wimlib_tchar.h | 3 +- m4/iconv.m4 | 268 ------------------ programs/imagex.c | 25 -- src/dentry.c | 3 +- src/encoding.c | 571 +++++++++++++------------------------- src/error.c | 9 +- src/iterate_dir.c | 8 +- src/mount_image.c | 2 +- src/ntfs-3g_apply.c | 6 - src/registry.c | 3 +- src/reparse.c | 2 +- src/wim.c | 29 +- src/xml.c | 34 ++- tools/windeps/Makefile | 36 --- tools/windeps/sha256sums | 1 - 22 files changed, 292 insertions(+), 880 deletions(-) delete mode 100644 m4/iconv.m4 diff --git a/.gitignore b/.gitignore index 98944191..55644400 100644 --- a/.gitignore +++ b/.gitignore @@ -48,7 +48,6 @@ /tools/windeps/libxml2* /tools/windeps/mingw* /tools/windeps/sysroot_* -/tools/windeps/win-iconv* /tools/windeps/winpthreads* /wimlib-*-bin/ /wimlib-*.tar diff --git a/Makefile.am b/Makefile.am index ff5a29bb..6754144b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -204,7 +204,6 @@ libwim_la_LDFLAGS = $(AM_LDFLAGS) -version-info 26:0:11 libwim_la_LIBADD = \ $(PTHREAD_LIBS) \ $(LIBXML2_LIBS) \ - $(LTLIBICONV) \ $(LIBNTFS_3G_LIBS) \ $(LIBFUSE_LIBS) \ $(LIBRT_LIBS) \ diff --git a/configure.ac b/configure.ac index c6aed057..b978c03b 100644 --- a/configure.ac +++ b/configure.ac @@ -102,16 +102,6 @@ AX_PTHREAD([], 
[AC_MSG_ERROR(["cannot find pthreads library"])]) PKG_CHECK_MODULES([LIBXML2], [libxml-2.0]) PKGCONFIG_PRIVATE_REQUIRES="$PKGCONFIG_PRIVATE_REQUIRES libxml-2.0" -# ------------------------------ libiconv ------------------------------------- -AM_ICONV -if test "$am_cv_func_iconv" != "yes"; then - AC_MSG_ERROR([Cannot find the iconv() function. iconv() is used to - convert between encodings of WIM filenames and XML data. - wimlib cannot be compiled without it. iconv() is - available in the latest version of glibc and sometimes in - other libraries.]) -fi - ############################################################################### # Configuration options # ############################################################################### diff --git a/doc/man1/wimlib-imagex.1 b/doc/man1/wimlib-imagex.1 index 3fe685af..9ec74282 100644 --- a/doc/man1/wimlib-imagex.1 +++ b/doc/man1/wimlib-imagex.1 @@ -200,17 +200,6 @@ driver (WOF). With the \fB--wimboot\fR option, \fBwimapply\fR will extract VSS snapshot support. On Windows, \fBwimcapture\fR or \fBwimappend\fR with the \fB--snapshot\fR option will automatically create a temporary VSS snapshot and capture the image from it. This can be used to image a "live" Windows system. -.SH LOCALES AND CHARACTER ENCODINGS -WIM files themselves store file and stream names using Windows native "wide -character strings", which are UTF-16. On Windows, wimlib works using these same -strings, so conversions are usually not necessary and there should be no -problems with character encodings. -.PP -On UNIX-like systems, wimlib works primarily in the locale-dependent multibyte -encoding, which you are strongly recommended to set to UTF-8 to avoid any -problems. You can alternatively set the environmental variable -\fBWIMLIB_IMAGEX_USE_UTF8\fR to force \fBwimlib-imagex\fR to use UTF-8 -internally, even if the current locale is not UTF-8 compatible. 
.SH CASE SENSITIVITY By default, the case sensitivity of \fBwimlib-imagex\fR differs somewhat between UNIX-like systems and Windows. WIM images may (but usually do not) have diff --git a/include/wimlib.h b/include/wimlib.h index 7254a126..7a6906e7 100644 --- a/include/wimlib.h +++ b/include/wimlib.h @@ -151,10 +151,7 @@ * and the encoding is UTF-16LE. * * On UNIX-like systems, each ::wimlib_tchar is 1 byte and is simply a "char", - * and the encoding is the locale-dependent multibyte encoding. I recommend you - * set your locale to a UTF-8 capable locale to avoid any issues. Also, by - * default, wimlib on UNIX will assume the locale is UTF-8 capable unless you - * call wimlib_global_init() after having set your desired locale. + * and the encoding is UTF-8. * * @section sec_advanced Additional information and features * @@ -2323,9 +2320,8 @@ typedef int (*wimlib_iterate_lookup_table_callback_t)(const struct wimlib_resour /** @addtogroup G_general * @{ */ -/** Assume that strings are represented in UTF-8, even if this is not the - * locale's character encoding. This flag is ignored on Windows, where wimlib - * always uses UTF-16LE. */ +/** Deprecated; no longer has any effect. The library now always assumes UTF-8 + * encoding on non-Windows systems. 
*/ #define WIMLIB_INIT_FLAG_ASSUME_UTF8 0x00000001 /** Windows-only: do not attempt to acquire additional privileges (currently @@ -2462,7 +2458,6 @@ enum wimlib_error_code { WIMLIB_ERR_DECOMPRESSION = 2, WIMLIB_ERR_FUSE = 6, WIMLIB_ERR_GLOB_HAD_NO_MATCHES = 8, - WIMLIB_ERR_ICONV_NOT_AVAILABLE = 9, WIMLIB_ERR_IMAGE_COUNT = 10, WIMLIB_ERR_IMAGE_NAME_COLLISION = 11, WIMLIB_ERR_INSUFFICIENT_PRIVILEGES = 12, @@ -2475,7 +2470,6 @@ enum wimlib_error_code { WIMLIB_ERR_INVALID_INTEGRITY_TABLE = 19, WIMLIB_ERR_INVALID_LOOKUP_TABLE_ENTRY = 20, WIMLIB_ERR_INVALID_METADATA_RESOURCE = 21, - WIMLIB_ERR_INVALID_MULTIBYTE_STRING = 22, WIMLIB_ERR_INVALID_OVERLAY = 23, WIMLIB_ERR_INVALID_PARAM = 24, WIMLIB_ERR_INVALID_PART_NUMBER = 25, @@ -3278,9 +3272,8 @@ wimlib_get_xml_data(WIMStruct *wim, void **buf_ret, size_t *bufsize_ret); * * Initialization function for wimlib. Call before using any other wimlib * function (except possibly wimlib_set_print_errors()). If not done manually, - * this function will be called automatically with @p init_flags set to - * ::WIMLIB_INIT_FLAG_ASSUME_UTF8. This function does nothing if called again - * after it has already successfully run. + * this function will be called automatically with a flags argument of 0. This + * function does nothing if called again after it has already successfully run. * * @param init_flags * Bitwise OR of flags prefixed with WIMLIB_INIT_FLAG. 
diff --git a/include/wimlib/encoding.h b/include/wimlib/encoding.h index f40f77be..45760c7e 100644 --- a/include/wimlib/encoding.h +++ b/include/wimlib/encoding.h @@ -7,85 +7,49 @@ #include "wimlib/util.h" #include "wimlib/types.h" -extern void -iconv_global_init(void); - -extern void -iconv_global_cleanup(void); - -extern u16 upcase[65536]; - -extern void -init_upcase(void); - -extern bool wimlib_mbs_is_utf8; - -#define DECLARE_CHAR_CONVERSION_FUNCTIONS(varname1, varname2, \ - chartype1, chartype2) \ - \ -extern int \ -varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ - chartype2 **out_ret, \ - size_t *out_nbytes_ret); \ - \ -extern int \ -varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ - size_t *out_nbytes_ret); \ - \ -extern int \ -varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ - chartype2 *out); - -extern utf16lechar * -utf16le_dupz(const void *ustr, size_t usize); - -extern utf16lechar * -utf16le_dup(const utf16lechar *ustr); - -extern size_t -utf16le_len_bytes(const utf16lechar *s); - -extern size_t -utf16le_len_chars(const utf16lechar *s); +extern int +utf8_to_utf16le(const char *in, size_t in_nbytes, + utf16lechar **out_ret, size_t *out_nbytes_ret); -#if !TCHAR_IS_UTF16LE -DECLARE_CHAR_CONVERSION_FUNCTIONS(utf16le, tstr, utf16lechar, tchar); -DECLARE_CHAR_CONVERSION_FUNCTIONS(tstr, utf16le, tchar, utf16lechar); -#else +extern int +utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes, + char **out_ret, size_t *out_nbytes_ret); static inline int -tstr_to_utf16le(const tchar *tstr, size_t tsize, - utf16lechar **ustr_ret, size_t *usize_ret) +tstr_to_tstr(const tchar *in, size_t in_nbytes, + tchar **out_ret, size_t *out_nbytes_ret) { - utf16lechar *ustr = utf16le_dupz(tstr, tsize); - if (!ustr) + *out_ret = MALLOC(in_nbytes + sizeof(tchar)); + if (unlikely(!*out_ret)) return WIMLIB_ERR_NOMEM; - *ustr_ret = ustr; - *usize_ret = tsize; + memcpy(*out_ret, in, in_nbytes); + (*out_ret)[in_nbytes / 
sizeof(tchar)] = 0; + if (out_nbytes_ret) + *out_nbytes_ret = in_nbytes; return 0; } -#define utf16le_to_tstr tstr_to_utf16le +#if TCHAR_IS_UTF16LE -#endif +/* tstr(UTF-16LE) <=> UTF-16LE */ +# define tstr_to_utf16le tstr_to_tstr +# define utf16le_to_tstr tstr_to_tstr -DECLARE_CHAR_CONVERSION_FUNCTIONS(utf8, tstr, char, tchar); -DECLARE_CHAR_CONVERSION_FUNCTIONS(tstr, utf8, tchar, char); +/* tstr(UTF-16LE) <=> UTF-8 */ +# define tstr_to_utf8 utf16le_to_utf8 +# define utf8_to_tstr utf8_to_utf16le -extern int -utf8_to_tstr_simple(const char *utf8str, tchar **out); +#else -extern int -tstr_to_utf8_simple(const tchar *tstr, char **out); +/* tstr(UTF-8) <=> UTF-16LE */ +# define tstr_to_utf16le utf8_to_utf16le +# define utf16le_to_tstr utf16le_to_utf8 -extern int -cmp_utf16le_strings(const utf16lechar *s1, size_t n1, - const utf16lechar *s2, size_t n2, - bool ignore_case); +/* tstr(UTF-8) <=> UTF-8 */ +# define tstr_to_utf8 tstr_to_tstr +# define utf8_to_tstr tstr_to_tstr -extern int -cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2, - bool ignore_case); +#endif /* Convert a string in the platform-dependent encoding to UTF-16LE, but if both * encodings are UTF-16LE, simply re-use the string. 
Release with @@ -117,8 +81,7 @@ tstr_get_utf16le(const tchar *tstr, const utf16lechar **ustr_ret) return 0; #else size_t tsize = tstrlen(tstr) * sizeof(tchar); - size_t dummy; - return tstr_to_utf16le(tstr, tsize, (utf16lechar **)ustr_ret, &dummy); + return tstr_to_utf16le(tstr, tsize, (utf16lechar **)ustr_ret, NULL); #endif } @@ -142,7 +105,8 @@ utf16le_get_tstr(const utf16lechar *ustr, size_t usize, #if TCHAR_IS_UTF16LE /* No conversion or copy needed */ *tstr_ret = ustr; - *tsize_ret = usize; + if (tsize_ret) + *tsize_ret = usize; return 0; #else return utf16le_to_tstr(ustr, usize, (tchar **)tstr_ret, tsize_ret); @@ -158,4 +122,33 @@ utf16le_put_tstr(const tchar *tstr) #endif } + +/* UTF-16LE utility functions */ + +extern u16 upcase[65536]; + +extern void +init_upcase(void); + +extern int +cmp_utf16le_strings(const utf16lechar *s1, size_t n1, + const utf16lechar *s2, size_t n2, + bool ignore_case); + +extern int +cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2, + bool ignore_case); + +extern utf16lechar * +utf16le_dupz(const void *ustr, size_t usize); + +extern utf16lechar * +utf16le_dup(const utf16lechar *s); + +extern size_t +utf16le_len_bytes(const utf16lechar *s); + +extern size_t +utf16le_len_chars(const utf16lechar *s); + #endif /* _WIMLIB_ENCODING_H */ diff --git a/include/wimlib/ntfs_3g.h b/include/wimlib/ntfs_3g.h index 0ee9da26..0328348f 100644 --- a/include/wimlib/ntfs_3g.h +++ b/include/wimlib/ntfs_3g.h @@ -9,9 +9,6 @@ struct blob_descriptor; struct ntfs_location; struct read_blob_callbacks; -extern void -libntfs3g_global_init(void); - extern int read_ntfs_attribute_prefix(const struct blob_descriptor *blob, u64 size, const struct read_blob_callbacks *cbs); diff --git a/include/wimlib_tchar.h b/include/wimlib_tchar.h index 5a2038f0..a45adbfd 100644 --- a/include/wimlib_tchar.h +++ b/include/wimlib_tchar.h @@ -67,8 +67,7 @@ typedef wchar_t tchar; # define tglob win32_wglob #else /* __WIN32__ */ /* For non-Windows builds, the 
"tchar" type will be one byte and will specify a - * string in the locale-dependent multibyte encoding. However, only UTF-8 is - * well supported in this library. */ + * string in UTF-8. */ typedef char tchar; # define TCHAR_IS_UTF16LE 0 # define T(text) text /* In this case, strings of "tchar" are simply strings of diff --git a/m4/iconv.m4 b/m4/iconv.m4 deleted file mode 100644 index 6a47236c..00000000 --- a/m4/iconv.m4 +++ /dev/null @@ -1,268 +0,0 @@ -# iconv.m4 serial 18 (gettext-0.18.2) -dnl Copyright (C) 2000-2002, 2007-2012 Free Software Foundation, Inc. -dnl This file is free software; the Free Software Foundation -dnl gives unlimited permission to copy and/or distribute it, -dnl with or without modifications, as long as this notice is preserved. - -dnl From Bruno Haible. - -AC_DEFUN([AM_ICONV_LINKFLAGS_BODY], -[ - dnl Prerequisites of AC_LIB_LINKFLAGS_BODY. - AC_REQUIRE([AC_LIB_PREPARE_PREFIX]) - AC_REQUIRE([AC_LIB_RPATH]) - - dnl Search for libiconv and define LIBICONV, LTLIBICONV and INCICONV - dnl accordingly. - AC_LIB_LINKFLAGS_BODY([iconv]) -]) - -AC_DEFUN([AM_ICONV_LINK], -[ - dnl Some systems have iconv in libc, some have it in libiconv (OSF/1 and - dnl those with the standalone portable GNU libiconv installed). - AC_REQUIRE([AC_CANONICAL_HOST]) dnl for cross-compiles - - dnl Search for libiconv and define LIBICONV, LTLIBICONV and INCICONV - dnl accordingly. - AC_REQUIRE([AM_ICONV_LINKFLAGS_BODY]) - - dnl Add $INCICONV to CPPFLAGS before performing the following checks, - dnl because if the user has installed libiconv and not disabled its use - dnl via --without-libiconv-prefix, he wants to use it. The first - dnl AC_LINK_IFELSE will then fail, the second AC_LINK_IFELSE will succeed. 
- am_save_CPPFLAGS="$CPPFLAGS" - AC_LIB_APPENDTOVAR([CPPFLAGS], [$INCICONV]) - - AC_CACHE_CHECK([for iconv], [am_cv_func_iconv], [ - am_cv_func_iconv="no, consider installing GNU libiconv" - am_cv_lib_iconv=no - AC_LINK_IFELSE( - [AC_LANG_PROGRAM( - [[ -#include -#include - ]], - [[iconv_t cd = iconv_open("",""); - iconv(cd,NULL,NULL,NULL,NULL); - iconv_close(cd);]])], - [am_cv_func_iconv=yes]) - if test "$am_cv_func_iconv" != yes; then - am_save_LIBS="$LIBS" - LIBS="$LIBS $LIBICONV" - AC_LINK_IFELSE( - [AC_LANG_PROGRAM( - [[ -#include -#include - ]], - [[iconv_t cd = iconv_open("",""); - iconv(cd,NULL,NULL,NULL,NULL); - iconv_close(cd);]])], - [am_cv_lib_iconv=yes] - [am_cv_func_iconv=yes]) - LIBS="$am_save_LIBS" - fi - ]) - if test "$am_cv_func_iconv" = yes; then - AC_CACHE_CHECK([for working iconv], [am_cv_func_iconv_works], [ - dnl This tests against bugs in AIX 5.1, AIX 6.1..7.1, HP-UX 11.11, - dnl Solaris 10. - am_save_LIBS="$LIBS" - if test $am_cv_lib_iconv = yes; then - LIBS="$LIBS $LIBICONV" - fi - AC_RUN_IFELSE( - [AC_LANG_SOURCE([[ -#include -#include -int main () -{ - int result = 0; - /* Test against AIX 5.1 bug: Failures are not distinguishable from successful - returns. */ - { - iconv_t cd_utf8_to_88591 = iconv_open ("ISO8859-1", "UTF-8"); - if (cd_utf8_to_88591 != (iconv_t)(-1)) - { - static const char input[] = "\342\202\254"; /* EURO SIGN */ - char buf[10]; - const char *inptr = input; - size_t inbytesleft = strlen (input); - char *outptr = buf; - size_t outbytesleft = sizeof (buf); - size_t res = iconv (cd_utf8_to_88591, - (char **) &inptr, &inbytesleft, - &outptr, &outbytesleft); - if (res == 0) - result |= 1; - iconv_close (cd_utf8_to_88591); - } - } - /* Test against Solaris 10 bug: Failures are not distinguishable from - successful returns. 
*/ - { - iconv_t cd_ascii_to_88591 = iconv_open ("ISO8859-1", "646"); - if (cd_ascii_to_88591 != (iconv_t)(-1)) - { - static const char input[] = "\263"; - char buf[10]; - const char *inptr = input; - size_t inbytesleft = strlen (input); - char *outptr = buf; - size_t outbytesleft = sizeof (buf); - size_t res = iconv (cd_ascii_to_88591, - (char **) &inptr, &inbytesleft, - &outptr, &outbytesleft); - if (res == 0) - result |= 2; - iconv_close (cd_ascii_to_88591); - } - } - /* Test against AIX 6.1..7.1 bug: Buffer overrun. */ - { - iconv_t cd_88591_to_utf8 = iconv_open ("UTF-8", "ISO-8859-1"); - if (cd_88591_to_utf8 != (iconv_t)(-1)) - { - static const char input[] = "\304"; - static char buf[2] = { (char)0xDE, (char)0xAD }; - const char *inptr = input; - size_t inbytesleft = 1; - char *outptr = buf; - size_t outbytesleft = 1; - size_t res = iconv (cd_88591_to_utf8, - (char **) &inptr, &inbytesleft, - &outptr, &outbytesleft); - if (res != (size_t)(-1) || outptr - buf > 1 || buf[1] != (char)0xAD) - result |= 4; - iconv_close (cd_88591_to_utf8); - } - } -#if 0 /* This bug could be worked around by the caller. */ - /* Test against HP-UX 11.11 bug: Positive return value instead of 0. */ - { - iconv_t cd_88591_to_utf8 = iconv_open ("utf8", "iso88591"); - if (cd_88591_to_utf8 != (iconv_t)(-1)) - { - static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; - char buf[50]; - const char *inptr = input; - size_t inbytesleft = strlen (input); - char *outptr = buf; - size_t outbytesleft = sizeof (buf); - size_t res = iconv (cd_88591_to_utf8, - (char **) &inptr, &inbytesleft, - &outptr, &outbytesleft); - if ((int)res > 0) - result |= 8; - iconv_close (cd_88591_to_utf8); - } - } -#endif - /* Test against HP-UX 11.11 bug: No converter from EUC-JP to UTF-8 is - provided. */ - if (/* Try standardized names. */ - iconv_open ("UTF-8", "EUC-JP") == (iconv_t)(-1) - /* Try IRIX, OSF/1 names. */ - && iconv_open ("UTF-8", "eucJP") == (iconv_t)(-1) - /* Try AIX names. 
*/ - && iconv_open ("UTF-8", "IBM-eucJP") == (iconv_t)(-1) - /* Try HP-UX names. */ - && iconv_open ("utf8", "eucJP") == (iconv_t)(-1)) - result |= 16; - return result; -}]])], - [am_cv_func_iconv_works=yes], - [am_cv_func_iconv_works=no], - [ -changequote(,)dnl - case "$host_os" in - aix* | hpux*) am_cv_func_iconv_works="guessing no" ;; - *) am_cv_func_iconv_works="guessing yes" ;; - esac -changequote([,])dnl - ]) - LIBS="$am_save_LIBS" - ]) - case "$am_cv_func_iconv_works" in - *no) am_func_iconv=no am_cv_lib_iconv=no ;; - *) am_func_iconv=yes ;; - esac - else - am_func_iconv=no am_cv_lib_iconv=no - fi - if test "$am_func_iconv" = yes; then - AC_DEFINE([HAVE_ICONV], [1], - [Define if you have the iconv() function and it works.]) - fi - if test "$am_cv_lib_iconv" = yes; then - AC_MSG_CHECKING([how to link with libiconv]) - AC_MSG_RESULT([$LIBICONV]) - else - dnl If $LIBICONV didn't lead to a usable library, we don't need $INCICONV - dnl either. - CPPFLAGS="$am_save_CPPFLAGS" - LIBICONV= - LTLIBICONV= - fi - AC_SUBST([LIBICONV]) - AC_SUBST([LTLIBICONV]) -]) - -dnl Define AM_ICONV using AC_DEFUN_ONCE for Autoconf >= 2.64, in order to -dnl avoid warnings like -dnl "warning: AC_REQUIRE: `AM_ICONV' was expanded before it was required". -dnl This is tricky because of the way 'aclocal' is implemented: -dnl - It requires defining an auxiliary macro whose name ends in AC_DEFUN. -dnl Otherwise aclocal's initial scan pass would miss the macro definition. -dnl - It requires a line break inside the AC_DEFUN_ONCE and AC_DEFUN expansions. -dnl Otherwise aclocal would emit many "Use of uninitialized value $1" -dnl warnings. 
-m4_define([gl_iconv_AC_DEFUN], - m4_version_prereq([2.64], - [[AC_DEFUN_ONCE( - [$1], [$2])]], - [m4_ifdef([gl_00GNULIB], - [[AC_DEFUN_ONCE( - [$1], [$2])]], - [[AC_DEFUN( - [$1], [$2])]])])) -gl_iconv_AC_DEFUN([AM_ICONV], -[ - AM_ICONV_LINK - if test "$am_cv_func_iconv" = yes; then - AC_MSG_CHECKING([for iconv declaration]) - AC_CACHE_VAL([am_cv_proto_iconv], [ - AC_COMPILE_IFELSE( - [AC_LANG_PROGRAM( - [[ -#include -#include -extern -#ifdef __cplusplus -"C" -#endif -#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) -size_t iconv (iconv_t cd, char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft); -#else -size_t iconv(); -#endif - ]], - [[]])], - [am_cv_proto_iconv_arg1=""], - [am_cv_proto_iconv_arg1="const"]) - am_cv_proto_iconv="extern size_t iconv (iconv_t cd, $am_cv_proto_iconv_arg1 char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft);"]) - am_cv_proto_iconv=`echo "[$]am_cv_proto_iconv" | tr -s ' ' | sed -e 's/( /(/'` - AC_MSG_RESULT([ - $am_cv_proto_iconv]) - AC_DEFINE_UNQUOTED([ICONV_CONST], [$am_cv_proto_iconv_arg1], - [Define as const if the declaration of iconv() needs const.]) - dnl Also substitute ICONV_CONST in the gnulib generated . 
- m4_ifdef([gl_ICONV_H_DEFAULTS], - [AC_REQUIRE([gl_ICONV_H_DEFAULTS]) - if test -n "$am_cv_proto_iconv_arg1"; then - ICONV_CONST="const" - fi - ]) - fi -]) diff --git a/programs/imagex.c b/programs/imagex.c index 0f8c45f1..1eb8489b 100644 --- a/programs/imagex.c +++ b/programs/imagex.c @@ -4620,31 +4620,6 @@ main(int argc, tchar **argv) imagex_info_file = stdout; invocation_name = tbasename(argv[0]); -#ifndef __WIN32__ - if (getenv("WIMLIB_IMAGEX_USE_UTF8")) { - init_flags |= WIMLIB_INIT_FLAG_ASSUME_UTF8; - } else { - char *codeset; - - setlocale(LC_ALL, ""); - codeset = nl_langinfo(CODESET); - if (!strstr(codeset, "UTF-8") && - !strstr(codeset, "UTF8") && - !strstr(codeset, "utf-8") && - !strstr(codeset, "utf8")) - { - fprintf(stderr, -"WARNING: Running %"TS" in a UTF-8 locale is recommended!\n" -" Maybe try: `export LANG=en_US.UTF-8'?\n" -" Alternatively, set the environmental variable WIMLIB_IMAGEX_USE_UTF8\n" -" to any value to force wimlib to use UTF-8.\n", - invocation_name); - - } - } - -#endif /* !__WIN32__ */ - { tchar *igcase = tgetenv(T("WIMLIB_IMAGEX_IGNORE_CASE")); if (igcase != NULL) { diff --git a/src/dentry.c b/src/dentry.c index de3861e0..fb11ed6d 100644 --- a/src/dentry.c +++ b/src/dentry.c @@ -484,7 +484,6 @@ int calculate_dentry_full_path(struct wim_dentry *dentry) { size_t ulen; - size_t dummy; const struct wim_dentry *d; if (dentry->d_full_path) @@ -513,7 +512,7 @@ calculate_dentry_full_path(struct wim_dentry *dentry) wimlib_assert(p == ubuf); return utf16le_to_tstr(ubuf, ulen * sizeof(utf16lechar), - &dentry->d_full_path, &dummy); + &dentry->d_full_path, NULL); } /* diff --git a/src/encoding.c b/src/encoding.c index 4fd8712b..4a991616 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,9 +1,7 @@ /* - * encoding.c - */ - -/* - * Copyright (C) 2012, 2013 Eric Biggers + * encoding.c - UTF-8 and UTF-16LE codecs and utility functions + * + * Copyright (C) 2012-2016 Eric Biggers * * This file is free software; you can redistribute it and/or 
modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -24,386 +22,223 @@ #endif #include -#include -#include #include -#include "wimlib.h" -#include "wimlib/alloca.h" -#include "wimlib/assert.h" #include "wimlib/encoding.h" #include "wimlib/endianness.h" #include "wimlib/error.h" -#include "wimlib/list.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" +#define INVALID_CODEPOINT 0xFFFFFFFF +#define VALIDATE(expr) if (validate && unlikely(!(expr))) goto invalid +#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xE000) +#define IS_LOW_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xDC00) +#define IS_HIGH_SURROGATE(c) ((c) >= 0xDC00 && (c) < 0xE000) +#define IS_UTF8_TAIL(c) (((c) & 0xC0) == 0x80) -bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE; - -/* List of iconv_t conversion descriptors for a specific character conversion. - * The idea is that it is not thread-safe to have just one conversion - * descriptor, but it also is inefficient to open a new conversion descriptor to - * convert every string. Both these problems can be solved by maintaining a - * list of conversion descriptors; then, a thread can use an existing conversion - * descriptor if available. */ -struct iconv_list_head { - const char *from_encoding; - const char *to_encoding; - struct list_head list; - pthread_mutex_t mutex; -}; - -struct iconv_node { - iconv_t cd; - struct list_head list; - struct iconv_list_head *head; -}; - -#define ICONV_LIST(name, from, to) \ -struct iconv_list_head name = { \ - .from_encoding = from, \ - .to_encoding = to, \ -} +/* + * Decode the next Unicode codepoint from the string at @in, which has + * @remaining >= 1 bytes remaining, and return the number of bytes consumed and + * write the codepoint to *c_ret. + * + * If the input may be invalid, then @validate must be specified as %true, and + * then on invalid input the function consumes at least one byte and sets *c_ret + * to INVALID_CODEPOINT. 
If the input is guaranteed to be valid, then @validate + * may be specified as %false. + */ +typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining, + bool validate, u32 *c_ret); + +/* Encode the Unicode codepoint @c and return the number of bytes used */ +typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out); -static iconv_t * -get_iconv(struct iconv_list_head *head) +static inline unsigned +utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret) { - iconv_t cd; - iconv_t *cd_p; - struct iconv_node *i; - - pthread_mutex_lock(&head->mutex); - if (list_empty(&head->list)) { - cd = iconv_open(head->to_encoding, head->from_encoding); - if (cd == (iconv_t)-1) { - ERROR_WITH_ERRNO("Failed to open iconv from %s to %s", - head->from_encoding, head->to_encoding); - cd_p = NULL; - } else { - i = MALLOC(sizeof(struct iconv_node)); - if (i) { - i->head = head; - i->cd = cd; - cd_p = &i->cd; - } else { - iconv_close(cd); - cd_p = NULL; - } - } - } else { - i = container_of(head->list.next, struct iconv_node, list); - list_del(head->list.next); - cd_p = &i->cd; + if (likely(in[0] < 0x80)) { /* U+0...U+7F */ + *c_ret = in[0]; + return 1; } - pthread_mutex_unlock(&head->mutex); - return cd_p; -} -static void -put_iconv(iconv_t *cd) -{ - int errno_save = errno; - struct iconv_node *i = container_of(cd, struct iconv_node, cd); - struct iconv_list_head *head = i->head; - - pthread_mutex_lock(&head->mutex); - list_add(&i->list, &head->list); - pthread_mutex_unlock(&head->mutex); - errno = errno_save; -} + if (in[0] < 0xE0) { /* U+80...U+7FF */ + VALIDATE(in[0] >= 0xC2 && remaining >= 2 && + IS_UTF8_TAIL(in[1])); + *c_ret = ((u32)(in[0] & 0x1F) << 6) | (in[1] & 0x3F); + return 2; + } -#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\ - varname2, longname2, chartype2,\ - earlyreturn_on_utf8_locale, \ - earlyreturn_expr, \ - worst_case_len_expr, \ - err_return, \ - err_msg, \ - modifier) \ -static 
ICONV_LIST(iconv_##varname1##_to_##varname2, \ - longname1, longname2); \ - \ -modifier int \ -varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ - size_t *out_nbytes_ret) \ -{ \ - iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ - if (cd == NULL) \ - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ - \ - chartype2 *buf; \ - size_t bufsize; \ - bool buf_onheap; \ - bufsize = (worst_case_len_expr) * sizeof(chartype2); \ - /* Worst case length */ \ - if (bufsize <= STACK_MAX) { \ - buf = alloca(bufsize); \ - buf_onheap = false; \ - } else { \ - buf = MALLOC(bufsize); \ - if (!buf) \ - return WIMLIB_ERR_NOMEM; \ - buf_onheap = true; \ - } \ - \ - char *inbuf = (char*)in; \ - size_t inbytesleft = in_nbytes; \ - char *outbuf = (char*)buf; \ - size_t outbytesleft = bufsize; \ - size_t len; \ - int ret; \ - \ - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ - if (len == (size_t)-1) { \ - err_msg; \ - ret = err_return; \ - } else { \ - *out_nbytes_ret = bufsize - outbytesleft; \ - ret = 0; \ - } \ - put_iconv(cd); \ - if (buf_onheap) \ - FREE(buf); \ - return ret; \ -} \ - \ -modifier int \ -varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ - chartype2 *out) \ -{ \ - iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ - if (cd == NULL) \ - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ - \ - char *inbuf = (char*)in; \ - size_t inbytesleft = in_nbytes; \ - char *outbuf = (char*)out; \ - const size_t LARGE_NUMBER = 1000000000; \ - size_t outbytesleft = LARGE_NUMBER; \ - size_t len; \ - int ret; \ - \ - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ - if (len == (size_t)-1) { \ - err_msg; \ - ret = err_return; \ - } else { \ - out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \ - ret = 0; \ - } \ - put_iconv(cd); \ - return ret; \ -} \ - \ -modifier int \ -varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ - chartype2 **out_ret, \ - size_t *out_nbytes_ret) \ -{ \ - 
int ret; \ - chartype2 *out; \ - size_t out_nbytes; \ - \ - if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) { \ - earlyreturn_expr; \ - /* Out same as in */ \ - out = MALLOC(in_nbytes + sizeof(chartype2)); \ - if (!out) \ - return WIMLIB_ERR_NOMEM; \ - memcpy(out, in, in_nbytes); \ - out[in_nbytes / sizeof(chartype2)] = 0; \ - *out_ret = out; \ - *out_nbytes_ret = in_nbytes; \ - return 0; \ - } \ - \ - ret = varname1##_to_##varname2##_nbytes(in, in_nbytes, \ - &out_nbytes); \ - if (ret) \ - return ret; \ - \ - out = MALLOC(out_nbytes + sizeof(chartype2)); \ - if (!out) \ - return WIMLIB_ERR_NOMEM; \ - \ - ret = varname1##_to_##varname2##_buf(in, in_nbytes, out); \ - if (ret) { \ - FREE(out); \ - } else { \ - *out_ret = out; \ - *out_nbytes_ret = out_nbytes; \ - } \ - return ret; \ + if (in[0] < 0xF0) { /* U+800...U+FFFF excluding surrogates */ + VALIDATE(remaining >= 3 && + IS_UTF8_TAIL(in[1]) && + IS_UTF8_TAIL(in[2])); + *c_ret = ((u32)(in[0] & 0x0F) << 12) | + ((u32)(in[1] & 0x3F) << 6) | (in[2] & 0x3F); + VALIDATE(*c_ret >= 0x800 && !IS_SURROGATE(*c_ret)); + return 3; + } + + /* U+10000...U+10FFFF */ + VALIDATE(in[0] < 0xF8 && remaining >= 4 && + IS_UTF8_TAIL(in[1]) && + IS_UTF8_TAIL(in[2]) && + IS_UTF8_TAIL(in[3])); + *c_ret = ((u32)(in[0] & 0x07) << 18) | ((u32)(in[1] & 0x3F) << 12) | + ((u32)(in[2] & 0x3F) << 6) | (in[3] & 0x3F); + VALIDATE(*c_ret >= 0x10000 && *c_ret <= 0x10FFFF); + return 4; + +invalid: + *c_ret = INVALID_CODEPOINT; + return 1; } -#if !TCHAR_IS_UTF16LE - -/* UNIX */ - -DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar, - utf16le, "UTF-16LE", utf16lechar, - false, - , - in_nbytes * 2, - WIMLIB_ERR_INVALID_UTF8_STRING, - ERROR_WITH_ERRNO("Failed to convert UTF-8 string " - "to UTF-16LE string!"), - static) - -DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, - utf8, "UTF-8", tchar, - false, - , - in_nbytes * 2, - WIMLIB_ERR_INVALID_UTF16_STRING, - ERROR_WITH_ERRNO("Failed to convert UTF-16LE string " - "to UTF-8 
string!"), - static) - -DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, - utf16le, "UTF-16LE", utf16lechar, - true, - return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret), - in_nbytes * 2, - WIMLIB_ERR_INVALID_MULTIBYTE_STRING, - ERROR_WITH_ERRNO("Failed to convert multibyte " - "string \"%"TS"\" to UTF-16LE string!", in); - ERROR("If the data you provided was UTF-8, please make sure " - "the character encoding\n" - " of your current locale is UTF-8."), - ) - -DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, - tstr, "", tchar, - true, - return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret), - in_nbytes * 2, - WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, - ERROR("Failed to convert UTF-16LE string to " - "multibyte string!"); - ERROR("This may be because the UTF-16LE string " - "could not be represented\n" - " in your locale's character encoding."), - ) -#endif +static inline unsigned +utf8_encode_codepoint(u32 c, u8 *out) +{ + if (likely(c < 0x80)) { + out[0] = c; + return 1; + } -/* tchar to UTF-8 and back */ -#if TCHAR_IS_UTF16LE - -/* Windows */ -DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar, - utf8, "UTF-8", char, - false, - , - in_nbytes * 2, - WIMLIB_ERR_INVALID_UTF16_STRING, - ERROR_WITH_ERRNO("Failed to convert UTF-16LE " - "string \"%"TS"\" to UTF-8 string!", in), - ) - -DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, - tstr, "UTF-16LE", tchar, - false, - , - in_nbytes * 2, - WIMLIB_ERR_INVALID_UTF8_STRING, - ERROR_WITH_ERRNO("Failed to convert UTF-8 string " - "to UTF-16LE string!"), - ) -#else - -/* UNIX */ - -DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, - utf8, "UTF-8", char, - true, - , - in_nbytes * 4, - WIMLIB_ERR_INVALID_MULTIBYTE_STRING, - ERROR_WITH_ERRNO("Failed to convert multibyte " - "string \"%"TS"\" to UTF-8 string!", in); - ERROR("If the data you provided was UTF-8, please make sure " - "the character\n" - " encoding of your current locale is UTF-8."), - ) - 
-DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, - tstr, "", tchar, - true, - , - in_nbytes * 4, - WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, - ERROR("Failed to convert UTF-8 string to " - "multibyte string!"); - ERROR("This may be because the UTF-8 data " - "could not be represented\n" - " in your locale's character encoding."), - ) -#endif + if (c < 0x800) { + out[0] = 0xC0 | (c >> 6); + out[1] = 0x80 | (c & 0x3F); + return 2; + } -int -tstr_to_utf8_simple(const tchar *tstr, char **out) -{ - size_t out_nbytes; - return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar), - out, &out_nbytes); + if (c < 0x10000) { + out[0] = 0xE0 | (c >> 12); + out[1] = 0x80 | ((c >> 6) & 0x3F); + out[2] = 0x80 | (c & 0x3F); + return 3; + } + + out[0] = 0xF0 | (c >> 18); + out[1] = 0x80 | ((c >> 12) & 0x3F); + out[2] = 0x80 | ((c >> 6) & 0x3F); + out[3] = 0x80 | (c & 0x3F); + return 4; } -int -utf8_to_tstr_simple(const char *utf8str, tchar **out) +static inline unsigned +utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate, + u32 *c_ret) { - size_t out_nbytes; - return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes); + u32 l, h; + + VALIDATE(remaining >= 2); + l = get_unaligned_le16(in); + if (unlikely(IS_SURROGATE(l))) { + /* U+10000...U+10FFFF */ + VALIDATE(IS_LOW_SURROGATE(l) && remaining >= 4); + h = get_unaligned_le16(in + 2); + VALIDATE(IS_HIGH_SURROGATE(h)); + *c_ret = 0x10000 + (((l - 0xD800) << 10) | (h - 0xDC00)); + return 4; + } + *c_ret = l; + return 2; + +invalid: + *c_ret = INVALID_CODEPOINT; + return min(remaining, 2); } -static void -iconv_init(struct iconv_list_head *head) +static inline unsigned +utf16le_encode_codepoint(u32 c, u8 *out) { - pthread_mutex_init(&head->mutex, NULL); - INIT_LIST_HEAD(&head->list); + if (likely(c < 0x10000)) { + put_unaligned_le16(c, out); + return 2; + } + c -= 0x10000; + put_unaligned_le16(0xD800 + (c >> 10), out); + put_unaligned_le16(0xDC00 + (c & 0x3FF), out + 2); + return 4; } -static void 
-iconv_cleanup(struct iconv_list_head *head) +/* + * Convert the string @in of size @in_nbytes from the encoding given by the + * @decode_codepoint function to the encoding given by the @encode_codepoint + * function. + * + * On success, write the allocated output string and its size to @out_ret (must + * not be NULL) and @out_nbytes_ret (may be NULL), respectively. The output + * string will be null terminated, but the null terminator will not be counted + * in the size. + * + * If the input string is malformed, set errno=ILSEQ and return @ilseq_err. + * If out of memory, set errno=ENOMEM and return WIMLIB_ERR_NOMEM. + */ +static inline int +convert(const u8 * const in, const size_t in_nbytes, + u8 **out_ret, size_t *out_nbytes_ret, + int ilseq_err, + decode_codepoint_fn decode_codepoint, + encode_codepoint_fn encode_codepoint) { - pthread_mutex_destroy(&head->mutex); - while (!list_empty(&head->list)) { - struct iconv_node *i; - - i = container_of(head->list.next, struct iconv_node, list); - list_del(&i->list); - iconv_close(i->cd); - FREE(i); + const u8 * const in_end = in + in_nbytes; + const u8 *p_in; + u8 *p_out; + size_t out_nbytes = 0; + u8 *out; + u8 tmp[8]; /* assuming no codepoint requires > 8 bytes to encode */ + u32 c; + + /* Validate the input string and compute the output size */ + for (p_in = in; p_in != in_end; ) { + p_in += (*decode_codepoint)(p_in, in_end - p_in, true, &c); + if (unlikely(c == INVALID_CODEPOINT)) { + errno = EILSEQ; + return ilseq_err; + } + out_nbytes += (*encode_codepoint)(c, tmp); + } + + /* Allocate the output string */ + out = MALLOC(out_nbytes + (*encode_codepoint)(0, tmp)); + if (unlikely(!out)) + return WIMLIB_ERR_NOMEM; + + /* Do the conversion */ + for (p_in = in, p_out = out; p_in != in_end; ) { + p_in += (*decode_codepoint)(p_in, in_end - p_in, false, &c); + p_out += (*encode_codepoint)(c, p_out); } + + /* Add a null terminator */ + (*encode_codepoint)(0, p_out); + + /* Return the output string and its size */ + 
*out_ret = out; + if (out_nbytes_ret) + *out_nbytes_ret = out_nbytes; + return 0; } -void -iconv_global_init(void) +int +utf8_to_utf16le(const char *in, size_t in_nbytes, + utf16lechar **out_ret, size_t *out_nbytes_ret) { - iconv_init(&iconv_utf8_to_tstr); - iconv_init(&iconv_tstr_to_utf8); -#if !TCHAR_IS_UTF16LE - iconv_init(&iconv_utf16le_to_tstr); - iconv_init(&iconv_tstr_to_utf16le); - iconv_init(&iconv_utf16le_to_utf8); - iconv_init(&iconv_utf8_to_utf16le); -#endif + return convert((const u8 *)in, in_nbytes, + (u8 **)out_ret, out_nbytes_ret, + WIMLIB_ERR_INVALID_UTF8_STRING, + utf8_decode_codepoint, utf16le_encode_codepoint); } -void -iconv_global_cleanup(void) +int +utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes, + char **out_ret, size_t *out_nbytes_ret) { - iconv_cleanup(&iconv_utf8_to_tstr); - iconv_cleanup(&iconv_tstr_to_utf8); -#if !TCHAR_IS_UTF16LE - iconv_cleanup(&iconv_utf16le_to_tstr); - iconv_cleanup(&iconv_tstr_to_utf16le); - iconv_cleanup(&iconv_utf16le_to_utf8); - iconv_cleanup(&iconv_utf8_to_utf16le); -#endif + return convert((const u8 *)in, in_nbytes, + (u8 **)out_ret, out_nbytes_ret, + WIMLIB_ERR_INVALID_UTF16_STRING, + utf16le_decode_codepoint, utf8_encode_codepoint); } -/* A table that maps from UCS-2 characters to their upper case equivalents. +/* + * A table that maps from UCS-2 characters to their upper case equivalents. * Index and array values are both CPU endian. * Note: this is only an *approximation* of real UTF-16 case folding. 
*/ @@ -484,38 +319,17 @@ init_upcase(void) /* Delta filter */ for (u32 i = 0; i < ARRAY_LEN(upcase); i++) upcase[i] += i; - -#if 0 - /* Sanity checks */ - wimlib_assert(upcase['a'] == 'A'); - wimlib_assert(upcase['A'] == 'A'); - wimlib_assert(upcase['z'] == 'Z'); - wimlib_assert(upcase['Z'] == 'Z'); - wimlib_assert(upcase['1'] == '1'); - wimlib_assert(upcase[0x00e9] == 0x00c9); /* Latin letter e, with acute accent */ - wimlib_assert(upcase[0x00c9] == 0x00c9); - wimlib_assert(upcase[0x03c1] == 0x03a1); /* Greek letter rho */ - wimlib_assert(upcase[0x03a1] == 0x03a1); - wimlib_assert(upcase[0x0436] == 0x0416); /* Cyrillic letter zhe */ - wimlib_assert(upcase[0x0416] == 0x0416); - wimlib_assert(upcase[0x0567] == 0x0537); /* Armenian letter eh */ - wimlib_assert(upcase[0x0537] == 0x0537); - wimlib_assert(upcase[0x24d0] == 0x24b6); /* Circled Latin letter A - (is that a real character???) */ - wimlib_assert(upcase[0x24b6] == 0x24b6); - wimlib_assert(upcase[0x2603] == 0x2603); /* Note to self: Upper case - snowman symbol does not - exist. */ -#endif } -/* Compare UTF-16LE strings case-sensitively (%ignore_case == false) or +/* + * Compare UTF-16LE strings case-sensitively (%ignore_case == false) or * case-insensitively (%ignore_case == true). * * This is implemented using the default upper-case table used by NTFS. It does * not handle all possible cases allowed by UTF-16LE. For example, different * normalizations of the same sequence of "characters" are not considered equal. - * It hopefully does the right thing most of the time though. */ + * It hopefully does the right thing most of the time though. + */ int cmp_utf16le_strings(const utf16lechar *s1, size_t n1, const utf16lechar *s2, size_t n2, @@ -583,12 +397,9 @@ utf16le_dupz(const void *ustr, size_t usize) /* Duplicate a null-terminated UTF-16LE string. 
*/ utf16lechar * -utf16le_dup(const utf16lechar *ustr) +utf16le_dup(const utf16lechar *s) { - const utf16lechar *p = ustr; - while (*p++) - ; - return memdup(ustr, (const u8 *)p - (const u8 *)ustr); + return memdup(s, utf16le_len_bytes(s) + sizeof(utf16lechar)); } /* Return the length, in bytes, of a UTF-null terminated UTF-16 string, diff --git a/src/error.c b/src/error.c index db2a107f..26fc10a4 100644 --- a/src/error.c +++ b/src/error.c @@ -187,9 +187,6 @@ static const tchar * const error_strings[] = { = T("An error was returned by fuse_main()"), [WIMLIB_ERR_GLOB_HAD_NO_MATCHES] = T("The provided file glob did not match any files"), - [WIMLIB_ERR_ICONV_NOT_AVAILABLE] - = T("The iconv() function does not seem to work. " - "Maybe check to make sure the directory /usr/lib/gconv exists"), [WIMLIB_ERR_IMAGE_COUNT] = T("Inconsistent image count among the metadata " "resources, the WIM header, and/or the XML data"), @@ -215,8 +212,6 @@ static const tchar * const error_strings[] = { = T("An entry in the WIM's lookup table is invalid"), [WIMLIB_ERR_INVALID_METADATA_RESOURCE] = T("The metadata resource is invalid"), - [WIMLIB_ERR_INVALID_MULTIBYTE_STRING] - = T("A string was not valid in the current locale's character encoding"), [WIMLIB_ERR_INVALID_OVERLAY] = T("Conflicting files in overlay when creating a WIM image"), [WIMLIB_ERR_INVALID_PARAM] @@ -230,9 +225,9 @@ static const tchar * const error_strings[] = { [WIMLIB_ERR_INVALID_RESOURCE_HASH] = T("The SHA-1 message digest of a WIM resource did not match the expected value"), [WIMLIB_ERR_INVALID_UTF8_STRING] - = T("A string provided as input by the user was not a valid UTF-8 string"), + = T("A string was not a valid UTF-8 string"), [WIMLIB_ERR_INVALID_UTF16_STRING] - = T("A string in a WIM dentry is not a valid UTF-16LE string"), + = T("A string was not a valid UTF-16 string"), [WIMLIB_ERR_IS_DIRECTORY] = T("One of the specified paths to delete was a directory"), [WIMLIB_ERR_IS_SPLIT_WIM] diff --git a/src/iterate_dir.c 
b/src/iterate_dir.c index c5f5dad7..ad34e799 100644 --- a/src/iterate_dir.c +++ b/src/iterate_dir.c @@ -50,12 +50,11 @@ stream_to_wimlib_stream_entry(const struct wim_inode *inode, const u8 *hash; if (stream_is_named(strm)) { - size_t dummy; int ret; ret = utf16le_get_tstr(strm->stream_name, utf16le_len_bytes(strm->stream_name), - &wstream->stream_name, &dummy); + &wstream->stream_name, NULL); if (ret) return ret; } @@ -87,7 +86,6 @@ init_wimlib_dentry(struct wimlib_dir_entry *wdentry, struct wim_dentry *dentry, WIMStruct *wim, int flags) { int ret; - size_t dummy; const struct wim_inode *inode = dentry->d_inode; const struct wim_inode_stream *strm; struct wimlib_unix_data unix_data; @@ -95,12 +93,12 @@ init_wimlib_dentry(struct wimlib_dir_entry *wdentry, struct wim_dentry *dentry, u32 object_id_len; ret = utf16le_get_tstr(dentry->d_name, dentry->d_name_nbytes, - &wdentry->filename, &dummy); + &wdentry->filename, NULL); if (ret) return ret; ret = utf16le_get_tstr(dentry->d_short_name, dentry->d_short_name_nbytes, - &wdentry->dos_name, &dummy); + &wdentry->dos_name, NULL); if (ret) return ret; diff --git a/src/mount_image.c b/src/mount_image.c index 18c3a4ef..6f0c6e1c 100644 --- a/src/mount_image.c +++ b/src/mount_image.c @@ -2484,7 +2484,7 @@ wimlib_unmount_image_with_progress(const char *dir, int unmount_flags, int mount_flags; int ret; - ret = wimlib_global_init(WIMLIB_INIT_FLAG_ASSUME_UTF8); + ret = wimlib_global_init(0); if (ret) return ret; diff --git a/src/ntfs-3g_apply.c b/src/ntfs-3g_apply.c index eb48896b..e644da42 100644 --- a/src/ntfs-3g_apply.c +++ b/src/ntfs-3g_apply.c @@ -1082,9 +1082,3 @@ const struct apply_operations ntfs_3g_apply_ops = { .context_size = sizeof(struct ntfs_3g_apply_ctx), .single_tree_only = true, }; - -void -libntfs3g_global_init(void) -{ - ntfs_set_char_encoding(setlocale(LC_ALL, "")); -} diff --git a/src/registry.c b/src/registry.c index 36ecdd39..f7eca9ab 100644 --- a/src/registry.c +++ b/src/registry.c @@ -681,12 +681,11 @@ 
append_subkey_name(const struct nk *sub_nk, void *_next_subkey_p) subkey[i] = sub_nk->name[i]; subkey[name_size] = '\0'; } else { - size_t dummy; enum hive_status status; status = translate_wimlib_error( utf16le_to_tstr((utf16lechar *)sub_nk->name, - name_size, &subkey, &dummy)); + name_size, &subkey, NULL)); if (status != HIVE_OK) return status; } diff --git a/src/reparse.c b/src/reparse.c index f96fd49b..94ff1292 100644 --- a/src/reparse.c +++ b/src/reparse.c @@ -252,7 +252,7 @@ wim_inode_readlink(const struct wim_inode *inode, char *buf, size_t bufsize, if (parse_link_reparse_point(&rpbuf, rpbuflen, &link)) return -EINVAL; - /* Translate the substitute name to the current multibyte encoding. */ + /* Translate the substitute name to UTF-8. */ if (utf16le_to_tstr(link.substitute_name, link.substitute_name_nbytes, &target_buffer, &target_len)) return -errno; diff --git a/src/wim.c b/src/wim.c index f6d1f3a5..8c3dbfc4 100644 --- a/src/wim.c +++ b/src/wim.c @@ -40,7 +40,6 @@ #include "wimlib/file_io.h" #include "wimlib/integrity.h" #include "wimlib/metadata.h" -#include "wimlib/ntfs_3g.h" /* for libntfs3g_global_init() */ #include "wimlib/security.h" #include "wimlib/wim.h" #include "wimlib/xml.h" @@ -162,7 +161,7 @@ wimlib_create_new_wim(enum wimlib_compression_type ctype, WIMStruct **wim_ret) int ret; WIMStruct *wim; - ret = wimlib_global_init(WIMLIB_INIT_FLAG_ASSUME_UTF8); + ret = wimlib_global_init(0); if (ret) return ret; @@ -790,7 +789,7 @@ open_wim_as_WIMStruct(const void *wim_filename_or_fd, int open_flags, WIMStruct *wim; int ret; - ret = wimlib_global_init(WIMLIB_INIT_FLAG_ASSUME_UTF8); + ret = wimlib_global_init(0); if (ret) return ret; @@ -936,21 +935,6 @@ wimlib_free(WIMStruct *wim) wim_decrement_refcnt(wim); } -static bool -test_locale_ctype_utf8(void) -{ -#ifdef __WIN32__ - return false; -#else - char *ctype = nl_langinfo(CODESET); - - return (strstr(ctype, "UTF-8") || - strstr(ctype, "UTF8") || - strstr(ctype, "utf8") || - strstr(ctype, "utf-8")); 
-#endif -} - /* API function documented in wimlib.h */ WIMLIBAPI u32 wimlib_get_version(void) @@ -999,19 +983,11 @@ wimlib_global_init(int init_flags) goto out_unlock; xml_global_init(); - if (!(init_flags & WIMLIB_INIT_FLAG_ASSUME_UTF8)) { - wimlib_mbs_is_utf8 = test_locale_ctype_utf8(); - #ifdef WITH_NTFS_3G - if (!wimlib_mbs_is_utf8) - libntfs3g_global_init(); - #endif - } #ifdef __WIN32__ ret = win32_global_init(init_flags); if (ret) goto out_unlock; #endif - iconv_global_init(); init_upcase(); if (init_flags & WIMLIB_INIT_FLAG_DEFAULT_CASE_SENSITIVE) default_ignore_case = false; @@ -1038,7 +1014,6 @@ wimlib_global_cleanup(void) goto out_unlock; xml_global_cleanup(); - iconv_global_cleanup(); #ifdef __WIN32__ win32_global_cleanup(); #endif diff --git a/src/xml.c b/src/xml.c index 18721b14..5d5b5604 100644 --- a/src/xml.c +++ b/src/xml.c @@ -64,12 +64,14 @@ struct wim_xml_info { /* The number of WIM images (the length of 'images') */ int image_count; +#if TCHAR_IS_UTF16LE /* Temporary memory for UTF-8 => 'tchar' string translations. When an * API function needs to return a 'tchar' string, it uses one of these * array slots to hold the string and returns a pointer to it. 
*/ tchar *strings[128]; size_t next_string_idx; size_t num_strings; +#endif }; /*----------------------------------------------------------------------------* @@ -144,18 +146,21 @@ node_get_timestamp(const xmlNode *node) static int tstr_get_utf8(const tchar *tstr, const xmlChar **utf8_ret) { - if (wimlib_mbs_is_utf8) { - *utf8_ret = (xmlChar *)tstr; - return 0; - } - return tstr_to_utf8_simple(tstr, (char **)utf8_ret); +#if TCHAR_IS_UTF16LE + return utf16le_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar), + (char **)utf8_ret, NULL); +#else + *utf8_ret = (xmlChar *)tstr; + return 0; +#endif } static void tstr_put_utf8(const xmlChar *utf8) { - if (!wimlib_mbs_is_utf8) - FREE((void *)utf8); +#if TCHAR_IS_UTF16LE + FREE((char *)utf8); +#endif } /* Retrieve the text contents of an XML element as a 'tchar' string. If not @@ -163,26 +168,29 @@ tstr_put_utf8(const xmlChar *utf8) static const tchar * node_get_ttext(struct wim_xml_info *info, xmlNode *node) { +#if TCHAR_IS_UTF16LE const xmlChar *text; tchar **ttext_p; text = node_get_text(node); - - if (!text || wimlib_mbs_is_utf8) - return (const tchar *)text; + if (!text) + return NULL; ttext_p = &info->strings[info->next_string_idx]; if (info->num_strings >= ARRAY_LEN(info->strings)) { FREE(*ttext_p); *ttext_p = NULL; } - if (utf8_to_tstr_simple(text, ttext_p)) + if (utf8_to_tstr(text, strlen(text), ttext_p, NULL)) return NULL; if (info->num_strings < ARRAY_LEN(info->strings)) info->num_strings++; info->next_string_idx++; info->next_string_idx %= ARRAY_LEN(info->strings); return *ttext_p; +#else + return node_get_text(node); +#endif } /* Unlink the specified node from its parent, then free it (recursively). 
*/ @@ -335,10 +343,12 @@ static struct wim_xml_info * alloc_wim_xml_info(void) { struct wim_xml_info *info = MALLOC(sizeof(*info)); +#if TCHAR_IS_UTF16LE if (info) { info->next_string_idx = 0; info->num_strings = 0; } +#endif return info; } @@ -609,8 +619,10 @@ xml_free_info_struct(struct wim_xml_info *info) if (info) { xmlFreeDoc(info->doc); FREE(info->images); + #if TCHAR_IS_UTF16LE for (size_t i = 0; i < info->num_strings; i++) FREE(info->strings[i]); + #endif FREE(info); } } diff --git a/tools/windeps/Makefile b/tools/windeps/Makefile index 5b7dde52..4f6401fc 100644 --- a/tools/windeps/Makefile +++ b/tools/windeps/Makefile @@ -12,11 +12,9 @@ ARCHITECTURES := i686 x86_64 LIBXML2_VERSION := 2.9.4 WINPTHREADS_VERSION := 4.0.4 -WINICONV_VERSION := 0.0.6 LIBXML_URL := ftp://xmlsoft.org/libxml2/libxml2-$(LIBXML2_VERSION).tar.gz WINPTHREADS_URL := http://downloads.sourceforge.net/mingw-w64/mingw-w64/mingw-w64-release/mingw-w64-v$(WINPTHREADS_VERSION).tar.bz2 -WINICONV_URL := https://github.com/win-iconv/win-iconv/archive/$(WINICONV_VERSION).tar.gz LIBXML_SRCDIR := libxml2-$(LIBXML2_VERSION) @@ -42,17 +40,6 @@ $(WINPTHREADS_SRCDIR):$(WINPTHREADS_DIST) checksums_verified cp $@/COPYING COPYING.winpthreads MAKE_CLEAN_FILES += $(WINPTHREADS_SRCDIR) mingw-w64-v$(WINPTHREADS_VERSION) COPYING.winpthreads -WINICONV_SRCDIR := win-iconv-$(WINICONV_VERSION) -WINICONV_DIST := $(WINICONV_VERSION).tar.gz -SRCDIR_TARGETS += $(WINICONV_SRCDIR) -DIST_TARGETS += $(WINICONV_DIST) -$(WINICONV_DIST): - wget $(WINICONV_URL) -$(WINICONV_SRCDIR):$(WINICONV_DIST) checksums_verified - tar xvf $< -# win-iconv is public domain, so there's no license file. 
-MAKE_CLEAN_FILES += $(WINICONV_SRCDIR) - checksums_verified:$(DIST_TARGETS) sha256sum -c sha256sums @@ -104,35 +91,12 @@ $(1)_BUILD_TARGETS += winpthreads_$(1) MAKE_CLEAN_FILES += build_winpthreads_$(1) endef -# -# declare_winiconv_target(arch) -# -define declare_winiconv_target -winiconv_$(1):$(WINICONV_SRCDIR) - builddir=build_winiconv_$(1); \ - rm -rf $$$$builddir; \ - cp -r $(WINICONV_SRCDIR) $$$$builddir; \ - cd $$$$builddir; \ - $(MAKE) CC=$(1)-w64-mingw32-gcc \ - AR=$(1)-w64-mingw32-ar \ - RANLIB=$(1)-w64-mingw32-ranlib \ - DLLTOOL=$(1)-w64-mingw32-dlltool \ - CFLAGS=-O2 \ - prefix=$$$$PWD/../sysroot_$(1) \ - install; \ - rm -f ../sysroot_$(1)/lib/libiconv.dll.a; - -$(1)_BUILD_TARGETS += winiconv_$(1) -MAKE_CLEAN_FILES += build_winiconv_$(1) -endef - # # declare_arch_targets(arch) # define declare_arch_targets $(eval $(call declare_libxml_target,$(1))) $(eval $(call declare_winpthreads_target,$(1))) -$(eval $(call declare_winiconv_target,$(1))) sysroot_$(1): $($(1)_BUILD_TARGETS) diff --git a/tools/windeps/sha256sums b/tools/windeps/sha256sums index 038e2a07..1e940d8d 100644 --- a/tools/windeps/sha256sums +++ b/tools/windeps/sha256sums @@ -1,3 +1,2 @@ ffb911191e509b966deb55de705387f14156e1a56b21824357cdf0053233633c libxml2-2.9.4.tar.gz 89356a0aa8cf9f8b9dc8d92bc8dd01a131d4750c3acb30c6350a406316c42199 mingw-w64-v4.0.4.tar.bz2 -d464bbe0410f72b09f301bead9f1cf091e6aa15e97323961ecb9242c0e7f609b 0.0.6.tar.gz -- 2.43.0