X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fencoding.c;h=31e6913add4cff2aca184b1a6594f11cf3e3d5ee;hp=e8bc40273f4180ab19f6a83cc0e178a5b84ce756;hb=192169052ceeb7fac5ff051fe0e836d3b2ddb7f7;hpb=b5d427e440402aff987a060ff6ceefcb97117fa4 diff --git a/src/encoding.c b/src/encoding.c index e8bc4027..31e6913a 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,7 +1,5 @@ /* - * encoding.c: Convert "multibyte" strings (the locale-default encoding--- - * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice - * versa. + * encoding.c */ /* @@ -23,17 +21,22 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -#include "config.h" #include "wimlib_internal.h" -#include -#include "list.h" +#include #include +#include #include -#include +#include -bool wimlib_mbs_is_utf8 = false; +bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE; +/* List of iconv_t conversion descriptors for a specific character conversion. + * The idea is that it is not thread-safe to have just one conversion + * descriptor, but it also is inefficient to open a new conversion descriptor to + * convert every string. Both these problems can be solved by maintaining a + * list of conversion descriptors; then, a thread can use an existing conversion + * descriptor if available. */ struct iconv_list_head { const char *from_encoding; const char *to_encoding; @@ -95,7 +98,7 @@ put_iconv(iconv_t *cd) int errno_save = errno; struct iconv_node *i = container_of(cd, struct iconv_node, cd); struct iconv_list_head *head = i->head; - + pthread_mutex_lock(&head->mutex); list_add(&i->list, &head->list); pthread_mutex_unlock(&head->mutex); @@ -109,13 +112,16 @@ static bool error_message_being_printed = false; #define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\ varname2, longname2, chartype2,\ + earlyreturn_on_utf8_locale, \ + earlyreturn_expr, \ worst_case_len_expr, \ err_return, \ - err_msg) \ + err_msg, \ + modifier) \ static ICONV_LIST(iconv_##varname1##_to_##varname2, \ longname1, longname2); \ \ -int \ +modifier int \ varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ size_t *out_nbytes_ret) \ { \ @@ -148,7 +154,7 @@ varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ return ret; \ } \ \ -int \ +modifier int \ varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ chartype2 *out) \ { \ @@ -166,7 +172,11 @@ varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ \ len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ if (len == (size_t)-1) { \ - err_msg; \ + if (!error_message_being_printed) { \ + error_message_being_printed = true; \ + err_msg; \ + error_message_being_printed = false; \ + } \ ret = err_return; \ } else { \ out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \ @@ -176,7 +186,7 @@ varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ return ret; \ } \ \ -int \ +modifier int \ varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ chartype2 **out_ret, \ size_t *out_nbytes_ret) \ @@ -185,6 +195,19 @@ varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ chartype2 *out; \ size_t out_nbytes; \ \ + if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) { \ + earlyreturn_expr; \ + /* Out same as in */ \ + out = MALLOC(in_nbytes + sizeof(chartype2)); \ + if (!out) \ + return WIMLIB_ERR_NOMEM; \ + memcpy(out, in, in_nbytes); \ + out[in_nbytes / sizeof(chartype2)] = 0; \ + *out_ret = out; \ + *out_nbytes_ret = in_nbytes; \ + return 0; \ + } \ + \ ret = varname1##_to_##varname2##_nbytes(in, in_nbytes, \ &out_nbytes); \ if (ret) \ @@ -196,9 +219,7 @@ varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ \ ret = varname1##_to_##varname2##_buf(in, in_nbytes, out); \ if (ret) { \ - int errno_save = errno; \ FREE(out); \ - errno = errno_save; \ } else { \ *out_ret = out; \ *out_nbytes_ret = out_nbytes; \ @@ -206,27 +227,125 @@ varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ return ret; \ } +#if !TCHAR_IS_UTF16LE + +/* UNIX */ + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar, + utf16le, "UTF-16LE", utf16lechar, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF8_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-8 string " + "to UTF-16LE string!"), + static) + DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, - mbs, "", mbchar, - in_nbytes / 2 * MB_CUR_MAX, - WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, - ERROR_WITH_ERRNO("Failed to convert UTF-16LE " - "string %U to multibyte string", in)) + utf8, "UTF-8", tchar, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF16_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-16LE string " + "to UTF-8 string!"), + static) -DEFINE_CHAR_CONVERSION_FUNCTIONS(mbs, "", mbchar, +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, utf16le, "UTF-16LE", utf16lechar, + true, + return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret), in_nbytes * 2, WIMLIB_ERR_INVALID_MULTIBYTE_STRING, ERROR_WITH_ERRNO("Failed to convert multibyte " - "string %s to UTF-16LE string", in)) + "string \"%"TS"\" to UTF-16LE string!", in); + ERROR("If the data you provided was UTF-8, please make sure " + "the character encoding\n" + " of your current locale is UTF-8."), + ) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, + tstr, "", tchar, + true, + return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret), + in_nbytes * 2, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-16LE string to " + "multibyte string!"); + ERROR("This may be because the UTF-16LE string " + "could not be represented\n" + " in your locale's character encoding."), + ) +#endif + +/* tchar to UTF-8 and back */ +#if TCHAR_IS_UTF16LE + +/* Windows */ +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar, + utf8, "UTF-8", char, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF16_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-16LE " + "string \"%"TS"\" to UTF-8 string!", in), + static) -DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", utf8char, - mbs, "", mbchar, - in_nbytes, +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, + tstr, "UTF-16LE", tchar, + false, + , + in_nbytes * 2, WIMLIB_ERR_INVALID_UTF8_STRING, - ERROR_WITH_ERRNO("Failed to convert UTF-8 " - "string %U to multibyte string", in)) + ERROR_WITH_ERRNO("Failed to convert UTF-8 string " + "to UTF-16LE string!"), + static) +#else + +/* UNIX */ + +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, + utf8, "UTF-8", char, + true, + , + in_nbytes * 4, + WIMLIB_ERR_INVALID_MULTIBYTE_STRING, + ERROR_WITH_ERRNO("Failed to convert multibyte " + "string \"%"TS"\" to UTF-8 string!", in); + ERROR("If the data you provided was UTF-8, please make sure " + "the character\n" + " encoding of your current locale is UTF-8."), + static) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, + tstr, "", tchar, + true, + , + in_nbytes * 4, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-8 string to " + "multibyte string!"); + ERROR("This may be because the UTF-8 data " + "could not be represented\n" + " in your locale's character encoding."), + static) +#endif + +int +tstr_to_utf8_simple(const tchar *tstr, char **out) +{ + size_t out_nbytes; + return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar), + out, &out_nbytes); +} +int +utf8_to_tstr_simple(const char *utf8str, tchar **out) +{ + size_t out_nbytes; + return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes); +} static void iconv_cleanup(struct iconv_list_head *head) @@ -234,7 +353,7 @@ iconv_cleanup(struct iconv_list_head *head) pthread_mutex_destroy(&head->mutex); while (!list_empty(&head->list)) { struct iconv_node *i; - + i = container_of(head->list.next, struct iconv_node, list); list_del(&i->list); iconv_close(i->cd); @@ -245,19 +364,12 @@ iconv_cleanup(struct iconv_list_head *head) void iconv_global_cleanup() { - iconv_cleanup(&iconv_utf16le_to_mbs); - iconv_cleanup(&iconv_mbs_to_utf16le); - iconv_cleanup(&iconv_utf8_to_mbs); -} - - - -bool -utf8_str_contains_nonascii_chars(const utf8char *utf8_str) -{ - do { - if ((unsigned char)*utf8_str > 127) - return true; - } while (*++utf8_str); - return false; + iconv_cleanup(&iconv_utf8_to_tstr); + iconv_cleanup(&iconv_tstr_to_utf8); +#if !TCHAR_IS_UTF16LE + iconv_cleanup(&iconv_utf16le_to_tstr); + iconv_cleanup(&iconv_tstr_to_utf16le); + iconv_cleanup(&iconv_utf16le_to_utf8); + iconv_cleanup(&iconv_utf8_to_utf16le); +#endif }