X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fencoding.c;h=7d260046c3a8f99588f9eb2dcfe6ac42e1a9fd8d;hp=de0435811c9279d4d751d6f24ad8fdab09274c5e;hb=cf373e59a7f6ff7d1fd007c1f22defe508aa67d4;hpb=14baa6ae892debbaa18dba8119931580efd0e517 diff --git a/src/encoding.c b/src/encoding.c index de043581..7d260046 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,7 +1,5 @@ /* - * encoding.c: Convert "multibyte" strings (the locale-default encoding--- - * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice - * versa. + * encoding.c */ /* @@ -23,16 +21,22 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -#include "config.h" #include "wimlib_internal.h" -#include -#include "list.h" +#include #include +#include #include +#include -bool wimlib_mbs_is_utf8 = false; +bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE; +/* List of iconv_t conversion descriptors for a specific character conversion. + * The idea is that it is not thread-safe to have just one conversion + * descriptor, but it also is inefficient to open a new conversion descriptor to + * convert every string. Both these problems can be solved by maintaining a + * list of conversion descriptors; then, a thread can use an existing conversion + * descriptor if available. */ struct iconv_list_head { const char *from_encoding; const char *to_encoding; @@ -54,226 +58,316 @@ struct iconv_list_head name = { \ .mutex = PTHREAD_MUTEX_INITIALIZER, \ } -static ICONV_LIST(iconv_mbs_to_utf16le, "", "UTF-16LE"); -static ICONV_LIST(iconv_utf16le_to_mbs, "UTF-16LE", ""); - static iconv_t * get_iconv(struct iconv_list_head *head) { iconv_t cd; + iconv_t *cd_p; struct iconv_node *i; pthread_mutex_lock(&head->mutex); if (list_empty(&head->list)) { cd = iconv_open(head->to_encoding, head->from_encoding); if (cd == (iconv_t)-1) { - goto out_unlock; + ERROR_WITH_ERRNO("Failed to open iconv from %s to %s", + head->from_encoding, head->to_encoding); + cd_p = NULL; } else { i = MALLOC(sizeof(struct iconv_node)); - if (!i) { + if (i) { + i->head = head; + i->cd = cd; + cd_p = &i->cd; + } else { iconv_close(cd); - cd = (iconv_t)-1; - goto out_unlock; + cd_p = NULL; } - i->head = head; } } else { i = container_of(head->list.next, struct iconv_node, list); list_del(head->list.next); + cd_p = &i->cd; } - cd = i->cd; -out_unlock: pthread_mutex_unlock(&head->mutex); - return cd; + return cd_p; } static void put_iconv(iconv_t *cd) { + int errno_save = errno; struct iconv_node *i = container_of(cd, struct iconv_node, cd); struct iconv_list_head *head = i->head; - + pthread_mutex_lock(&head->mutex); list_add(&i->list, &head->list); pthread_mutex_unlock(&head->mutex); + errno = errno_save; } -int -mbs_to_utf16le_nbytes(const mbchar *mbs, size_t mbs_nbytes, - size_t *utf16le_nbytes_ret) -{ - iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - /* Worst case length */ - utf16lechar buf[mbs_nbytes * 2]; - char *inbuf = (char*)mbs; - char *outbuf = (char*)buf; - size_t outbytesleft = sizeof(buf); - size_t inbytesleft = mbs_nbytes; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING; - } else { - *utf16le_nbytes_ret = sizeof(buf) - outbytesleft; - ret = 0; - } - put_iconv(cd); - return ret; +/* Prevent printing an error message if a character conversion error occurs + * while printing an error message. (This variable is not per-thread but it + * doesn't matter too much since it's just the error messages.) */ +static bool error_message_being_printed = false; + +#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\ + varname2, longname2, chartype2,\ + earlyreturn_on_utf8_locale, \ + earlyreturn_expr, \ + worst_case_len_expr, \ + err_return, \ + err_msg, \ + modifier) \ +static ICONV_LIST(iconv_##varname1##_to_##varname2, \ + longname1, longname2); \ + \ +modifier int \ +varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ + size_t *out_nbytes_ret) \ +{ \ + iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ + if (cd == NULL) \ + return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ + \ + /* Worst case length */ \ + chartype2 buf[worst_case_len_expr]; \ + char *inbuf = (char*)in; \ + size_t inbytesleft = in_nbytes; \ + char *outbuf = (char*)buf; \ + size_t outbytesleft = sizeof(buf); \ + size_t len; \ + int ret; \ + \ + len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ + if (len == (size_t)-1) { \ + if (!error_message_being_printed) { \ + error_message_being_printed = true; \ + err_msg; \ + error_message_being_printed = false; \ + } \ + ret = err_return; \ + } else { \ + *out_nbytes_ret = sizeof(buf) - outbytesleft; \ + ret = 0; \ + } \ + put_iconv(cd); \ + return ret; \ +} \ + \ +modifier int \ +varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ + chartype2 *out) \ +{ \ + iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ + if (cd == NULL) \ + return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ + \ + char *inbuf = (char*)in; \ + size_t inbytesleft = in_nbytes; \ + char *outbuf = (char*)out; \ + const size_t LARGE_NUMBER = 1000000000; \ + size_t outbytesleft = LARGE_NUMBER; \ + size_t len; \ + int ret; \ + \ + len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ + if (len == (size_t)-1) { \ + if (!error_message_being_printed) { \ + error_message_being_printed = true; \ + err_msg; \ + error_message_being_printed = false; \ + } \ + ret = err_return; \ + } else { \ + out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \ + ret = 0; \ + } \ + put_iconv(cd); \ + return ret; \ +} \ + \ +modifier int \ +varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ + chartype2 **out_ret, \ + size_t *out_nbytes_ret) \ +{ \ + int ret; \ + chartype2 *out; \ + size_t out_nbytes; \ + \ + if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) { \ + earlyreturn_expr; \ + /* Out same as in */ \ + out = MALLOC(in_nbytes + sizeof(chartype2)); \ + if (!out) \ + return WIMLIB_ERR_NOMEM; \ + memcpy(out, in, in_nbytes); \ + out[in_nbytes / sizeof(chartype2)] = 0; \ + *out_ret = out; \ + *out_nbytes_ret = in_nbytes; \ + return 0; \ + } \ + \ + ret = varname1##_to_##varname2##_nbytes(in, in_nbytes, \ + &out_nbytes); \ + if (ret) \ + return ret; \ + \ + out = MALLOC(out_nbytes + sizeof(chartype2)); \ + if (!out) \ + return WIMLIB_ERR_NOMEM; \ + \ + ret = varname1##_to_##varname2##_buf(in, in_nbytes, out); \ + if (ret) { \ + FREE(out); \ + } else { \ + *out_ret = out; \ + *out_nbytes_ret = out_nbytes; \ + } \ + return ret; \ } +#if !TCHAR_IS_UTF16LE + +/* UNIX */ + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar, + utf16le, "UTF-16LE", utf16lechar, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF8_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-8 string " + "to UTF-16LE string!"), + static) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, + utf8, "UTF-8", tchar, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF16_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-16LE string " + "to UTF-8 string!"), + static) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, + utf16le, "UTF-16LE", utf16lechar, + true, + return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret), + in_nbytes * 2, + WIMLIB_ERR_INVALID_MULTIBYTE_STRING, + ERROR_WITH_ERRNO("Failed to convert multibyte " + "string \"%"TS"\" to UTF-16LE string!", in); + ERROR("If the data you provided was UTF-8, please make sure " + "the character encoding\n" + " of your current locale is UTF-8."), + ) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, + tstr, "", tchar, + true, + return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret), + in_nbytes * 2, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-16LE string to " + "multibyte string!"); + ERROR("This may be because the UTF-16LE string " + "could not be represented\n" + " in your locale's character encoding."), + ) +#endif + +/* tchar to UTF-8 and back */ +#if TCHAR_IS_UTF16LE + +/* Windows */ +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar, + utf8, "UTF-8", char, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF16_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-16LE " + "string \"%"TS"\" to UTF-8 string!", in), + static) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, + tstr, "UTF-16LE", tchar, + false, + , + in_nbytes * 2, + WIMLIB_ERR_INVALID_UTF8_STRING, + ERROR_WITH_ERRNO("Failed to convert UTF-8 string " + "to UTF-16LE string!"), + static) +#else + +/* UNIX */ + +DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar, + utf8, "UTF-8", char, + true, + , + in_nbytes * 4, + WIMLIB_ERR_INVALID_MULTIBYTE_STRING, + ERROR_WITH_ERRNO("Failed to convert multibyte " + "string \"%"TS"\" to UTF-8 string!", in); + ERROR("If the data you provided was UTF-8, please make sure " + "the character\n" + " encoding of your current locale is UTF-8."), + static) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char, + tstr, "", tchar, + true, + , + in_nbytes * 4, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-8 string to " + "multibyte string!"); + ERROR("This may be because the UTF-8 data " + "could not be represented\n" + " in your locale's character encoding."), + static) +#endif int -utf16le_to_mbs_nbytes(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - size_t *mbs_nbytes_ret) -{ - iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - /* Worst case length */ - mbchar buf[utf16le_nbytes / 2 * MB_CUR_MAX]; - char *inbuf = (char*)utf16le_str; - char *outbuf = (char*)buf; - size_t outbytesleft = sizeof(buf); - size_t inbytesleft = utf16le_nbytes; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ERROR("Could not convert \"%W\" to encoding of current locale", - utf16le_str); - /* EILSEQ is supposed to mean that the *input* is invalid, but - * it's also returned if any input characters are not - * representable in the output encoding. (The actual behavior - * in this case is undefined for some reason...). Assume it's - * the latter error case. */ - ret = WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE; - } else { - *mbs_nbytes_ret = sizeof(buf) - outbytesleft; - ret = 0; - } - put_iconv(cd); - return ret; -} - -int -mbs_to_utf16le_buf(const mbchar *mbs, size_t mbs_nbytes, - utf16lechar *utf16le_str) -{ - iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - char *inbuf = (char*)mbs; - size_t inbytesleft = mbs_nbytes; - char *outbuf = (char*)utf16le_str; - size_t outbytesleft = SIZE_MAX; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING; - } else { - ret = 0; - } - put_iconv(cd); - return ret; -} - -int -utf16le_to_mbs_buf(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - mbchar *mbs) +tstr_to_utf8_simple(const tchar *tstr, char **out) { - int ret; - iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - char *inbuf = (char*)utf16le_str; - size_t inbytesleft; - char *outbuf = (char*)mbs; - size_t outbytesleft = SIZE_MAX; - size_t len; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_UTF16_STRING; - } else { - ret = 0; - } - mbs[SIZE_MAX - inbytesleft] = '\0'; - put_iconv(cd); - return ret; + size_t out_nbytes; + return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar), + out, &out_nbytes); } int -mbs_to_utf16le(const mbchar *mbs, size_t mbs_nbytes, - utf16lechar **utf16le_ret, size_t *utf16le_nbytes_ret) +utf8_to_tstr_simple(const char *utf8str, tchar **out) { - int ret; - utf16lechar *utf16le_str; - size_t utf16le_nbytes; - - ret = mbs_to_utf16le_nbytes(mbs, mbs_nbytes, - &utf16le_nbytes); - if (ret) - return ret; - - utf16le_str = MALLOC(utf16le_nbytes + 1); - if (!utf16le_str) - return WIMLIB_ERR_NOMEM; - - ret = mbs_to_utf16le_buf(mbs, mbs_nbytes, utf16le_str); - if (ret) { - FREE(utf16le_str); - } else { - *utf16le_ret = utf16le_str; - *utf16le_nbytes_ret = utf16le_nbytes; - } - return ret; + size_t out_nbytes; + return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes); } - -int -utf16le_to_mbs(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - mbchar **mbs_ret, size_t *mbs_nbytes_ret) +static void +iconv_cleanup(struct iconv_list_head *head) { - int ret; - mbchar *mbs; - size_t mbs_nbytes; - - ret = utf16le_to_mbs_nbytes(utf16le_str, utf16le_nbytes, - &mbs_nbytes); - if (ret) - return ret; - - mbs = MALLOC(mbs_nbytes + 1); - if (!mbs) - return WIMLIB_ERR_NOMEM; + pthread_mutex_destroy(&head->mutex); + while (!list_empty(&head->list)) { + struct iconv_node *i; - ret = utf16le_to_mbs_buf(utf16le_str, utf16le_nbytes, mbs); - if (ret) { - FREE(mbs); - } else { - *mbs_ret = mbs; - *mbs_nbytes_ret = mbs_nbytes; + i = container_of(head->list.next, struct iconv_node, list); + list_del(&i->list); + iconv_close(i->cd); + FREE(i); } - return ret; } -bool -utf8_str_contains_nonascii_chars(const utf8char *utf8_str) +void +iconv_global_cleanup() { - do { - if ((unsigned char)*utf8_str > 127) - return false; - } while (*++utf8_str); - return true; + iconv_cleanup(&iconv_utf8_to_tstr); + iconv_cleanup(&iconv_tstr_to_utf8); +#if !TCHAR_IS_UTF16LE + iconv_cleanup(&iconv_utf16le_to_tstr); + iconv_cleanup(&iconv_tstr_to_utf16le); +#endif }