X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fencoding.c;h=48bad187ff8e21c7df4333e35854aabf75ef6b5d;hp=de0435811c9279d4d751d6f24ad8fdab09274c5e;hb=de12c346dc64404821d52d545e2e1b3d44230f2a;hpb=14baa6ae892debbaa18dba8119931580efd0e517 diff --git a/src/encoding.c b/src/encoding.c index de043581..48bad187 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,7 +1,7 @@ /* * encoding.c: Convert "multibyte" strings (the locale-default encoding--- * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice - * versa. + * versa. Also, convert UTF-8 strings to multibyte strings. */ /* @@ -23,16 +23,21 @@ * along with wimlib; if not, see http://www.gnu.org/licenses/. */ -#include "config.h" #include "wimlib_internal.h" -#include -#include "list.h" +#include #include +#include #include -bool wimlib_mbs_is_utf8 = false; +bool wimlib_mbs_is_utf8 = true; +/* List of iconv_t conversion descriptors for a specific character conversion. + * The idea is that it is not thread-safe to have just one conversion + * descriptor, but it also is inefficient to open a new conversion descriptor to + * convert every string. Both these problems can be solved by maintaining a + * list of conversion descriptors; then, a thread can use an existing conversion + * descriptor if available. */ struct iconv_list_head { const char *from_encoding; const char *to_encoding; @@ -54,218 +59,208 @@ struct iconv_list_head name = { \ .mutex = PTHREAD_MUTEX_INITIALIZER, \ } -static ICONV_LIST(iconv_mbs_to_utf16le, "", "UTF-16LE"); -static ICONV_LIST(iconv_utf16le_to_mbs, "UTF-16LE", ""); - static iconv_t * get_iconv(struct iconv_list_head *head) { iconv_t cd; + iconv_t *cd_p; struct iconv_node *i; pthread_mutex_lock(&head->mutex); if (list_empty(&head->list)) { cd = iconv_open(head->to_encoding, head->from_encoding); if (cd == (iconv_t)-1) { - goto out_unlock; + ERROR_WITH_ERRNO("Failed to open iconv from %s to %s", + head->from_encoding, head->to_encoding); + cd_p = NULL; } else { i = MALLOC(sizeof(struct iconv_node)); - if (!i) { + if (i) { + i->head = head; + i->cd = cd; + cd_p = &i->cd; + } else { iconv_close(cd); - cd = (iconv_t)-1; - goto out_unlock; + cd_p = NULL; } - i->head = head; } } else { i = container_of(head->list.next, struct iconv_node, list); list_del(head->list.next); + cd_p = &i->cd; } - cd = i->cd; -out_unlock: pthread_mutex_unlock(&head->mutex); - return cd; + return cd_p; } static void put_iconv(iconv_t *cd) { + int errno_save = errno; struct iconv_node *i = container_of(cd, struct iconv_node, cd); struct iconv_list_head *head = i->head; pthread_mutex_lock(&head->mutex); list_add(&i->list, &head->list); pthread_mutex_unlock(&head->mutex); + errno = errno_save; } -int -mbs_to_utf16le_nbytes(const mbchar *mbs, size_t mbs_nbytes, - size_t *utf16le_nbytes_ret) -{ - iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - /* Worst case length */ - utf16lechar buf[mbs_nbytes * 2]; - char *inbuf = (char*)mbs; - char *outbuf = (char*)buf; - size_t outbytesleft = sizeof(buf); - size_t inbytesleft = mbs_nbytes; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING; - } else { - *utf16le_nbytes_ret = sizeof(buf) - outbytesleft; - ret = 0; - } - put_iconv(cd); - return ret; +/* Prevent printing an error message if a character conversion error occurs + * while printing an error message. (This variable is not per-thread but it + * doesn't matter too much since it's just the error messages.) */ +static bool error_message_being_printed = false; + +#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\ + varname2, longname2, chartype2,\ + worst_case_len_expr, \ + err_return, \ + err_msg) \ +static ICONV_LIST(iconv_##varname1##_to_##varname2, \ + longname1, longname2); \ + \ +int \ +varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\ + size_t *out_nbytes_ret) \ +{ \ + iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ + if (cd == NULL) \ + return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ + \ + /* Worst case length */ \ + chartype2 buf[worst_case_len_expr]; \ + char *inbuf = (char*)in; \ + size_t inbytesleft = in_nbytes; \ + char *outbuf = (char*)buf; \ + size_t outbytesleft = sizeof(buf); \ + size_t len; \ + int ret; \ + \ + len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ + if (len == (size_t)-1) { \ + if (!error_message_being_printed) { \ + error_message_being_printed = true; \ + err_msg; \ + error_message_being_printed = false; \ + } \ + ret = err_return; \ + } else { \ + *out_nbytes_ret = sizeof(buf) - outbytesleft; \ + ret = 0; \ + } \ + put_iconv(cd); \ + return ret; \ +} \ + \ +int \ +varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \ + chartype2 *out) \ +{ \ + iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \ + if (cd == NULL) \ + return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \ + \ + char *inbuf = (char*)in; \ + size_t inbytesleft = in_nbytes; \ + char *outbuf = (char*)out; \ + const size_t LARGE_NUMBER = 1000000000; \ + size_t outbytesleft = LARGE_NUMBER; \ + size_t len; \ + int ret; \ + \ + len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \ + if (len == (size_t)-1) { \ + if (!error_message_being_printed) { \ + error_message_being_printed = true; \ + err_msg; \ + error_message_being_printed = false; \ + } \ + ret = err_return; \ + } else { \ + out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \ + ret = 0; \ + } \ + put_iconv(cd); \ + return ret; \ +} \ + \ +int \ +varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \ + chartype2 **out_ret, \ + size_t *out_nbytes_ret) \ +{ \ + int ret; \ + chartype2 *out; \ + size_t out_nbytes; \ + \ + ret = varname1##_to_##varname2##_nbytes(in, in_nbytes, \ + &out_nbytes); \ + if (ret) \ + return ret; \ + \ + out = MALLOC(out_nbytes + sizeof(chartype2)); \ + if (!out) \ + return WIMLIB_ERR_NOMEM; \ + \ + ret = varname1##_to_##varname2##_buf(in, in_nbytes, out); \ + if (ret) { \ + int errno_save = errno; \ + FREE(out); \ + errno = errno_save; \ + } else { \ + *out_ret = out; \ + *out_nbytes_ret = out_nbytes; \ + } \ + return ret; \ } +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar, + mbs, "", mbchar, + in_nbytes / 2 * MB_CUR_MAX, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-16LE string " + "to multibyte string!"); + ERROR("This may be because the UTF-16LE data " + "could not be represented in your " + "locale's character encoding.")) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(mbs, "", mbchar, + utf16le, "UTF-16LE", utf16lechar, + in_nbytes * 2, + WIMLIB_ERR_INVALID_MULTIBYTE_STRING, + ERROR_WITH_ERRNO("Failed to convert multibyte " + "string \"%s\" to UTF-16LE string!", in); + ERROR("If the data you provided was UTF-8, please make sure " + "the character encoding of your current locale is UTF-8.")) + +DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", utf8char, + mbs, "", mbchar, + in_nbytes, + WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE, + ERROR("Failed to convert UTF-8 string to multibyte string!"); + ERROR("This may be because the UTF-8 data could not be represented " + "in your locale's character encoding.")) -int -utf16le_to_mbs_nbytes(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - size_t *mbs_nbytes_ret) -{ - iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - /* Worst case length */ - mbchar buf[utf16le_nbytes / 2 * MB_CUR_MAX]; - char *inbuf = (char*)utf16le_str; - char *outbuf = (char*)buf; - size_t outbytesleft = sizeof(buf); - size_t inbytesleft = utf16le_nbytes; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ERROR("Could not convert \"%W\" to encoding of current locale", - utf16le_str); - /* EILSEQ is supposed to mean that the *input* is invalid, but - * it's also returned if any input characters are not - * representable in the output encoding. (The actual behavior - * in this case is undefined for some reason...). Assume it's - * the latter error case. */ - ret = WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE; - } else { - *mbs_nbytes_ret = sizeof(buf) - outbytesleft; - ret = 0; - } - put_iconv(cd); - return ret; -} - -int -mbs_to_utf16le_buf(const mbchar *mbs, size_t mbs_nbytes, - utf16lechar *utf16le_str) -{ - iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - char *inbuf = (char*)mbs; - size_t inbytesleft = mbs_nbytes; - char *outbuf = (char*)utf16le_str; - size_t outbytesleft = SIZE_MAX; - size_t len; - int ret; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING; - } else { - ret = 0; - } - put_iconv(cd); - return ret; -} - -int -utf16le_to_mbs_buf(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - mbchar *mbs) -{ - int ret; - iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs); - if (*cd == (iconv_t)-1) - return WIMLIB_ERR_ICONV_NOT_AVAILABLE; - - char *inbuf = (char*)utf16le_str; - size_t inbytesleft; - char *outbuf = (char*)mbs; - size_t outbytesleft = SIZE_MAX; - size_t len; - - len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - if (len == (size_t)-1) { - ret = WIMLIB_ERR_INVALID_UTF16_STRING; - } else { - ret = 0; - } - mbs[SIZE_MAX - inbytesleft] = '\0'; - put_iconv(cd); - return ret; -} - -int -mbs_to_utf16le(const mbchar *mbs, size_t mbs_nbytes, - utf16lechar **utf16le_ret, size_t *utf16le_nbytes_ret) +static void +iconv_cleanup(struct iconv_list_head *head) { - int ret; - utf16lechar *utf16le_str; - size_t utf16le_nbytes; - - ret = mbs_to_utf16le_nbytes(mbs, mbs_nbytes, - &utf16le_nbytes); - if (ret) - return ret; - - utf16le_str = MALLOC(utf16le_nbytes + 1); - if (!utf16le_str) - return WIMLIB_ERR_NOMEM; - - ret = mbs_to_utf16le_buf(mbs, mbs_nbytes, utf16le_str); - if (ret) { - FREE(utf16le_str); - } else { - *utf16le_ret = utf16le_str; - *utf16le_nbytes_ret = utf16le_nbytes; + pthread_mutex_destroy(&head->mutex); + while (!list_empty(&head->list)) { + struct iconv_node *i; + + i = container_of(head->list.next, struct iconv_node, list); + list_del(&i->list); + iconv_close(i->cd); + FREE(i); } - return ret; } - -int -utf16le_to_mbs(const utf16lechar *utf16le_str, size_t utf16le_nbytes, - mbchar **mbs_ret, size_t *mbs_nbytes_ret) +void +iconv_global_cleanup() { - int ret; - mbchar *mbs; - size_t mbs_nbytes; - - ret = utf16le_to_mbs_nbytes(utf16le_str, utf16le_nbytes, - &mbs_nbytes); - if (ret) - return ret; - - mbs = MALLOC(mbs_nbytes + 1); - if (!mbs) - return WIMLIB_ERR_NOMEM; - - ret = utf16le_to_mbs_buf(utf16le_str, utf16le_nbytes, mbs); - if (ret) { - FREE(mbs); - } else { - *mbs_ret = mbs; - *mbs_nbytes_ret = mbs_nbytes; - } - return ret; + iconv_cleanup(&iconv_utf16le_to_mbs); + iconv_cleanup(&iconv_mbs_to_utf16le); + iconv_cleanup(&iconv_utf8_to_mbs); } bool @@ -273,7 +268,7 @@ utf8_str_contains_nonascii_chars(const utf8char *utf8_str) { do { if ((unsigned char)*utf8_str > 127) - return false; + return true; } while (*++utf8_str); - return true; + return false; }