/* * encoding.c: Convert UTF-8 to UTF-16LE strings and vice versa */ /* * Copyright (C) 2012 Eric Biggers * * This file is part of wimlib, a library for working with WIM files. * * wimlib is free software; you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your option) * any later version. * * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with wimlib; if not, see http://www.gnu.org/licenses/. */ #include "wimlib.h" #include "util.h" #include "endianness.h" #include #ifdef WITH_NTFS_3G #include #include #else #include #endif /* * NOTE: * * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from * libntfs-3g/unistr.c in the NTFS-3g sources. (Modified slightly to remove * unneeded functionality.) */ #ifndef WITH_NTFS_3G /* * Return the amount of 8-bit elements in UTF-8 needed (without the terminating * null) to store a given UTF-16LE string. * * Return -1 with errno set if string has invalid byte sequence or too long. */ static int utf16_to_utf8_size(const u16 *ins, const int ins_len) { int i, ret = -1; int count = 0; bool surrog; surrog = false; for (i = 0; i < ins_len && ins[i]; i++) { unsigned short c = le16_to_cpu(ins[i]); if (surrog) { if ((c >= 0xdc00) && (c < 0xe000)) { surrog = false; count += 4; } else goto fail; } else if (c < 0x80) count++; else if (c < 0x800) count += 2; else if (c < 0xd800) count += 3; else if (c < 0xdc00) surrog = true; #if NOREVBOM else if ((c >= 0xe000) && (c < 0xfffe)) #else else if (c >= 0xe000) #endif count += 3; else goto fail; } if (surrog) goto fail; ret = count; out: return ret; fail: errno = EILSEQ; goto out; } /* * Return the amount of 16-bit elements in UTF-16LE needed * (without the terminating null) to store given UTF-8 string. * * Return -1 with errno set if it's longer than PATH_MAX or string is invalid. * * Note: This does not check whether the input sequence is a valid utf8 string, * and should be used only in context where such check is made! */ static int utf8_to_utf16_size(const char *s) { unsigned int byte; size_t count = 0; while ((byte = *((const unsigned char *)s++))) { count++; if (byte >= 0xc0) { if (byte >= 0xF5) { errno = EILSEQ; return -1; } if (!*s) break; if (byte >= 0xC0) s++; if (!*s) break; if (byte >= 0xE0) s++; if (!*s) break; if (byte >= 0xF0) { s++; count++; } } } return count; } #endif /* !WITH_NTFS_3G */ /* Converts a string in the UTF-16LE encoding to a newly allocated string in the * UTF-8 encoding. * * If available, do so by calling a similar function from libntfs-3g. * Otherwise, use iconv() along with the helper function utf16_to_utf8_size(). */ int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes, char **utf8_str_ret, size_t *utf8_nbytes_ret) { int ret; if (utf16_nbytes == 0) { *utf8_str_ret = NULL; *utf8_nbytes_ret = 0; return 0; } if (utf16_nbytes & 1) { ERROR("UTF-16LE string is invalid (odd number of bytes)!"); return WIMLIB_ERR_INVALID_UTF16_STRING; } #ifdef WITH_NTFS_3G char *outs = NULL; int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str, utf16_nbytes / 2, &outs, 0); if (outs_len >= 0) { *utf8_str_ret = outs; *utf8_nbytes_ret = outs_len; ret = 0; } else { if (errno == ENOMEM) ret = WIMLIB_ERR_NOMEM; else ret = WIMLIB_ERR_INVALID_UTF16_STRING; } #else /* WITH_NTFS_3G */ static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1); if (cd_utf16_to_utf8 == (iconv_t)(-1)) { cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE"); if (cd_utf16_to_utf8 == (iconv_t)-1) { ERROR_WITH_ERRNO("Failed to get conversion descriptor " "for converting UTF-16LE to UTF-8"); if (errno == ENOMEM) return WIMLIB_ERR_NOMEM; else return WIMLIB_ERR_ICONV_NOT_AVAILABLE; } } ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2); if (ret >= 0) { size_t utf8_expected_nbytes; char *utf8_str; size_t utf8_bytes_left; size_t utf16_bytes_left; size_t num_chars_converted; char *utf8_str_save; const char *utf16_str_save; utf8_expected_nbytes = ret; utf8_str = MALLOC(utf8_expected_nbytes + 1); if (utf8_str) { utf8_bytes_left = utf8_expected_nbytes; utf16_bytes_left = utf16_nbytes; utf8_str_save = utf8_str; utf16_str_save = utf16_str; num_chars_converted = iconv(cd_utf16_to_utf8, (char**)&utf16_str, &utf16_bytes_left, &utf8_str, &utf8_bytes_left); utf8_str = utf8_str_save; utf16_str = utf16_str_save; if (utf16_bytes_left == 0 && utf8_bytes_left == 0 && num_chars_converted != (size_t)(-1)) { utf8_str[utf8_expected_nbytes] = '\0'; *utf8_str_ret = utf8_str; *utf8_nbytes_ret = utf8_expected_nbytes; ret = 0; } else { FREE(utf8_str); ret = WIMLIB_ERR_INVALID_UTF16_STRING; } } else ret = WIMLIB_ERR_NOMEM; } else ret = WIMLIB_ERR_INVALID_UTF16_STRING; #endif /* WITH_NTFS_3G */ #ifdef ENABLE_ERROR_MESSAGES if (ret != 0) { ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8"); ERROR("The failing string was:"); print_string(utf16_str, utf16_nbytes); putchar('\n'); } #endif /* ENABLE_ERROR_MESSAGES */ return ret; } /* Converts a string in the UTF-8 encoding to a newly allocated string in the * UTF-16 encoding. * * If available, do so by calling a similar function from libntfs-3g. * Otherwise, use iconv() along with the helper function utf8_to_utf16_size(). */ int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes, char **utf16_str_ret, size_t *utf16_nbytes_ret) { int ret; if (utf8_nbytes == 0) { *utf16_str_ret = NULL; *utf16_nbytes_ret = 0; return 0; } #ifdef WITH_NTFS_3G char *outs = NULL; int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs); if (outs_nchars >= 0) { *utf16_str_ret = outs; *utf16_nbytes_ret = (size_t)outs_nchars * 2; ret = 0; } else { if (errno == ENOMEM) ret = WIMLIB_ERR_NOMEM; else ret = WIMLIB_ERR_INVALID_UTF8_STRING; } #else /* WITH_NTFS_3G */ static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1); if (cd_utf8_to_utf16 == (iconv_t)(-1)) { cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8"); if (cd_utf8_to_utf16 == (iconv_t)-1) { ERROR_WITH_ERRNO("Failed to get conversion descriptor " "for converting UTF-8 to UTF-16LE"); if (errno == ENOMEM) return WIMLIB_ERR_NOMEM; else return WIMLIB_ERR_ICONV_NOT_AVAILABLE; } } ret = utf8_to_utf16_size(utf8_str); if (ret >= 0) { size_t utf16_expected_nbytes; char *utf16_str; size_t utf16_bytes_left; size_t utf8_bytes_left; size_t num_chars_converted; const char *utf8_str_save; char *utf16_str_save; utf16_expected_nbytes = (size_t)ret * 2; utf16_str = MALLOC(utf16_expected_nbytes + 2); if (utf16_str) { utf16_bytes_left = utf16_expected_nbytes; utf8_bytes_left = utf8_nbytes; utf8_str_save = utf8_str; utf16_str_save = utf16_str; num_chars_converted = iconv(cd_utf8_to_utf16, (char**)&utf8_str, &utf8_bytes_left, &utf16_str, &utf16_bytes_left); utf8_str = utf8_str_save; utf16_str = utf16_str_save; if (utf16_bytes_left == 0 && utf8_bytes_left == 0 && num_chars_converted != (size_t)(-1)) { utf16_str[utf16_expected_nbytes] = '\0'; utf16_str[utf16_expected_nbytes + 1] = '\0'; *utf16_str_ret = utf16_str; *utf16_nbytes_ret = utf16_expected_nbytes; ret = 0; } else { FREE(utf16_str); ret = WIMLIB_ERR_INVALID_UTF8_STRING; } } else ret = WIMLIB_ERR_NOMEM; } else ret = WIMLIB_ERR_INVALID_UTF8_STRING; #endif /* WITH_NTFS_3G */ #ifdef ENABLE_ERROR_MESSAGES if (ret != 0) { ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE"); ERROR("The failing string was:"); print_string(utf8_str, utf8_nbytes); putchar('\n'); ERROR("Length: %zu bytes", utf8_nbytes); } #endif /* ENABLE_ERROR_MESSAGES */ return ret; }