2 * encoding.c: Convert UTF-8 to UTF-16LE strings and vice versa
6 * Copyright (C) 2012, 2013 Eric Biggers
8 * This file is part of wimlib, a library for working with WIM files.
10 * wimlib is free software; you can redistribute it and/or modify it under the
11 * terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option)
15 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17 * A PARTICULAR PURPOSE. See the GNU General Public License for more
20 * You should have received a copy of the GNU General Public License
21 * along with wimlib; if not, see http://www.gnu.org/licenses/.
27 #include "endianness.h"
32 # include <ntfs-3g/volume.h>
33 # include <ntfs-3g/unistr.h>
34 #elif defined(__WIN32__)
44 * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
45 * libntfs-3g/unistr.c in the NTFS-3g sources. (Modified slightly to remove
46 * unneeded functionality.)
48 #if !defined(WITH_NTFS_3G) && !defined(__WIN32__)
50 * Return the number of 8-bit elements in UTF-8 needed (without the terminating
51 * null) to store a given UTF-16LE string.
53 * Return -1 with errno set if the string has an invalid byte sequence or is too long.
/*
 * Size helper for the iconv() code path: examines up to `ins_len` 16-bit
 * units of `ins` and classifies each one by value.
 *
 * The loop also stops at the first zero unit, so anything following an
 * embedded null is never examined.
 *
 * NOTE(review): only part of this function is visible here (declarations,
 * most branches and the return paths are missing); the comments below
 * describe just the lines shown.
 */
55 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
/* Stop at ins_len units or at the first zero unit, whichever comes first. */
62 for (i = 0; i < ins_len && ins[i]; i++) {
/* le16_to_cpu(): presumably little-endian to host byte order -- confirm. */
63 unsigned short c = le16_to_cpu(ins[i]);
/* 0xdc00..0xdfff is the UTF-16 low (trailing) surrogate range. */
65 if ((c >= 0xdc00) && (c < 0xe000)) {
/* 0xe000..0xfffd: the upper bound leaves out 0xfffe and 0xffff. */
80 else if ((c >= 0xe000) && (c < 0xfffe))
100 * Return the number of 16-bit elements in UTF-16LE needed
101 * (without the terminating null) to store the given UTF-8 string.
103 * Return -1 with errno set if it is longer than PATH_MAX or the string is invalid.
105 * Note: This does not check whether the input sequence is a valid UTF-8 string,
106 * and should be used only in contexts where such a check is made!
/*
 * Size helper for the iconv() code path: scans the string `s` one byte at a
 * time.  No length is passed in; the scan ends at the first zero byte.
 *
 * NOTE(review): only the signature and the loop header are visible here; the
 * counting logic and return paths are not shown.
 */
108 static int utf8_to_utf16_size(const char *s)
/*
 * Each byte is read through an unsigned char pointer, so the value fetched is
 * never negative; a zero byte terminates the loop.
 */
112 while ((byte = *((const unsigned char *)s++))) {
/*
 * File-scope iconv conversion descriptors, one per direction.  Both start as
 * (iconv_t)(-1), which the rest of this file treats as "not opened":
 * iconv_global_init() opens a descriptor only while it holds that value, and
 * iconv_global_cleanup() closes a descriptor only when it does not.
 */
138 static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
139 static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
/*
 * Opens the two file-scope iconv descriptors if they are still unset.
 *
 * A descriptor is opened only while it still holds (iconv_t)(-1), so one that
 * is already open is left untouched.  If iconv_open() fails, an error message
 * is logged and either WIMLIB_ERR_NOMEM or WIMLIB_ERR_ICONV_NOT_AVAILABLE is
 * returned.
 *
 * NOTE(review): the condition that picks between those two error codes and
 * the success return are not visible here -- presumably errno decides and 0
 * is returned on success; confirm.
 */
141 int iconv_global_init()
/* UTF-16LE -> UTF-8 descriptor. */
143 if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
/* iconv_open() takes the target encoding first, then the source encoding. */
144 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
/* Still (iconv_t)-1 after the call: iconv_open() failed. */
145 if (cd_utf16_to_utf8 == (iconv_t)-1) {
146 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
147 "for converting UTF-16LE to UTF-8");
149 return WIMLIB_ERR_NOMEM;
151 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
/* UTF-8 -> UTF-16LE descriptor, handled the same way. */
155 if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
156 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
157 if (cd_utf8_to_utf16 == (iconv_t)-1) {
158 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
159 "for converting UTF-8 to UTF-16LE");
161 return WIMLIB_ERR_NOMEM;
163 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
/*
 * Closes each file-scope iconv descriptor whose value is not (iconv_t)(-1).
 *
 * NOTE(review): in the lines shown, neither descriptor is set back to
 * (iconv_t)(-1) after iconv_close().  If that is really the case, a later
 * iconv_global_init() would skip reopening them and conversions would use a
 * closed descriptor -- confirm, and reset both variables here if so.
 */
169 void iconv_global_cleanup()
171 if (cd_utf8_to_utf16 != (iconv_t)(-1))
172 iconv_close(cd_utf8_to_utf16);
173 if (cd_utf16_to_utf8 != (iconv_t)(-1))
174 iconv_close(cd_utf16_to_utf8);
176 #endif /* !WITH_NTFS_3G && !__WIN32__ */
178 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
181 * If available, do so by calling a similar function from libntfs-3g.
182 * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
/*
 * Converts `utf16_nbytes` bytes of UTF-16LE text at `utf16_str` into UTF-8
 * and reports the result through `utf8_str_ret` and `utf8_nbytes_ret`.
 *
 * One of three backends is selected by the preprocessor: ntfs_ucstombs()
 * (the WITH_NTFS_3G branch), wcstombs() under __WIN32__, or iconv() together
 * with utf16_to_utf8_size() otherwise.
 *
 * Behaviour visible in this function:
 *   - zero-length input stores NULL in *utf8_str_ret and 0 in
 *     *utf8_nbytes_ret;
 *   - an odd byte count is rejected with WIMLIB_ERR_INVALID_UTF16_STRING;
 *   - failures set WIMLIB_ERR_NOMEM or WIMLIB_ERR_INVALID_UTF16_STRING;
 *   - on the Win32 and iconv paths the output buffer comes from MALLOC() with
 *     one extra byte beyond the reported length.
 *
 * NOTE(review): a number of lines (local declarations, several conditions,
 * the iconv() argument list, cleanup and the final return) are not visible
 * here.  Who frees the returned buffer is also not shown -- presumably the
 * caller; confirm.
 */
184 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
185 char **utf8_str_ret, size_t *utf8_nbytes_ret)
/* Empty input: hand back a NULL string of length 0. */
189 if (utf16_nbytes == 0) {
190 *utf8_str_ret = NULL;
191 *utf8_nbytes_ret = 0;
/* UTF-16 code units are two bytes each, so an odd byte count cannot be valid. */
195 if (utf16_nbytes & 1) {
196 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
197 return WIMLIB_ERR_INVALID_UTF16_STRING;
/* libntfs-3g backend: the length is passed in 16-bit units, not bytes. */
201 int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
202 utf16_nbytes / 2, &outs, 0);
204 *utf8_str_ret = outs;
205 *utf8_nbytes_ret = outs_len;
/* Failure codes; the condition choosing between them is not visible here. */
209 ret = WIMLIB_ERR_NOMEM;
211 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
213 #elif defined(__WIN32__)
/*
 * Win32 backend.  The first wcstombs() call has a NULL destination, so it
 * only measures the output.
 * NOTE(review): wcstombs() is given no input length, so this path relies on
 * utf16_str being null-terminated; utf16_nbytes is not used in the lines
 * shown.  wcstombs() also encodes according to the current C locale --
 * confirm that this actually yields UTF-8 on the target platform.
 */
216 utf8_nbytes = wcstombs(NULL, (const wchar_t*)utf16_str, 0);
/* (size_t)(-1) is wcstombs()'s error return. */
217 if (utf8_nbytes == (size_t)(-1)) {
218 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
/* One extra byte for the terminating null. */
220 utf8_str = MALLOC(utf8_nbytes + 1);
222 ret = WIMLIB_ERR_NOMEM;
/* Second call performs the real conversion; its return value is not checked. */
224 wcstombs(utf8_str, (const wchar_t*)utf16_str, utf8_nbytes + 1);
225 *utf8_str_ret = utf8_str;
226 *utf8_nbytes_ret = utf8_nbytes;
/* iconv backend: make sure the conversion descriptors are open first. */
231 ret = iconv_global_init();
/*
 * Measure the UTF-8 output before allocating.
 * NOTE(review): utf16_to_utf8_size() stops at the first zero code unit, while
 * iconv() below is given the full utf16_nbytes; input with an embedded null
 * would therefore fail the exact-size check further down.
 */
235 ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
237 size_t utf8_expected_nbytes;
239 size_t utf8_bytes_left;
240 size_t utf16_bytes_left;
241 size_t num_chars_converted;
243 const char *utf16_str_save;
/*
 * ret (an int) becomes a size_t here; presumably a negative result was
 * rejected on a line not shown -- confirm.
 */
245 utf8_expected_nbytes = ret;
/* One extra byte for the terminating null written after conversion. */
246 utf8_str = MALLOC(utf8_expected_nbytes + 1);
248 utf8_bytes_left = utf8_expected_nbytes;
249 utf16_bytes_left = utf16_nbytes;
/*
 * iconv() advances the buffer pointers it is given, so the originals are
 * saved here and restored after the call.
 */
250 utf8_str_save = utf8_str;
251 utf16_str_save = utf16_str;
252 num_chars_converted = iconv(cd_utf16_to_utf8,
257 utf8_str = utf8_str_save;
258 utf16_str = utf16_str_save;
/*
 * Success requires all three: every input byte consumed, the output buffer
 * filled exactly as predicted, and no error return from iconv().
 */
259 if (utf16_bytes_left == 0 &&
260 utf8_bytes_left == 0 &&
261 num_chars_converted != (size_t)(-1))
/* Terminate the string; the reported length excludes the terminator. */
263 utf8_str[utf8_expected_nbytes] = '\0';
264 *utf8_str_ret = utf8_str;
265 *utf8_nbytes_ret = utf8_expected_nbytes;
/* Failure codes for the iconv path; the surrounding conditions are not shown. */
269 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
272 ret = WIMLIB_ERR_NOMEM;
274 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
275 #endif /* WITH_NTFS_3G */
/* Diagnostics, compiled in only when error messages are enabled. */
277 #ifdef ENABLE_ERROR_MESSAGES
279 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
280 ERROR("The failing string was:");
/* print_string(): presumably dumps the raw input bytes -- confirm. */
281 print_string(utf16_str, utf16_nbytes);
284 #endif /* ENABLE_ERROR_MESSAGES */
289 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
292 * If available, do so by calling a similar function from libntfs-3g.
293 * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
/*
 * Converts `utf8_nbytes` bytes of UTF-8 text at `utf8_str` into UTF-16LE and
 * reports the result through `utf16_str_ret` and `utf16_nbytes_ret` (the
 * length is in bytes).
 *
 * One of three backends is selected by the preprocessor: ntfs_mbstoucs()
 * (the WITH_NTFS_3G branch), mbstowcs() under __WIN32__, or iconv() together
 * with utf8_to_utf16_size() otherwise.
 *
 * Behaviour visible in this function:
 *   - zero-length input stores NULL in *utf16_str_ret and 0 in
 *     *utf16_nbytes_ret;
 *   - failures set WIMLIB_ERR_NOMEM or WIMLIB_ERR_INVALID_UTF8_STRING;
 *   - on the Win32 and iconv paths the output buffer comes from MALLOC() with
 *     room for one extra 16-bit unit beyond the reported length.
 *
 * NOTE(review): a number of lines (local declarations, several conditions,
 * the iconv() argument list, cleanup and the final return) are not visible
 * here.  Who frees the returned buffer is also not shown -- presumably the
 * caller; confirm.
 */
295 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
296 char **utf16_str_ret, size_t *utf16_nbytes_ret)
/* Empty input: hand back a NULL string of length 0. */
299 if (utf8_nbytes == 0) {
300 *utf16_str_ret = NULL;
301 *utf16_nbytes_ret = 0;
/*
 * libntfs-3g backend.
 * NOTE(review): ntfs_mbstoucs() is given no length, so this path relies on
 * utf8_str being null-terminated; utf8_nbytes is not passed to it.
 */
306 int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
/* A non-negative result is a count of 16-bit units. */
307 if (outs_nchars >= 0) {
308 *utf16_str_ret = outs;
/* Convert the unit count to bytes. */
309 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
/* Failure codes; the condition choosing between them is not visible here. */
313 ret = WIMLIB_ERR_NOMEM;
315 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
317 #elif defined(__WIN32__)
/*
 * Win32 backend.  The first mbstowcs() call has a NULL destination, so it
 * only measures the output.
 * NOTE(review): mbstowcs() is given no input length, so this path relies on
 * utf8_str being null-terminated.  mbstowcs() also decodes according to the
 * current C locale -- confirm that the input is really treated as UTF-8 on
 * the target platform.
 */
321 utf16_nchars = mbstowcs(NULL, utf8_str, 0);
/* (size_t)(-1) is mbstowcs()'s error return. */
322 if (utf16_nchars == (size_t)(-1)) {
323 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
/* One extra wchar_t for the terminating null. */
325 utf16_str = MALLOC((utf16_nchars + 1) * sizeof(wchar_t));
327 ret = WIMLIB_ERR_NOMEM;
/* Second call performs the real conversion. */
329 mbstowcs((wchar_t*)utf16_str, utf8_str,
331 *utf16_str_ret = utf16_str;
/* Reported length is in bytes and excludes the terminator. */
332 *utf16_nbytes_ret = utf16_nchars * sizeof(wchar_t);
/* iconv backend: make sure the conversion descriptors are open first. */
338 ret = iconv_global_init();
/*
 * Measure the UTF-16 output before allocating.
 * NOTE(review): utf8_to_utf16_size() takes no length and scans to the first
 * zero byte, while iconv() below is given utf8_nbytes; the two agree only if
 * utf8_str is null-terminated exactly at utf8_nbytes.
 */
341 ret = utf8_to_utf16_size(utf8_str);
343 size_t utf16_expected_nbytes;
345 size_t utf16_bytes_left;
346 size_t utf8_bytes_left;
347 size_t num_chars_converted;
348 const char *utf8_str_save;
349 char *utf16_str_save;
/*
 * ret counts 16-bit units; double it for bytes.  Presumably a negative ret
 * was rejected on a line not shown -- confirm.
 */
351 utf16_expected_nbytes = (size_t)ret * 2;
/* Two extra bytes for the 16-bit null terminator written after conversion. */
352 utf16_str = MALLOC(utf16_expected_nbytes + 2);
354 utf16_bytes_left = utf16_expected_nbytes;
355 utf8_bytes_left = utf8_nbytes;
/*
 * iconv() advances the buffer pointers it is given, so the originals are
 * saved here and restored after the call.
 */
356 utf8_str_save = utf8_str;
357 utf16_str_save = utf16_str;
358 num_chars_converted = iconv(cd_utf8_to_utf16,
363 utf8_str = utf8_str_save;
364 utf16_str = utf16_str_save;
/*
 * Success requires all three: every input byte consumed, the output buffer
 * filled exactly as predicted, and no error return from iconv().
 */
365 if (utf16_bytes_left == 0 &&
366 utf8_bytes_left == 0 &&
367 num_chars_converted != (size_t)(-1))
/* Write the two-byte terminator; the reported length excludes it. */
369 utf16_str[utf16_expected_nbytes] = '\0';
370 utf16_str[utf16_expected_nbytes + 1] = '\0';
371 *utf16_str_ret = utf16_str;
372 *utf16_nbytes_ret = utf16_expected_nbytes;
/* Failure codes for the iconv path; the surrounding conditions are not shown. */
376 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
379 ret = WIMLIB_ERR_NOMEM;
381 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
382 #endif /* WITH_NTFS_3G */
/* Diagnostics, compiled in only when error messages are enabled. */
384 #ifdef ENABLE_ERROR_MESSAGES
386 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
387 ERROR("The failing string was:");
/* print_string(): presumably dumps the raw input bytes -- confirm. */
388 print_string(utf8_str, utf8_nbytes);
390 ERROR("Length: %zu bytes", utf8_nbytes);
392 #endif /* ENABLE_ERROR_MESSAGES */