2 * encoding.c: Convert UTF-8 to UTF-16LE strings and vice versa
6 * Copyright (C) 2012, 2013 Eric Biggers
8 * This file is part of wimlib, a library for working with WIM files.
10 * wimlib is free software; you can redistribute it and/or modify it under the
11 * terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option)
15 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17 * A PARTICULAR PURPOSE. See the GNU General Public License for more
20 * You should have received a copy of the GNU General Public License
21 * along with wimlib; if not, see http://www.gnu.org/licenses/.
26 #include "endianness.h"
31 #include <ntfs-3g/volume.h>
32 #include <ntfs-3g/unistr.h>
40 * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
41 * libntfs-3g/unistr.c in the NTFS-3g sources. (Modified slightly to remove
42 * unneeded functionality.)
46 * Return the number of 8-bit elements in UTF-8 needed (without the terminating
47 * null) to store a given UTF-16LE string.
49 * Return -1 with errno set if the string has an invalid byte sequence or is too long.
/*
 * NOTE(review): only fragments of this function are visible in this view
 * (several lines are missing), so the comments below describe just the
 * statements that can be seen.
 */
51 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
/* Walk at most ins_len 16-bit units, stopping early at a zero unit. */
58 for (i = 0; i < ins_len && ins[i]; i++) {
/* The input is UTF-16LE; convert each unit to CPU byte order before testing it. */
59 unsigned short c = le16_to_cpu(ins[i]);
/* Units in 0xdc00..0xdfff take their own branch (branch body not visible here). */
61 if ((c >= 0xdc00) && (c < 0xe000)) {
/* Units in 0xe000..0xfffd take their own branch (branch body not visible here). */
76 else if ((c >= 0xe000) && (c < 0xfffe))
96 * Return the number of 16-bit elements in UTF-16LE needed
97 * (without the terminating null) to store the given UTF-8 string.
99 * Return -1 with errno set if the string is longer than PATH_MAX or is invalid.
101 * Note: This does not check whether the input sequence is a valid UTF-8 string,
102 * and should be used only in contexts where such a check is made!
/*
 * NOTE(review): only the signature and the loop header are visible in this
 * view. The function takes no length argument and the loop below runs until
 * it reads a zero byte, so s has to be NUL-terminated.
 */
104 static int utf8_to_utf16_size(const char *s)
/* Fetch each byte as unsigned char and advance; a zero byte ends the loop. */
108 while ((byte = *((const unsigned char *)s++))) {
133 #endif /* !WITH_NTFS_3G */
/*
 * iconv conversion descriptors shared by the functions in this file.
 * (iconv_t)(-1) is the "not opened" value: iconv_global_init() only calls
 * iconv_open() for a descriptor that still holds it.
 */
136 static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
137 static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
/*
 * Open the two iconv conversion descriptors used by this file, unless they
 * are already open.
 *
 * A descriptor is opened only while it still holds (iconv_t)(-1), so calling
 * this function again does not reopen a descriptor that is already set. When
 * iconv_open() fails, an error is logged and either WIMLIB_ERR_NOMEM or
 * WIMLIB_ERR_ICONV_NOT_AVAILABLE is returned.
 *
 * NOTE(review): the lines that choose between those two error codes, and the
 * success return, are missing from this view; presumably the choice depends
 * on errno -- confirm against the full file.
 */
139 int iconv_global_init()
/* UTF-16LE -> UTF-8 descriptor. */
141 if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
142 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
143 if (cd_utf16_to_utf8 == (iconv_t)-1) {
144 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
145 "for converting UTF-16LE to UTF-8");
147 return WIMLIB_ERR_NOMEM;
149 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
/* UTF-8 -> UTF-16LE descriptor. */
153 if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
154 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
155 if (cd_utf8_to_utf16 == (iconv_t)-1) {
156 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
157 "for converting UTF-8 to UTF-16LE");
159 return WIMLIB_ERR_NOMEM;
161 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
167 void iconv_global_cleanup()
169 if (cd_utf8_to_utf16 != (iconv_t)(-1))
170 iconv_close(cd_utf8_to_utf16);
171 if (cd_utf16_to_utf8 != (iconv_t)(-1))
172 iconv_close(cd_utf16_to_utf8);
176 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
179 * If available, do so by calling a similar function from libntfs-3g.
180 * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
182 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
183 char **utf8_str_ret, size_t *utf8_nbytes_ret)
/*
 * NOTE(review): several lines of this function are missing from this view
 * (braces, some conditions, the final return). The comments below cover
 * only the statements that are visible.
 */
/* Empty input: hand back a NULL string with a length of 0. */
187 if (utf16_nbytes == 0) {
188 *utf8_str_ret = NULL;
189 *utf8_nbytes_ret = 0;
/* UTF-16 code units are 2 bytes each, so an odd byte count is rejected. */
193 if (utf16_nbytes & 1) {
194 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
195 return WIMLIB_ERR_INVALID_UTF16_STRING;
/* libntfs-3g path: the length is passed in 16-bit units, not in bytes. */
199 int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
200 utf16_nbytes / 2, &outs, 0);
202 *utf8_str_ret = outs;
203 *utf8_nbytes_ret = outs_len;
/* Error codes for the libntfs-3g path (the selecting conditions are not visible here). */
207 ret = WIMLIB_ERR_NOMEM;
209 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
211 #else /* !WITH_NTFS_3G */
/* iconv path: make sure the shared conversion descriptors are open. */
213 ret = iconv_global_init();
/*
 * Compute the UTF-8 output size up front.
 * NOTE(review): utf16_nbytes / 2 is a size_t, but utf16_to_utf8_size()
 * takes "const int ins_len", so a very large input would be narrowed --
 * confirm callers never pass such lengths.
 */
217 ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
219 size_t utf8_expected_nbytes;
221 size_t utf8_bytes_left;
222 size_t utf16_bytes_left;
223 size_t num_chars_converted;
225 const char *utf16_str_save;
227 utf8_expected_nbytes = ret;
/* One extra byte for the terminating NUL written after a successful conversion. */
228 utf8_str = MALLOC(utf8_expected_nbytes + 1);
230 utf8_bytes_left = utf8_expected_nbytes;
231 utf16_bytes_left = utf16_nbytes;
/*
 * Remember where both buffers start and restore them after the call,
 * presumably because iconv() advances the pointers passed to it (the
 * argument lines are not visible here).
 */
232 utf8_str_save = utf8_str;
233 utf16_str_save = utf16_str;
234 num_chars_converted = iconv(cd_utf16_to_utf8,
239 utf8_str = utf8_str_save;
240 utf16_str = utf16_str_save;
/* Success needs all input consumed, the output exactly filled, and no iconv() error. */
241 if (utf16_bytes_left == 0 &&
242 utf8_bytes_left == 0 &&
243 num_chars_converted != (size_t)(-1))
/* NUL-terminate; the byte count handed back excludes the terminator. */
245 utf8_str[utf8_expected_nbytes] = '\0';
246 *utf8_str_ret = utf8_str;
247 *utf8_nbytes_ret = utf8_expected_nbytes;
/* Error codes for the iconv path (the selecting conditions are not visible here). */
251 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
254 ret = WIMLIB_ERR_NOMEM;
256 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
257 #endif /* WITH_NTFS_3G */
/* Failure diagnostics, compiled in only when ENABLE_ERROR_MESSAGES is defined. */
259 #ifdef ENABLE_ERROR_MESSAGES
261 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
262 ERROR("The failing string was:");
263 print_string(utf16_str, utf16_nbytes);
266 #endif /* ENABLE_ERROR_MESSAGES */
271 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
274 * If available, do so by calling a similar function from libntfs-3g.
275 * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
277 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
278 char **utf16_str_ret, size_t *utf16_nbytes_ret)
/*
 * NOTE(review): several lines of this function are missing from this view
 * (braces, some conditions, the final return). The comments below cover
 * only the statements that are visible.
 *
 * NOTE(review): ntfs_mbstoucs() and utf8_to_utf16_size() are both called
 * without utf8_nbytes, and utf8_to_utf16_size() scans until a zero byte, so
 * utf8_str apparently has to be NUL-terminated even though a length is
 * supplied -- confirm against callers.
 */
/* Empty input: hand back a NULL string with a length of 0. */
281 if (utf8_nbytes == 0) {
282 *utf16_str_ret = NULL;
283 *utf16_nbytes_ret = 0;
/* libntfs-3g path: a non-negative result is a count of 16-bit units. */
288 int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
289 if (outs_nchars >= 0) {
290 *utf16_str_ret = outs;
/* Convert the unit count to a byte count. */
291 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
/* Error codes for the libntfs-3g path (the selecting conditions are not visible here). */
295 ret = WIMLIB_ERR_NOMEM;
297 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
299 #else /* !WITH_NTFS_3G */
/* iconv path: make sure the shared conversion descriptors are open. */
301 ret = iconv_global_init();
/* Compute the UTF-16 output size (in 16-bit units) up front. */
304 ret = utf8_to_utf16_size(utf8_str);
306 size_t utf16_expected_nbytes;
308 size_t utf16_bytes_left;
309 size_t utf8_bytes_left;
310 size_t num_chars_converted;
311 const char *utf8_str_save;
312 char *utf16_str_save;
/* Units to bytes. */
314 utf16_expected_nbytes = (size_t)ret * 2;
/* Two extra bytes for the 16-bit terminator written after a successful conversion. */
315 utf16_str = MALLOC(utf16_expected_nbytes + 2);
317 utf16_bytes_left = utf16_expected_nbytes;
318 utf8_bytes_left = utf8_nbytes;
/*
 * Remember where both buffers start and restore them after the call,
 * presumably because iconv() advances the pointers passed to it (the
 * argument lines are not visible here).
 */
319 utf8_str_save = utf8_str;
320 utf16_str_save = utf16_str;
321 num_chars_converted = iconv(cd_utf8_to_utf16,
326 utf8_str = utf8_str_save;
327 utf16_str = utf16_str_save;
/* Success needs all input consumed, the output exactly filled, and no iconv() error. */
328 if (utf16_bytes_left == 0 &&
329 utf8_bytes_left == 0 &&
330 num_chars_converted != (size_t)(-1))
/* Write a two-byte zero terminator; the byte count handed back excludes it. */
332 utf16_str[utf16_expected_nbytes] = '\0';
333 utf16_str[utf16_expected_nbytes + 1] = '\0';
334 *utf16_str_ret = utf16_str;
335 *utf16_nbytes_ret = utf16_expected_nbytes;
/* Error codes for the iconv path (the selecting conditions are not visible here). */
339 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
342 ret = WIMLIB_ERR_NOMEM;
344 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
345 #endif /* WITH_NTFS_3G */
/* Failure diagnostics, compiled in only when ENABLE_ERROR_MESSAGES is defined. */
347 #ifdef ENABLE_ERROR_MESSAGES
349 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
350 ERROR("The failing string was:");
351 print_string(utf8_str, utf8_nbytes);
353 ERROR("Length: %zu bytes", utf8_nbytes);
355 #endif /* ENABLE_ERROR_MESSAGES */