2 * encoding.c: Convert UTF-8 to UTF-16LE strings and vice versa
6 * Copyright (C) 2012 Eric Biggers
8 * This file is part of wimlib, a library for working with WIM files.
10 * wimlib is free software; you can redistribute it and/or modify it under the
11 * terms of the GNU General Public License as published by the Free
12 * Software Foundation; either version 3 of the License, or (at your option)
15 * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17 * A PARTICULAR PURPOSE. See the GNU General Public License for more
20 * You should have received a copy of the GNU General Public License
21 * along with wimlib; if not, see http://www.gnu.org/licenses/.
26 #include "endianness.h"
31 #include <ntfs-3g/volume.h>
32 #include <ntfs-3g/unistr.h>
40 * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
41 * libntfs-3g/unistr.c in the NTFS-3g sources. (Modified slightly to remove
42 * unneeded functionality.)
46 * Return the number of 8-bit elements in UTF-8 needed (without the terminating
47 * null) to store a given UTF-16LE string.
49 * Return -1 with errno set if the string has an invalid byte sequence or is too long.
51 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
58 for (i = 0; i < ins_len && ins[i]; i++) {
59 unsigned short c = le16_to_cpu(ins[i]);
61 if ((c >= 0xdc00) && (c < 0xe000)) {
76 else if ((c >= 0xe000) && (c < 0xfffe))
96 * Return the number of 16-bit elements in UTF-16LE needed
97 * (without the terminating null) to store a given UTF-8 string.
99 * Return -1 with errno set if the string is longer than PATH_MAX or is invalid.
101 * Note: This does not check whether the input sequence is a valid UTF-8 string,
102 * and should be used only in contexts where such a check is made!
104 static int utf8_to_utf16_size(const char *s)
108 while ((byte = *((const unsigned char *)s++))) {
133 #endif /* !WITH_NTFS_3G */
135 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
138 * If available, do so by calling a similar function from libntfs-3g.
139 * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
141 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
142 char **utf8_str_ret, size_t *utf8_nbytes_ret)
146 if (utf16_nbytes == 0) {
147 *utf8_str_ret = NULL;
148 *utf8_nbytes_ret = 0;
152 if (utf16_nbytes & 1) {
153 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
154 return WIMLIB_ERR_INVALID_UTF16_STRING;
158 int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
159 utf16_nbytes / 2, &outs, 0);
161 *utf8_str_ret = outs;
162 *utf8_nbytes_ret = outs_len;
166 ret = WIMLIB_ERR_NOMEM;
168 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
170 #else /* WITH_NTFS_3G */
171 static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
172 if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
173 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
174 if (cd_utf16_to_utf8 == (iconv_t)-1) {
175 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
176 "for converting UTF-16LE to UTF-8");
178 return WIMLIB_ERR_NOMEM;
180 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
183 ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
185 size_t utf8_expected_nbytes;
187 size_t utf8_bytes_left;
188 size_t utf16_bytes_left;
189 size_t num_chars_converted;
191 const char *utf16_str_save;
193 utf8_expected_nbytes = ret;
194 utf8_str = MALLOC(utf8_expected_nbytes + 1);
196 utf8_bytes_left = utf8_expected_nbytes;
197 utf16_bytes_left = utf16_nbytes;
198 utf8_str_save = utf8_str;
199 utf16_str_save = utf16_str;
200 num_chars_converted = iconv(cd_utf16_to_utf8,
205 utf8_str = utf8_str_save;
206 utf16_str = utf16_str_save;
207 if (utf16_bytes_left == 0 &&
208 utf8_bytes_left == 0 &&
209 num_chars_converted != (size_t)(-1))
211 utf8_str[utf8_expected_nbytes] = '\0';
212 *utf8_str_ret = utf8_str;
213 *utf8_nbytes_ret = utf8_expected_nbytes;
217 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
220 ret = WIMLIB_ERR_NOMEM;
222 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
223 #endif /* WITH_NTFS_3G */
225 #ifdef ENABLE_ERROR_MESSAGES
227 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
228 ERROR("The failing string was:");
229 print_string(utf16_str, utf16_nbytes);
232 #endif /* ENABLE_ERROR_MESSAGES */
237 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
240 * If available, do so by calling a similar function from libntfs-3g.
241 * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
243 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
244 char **utf16_str_ret, size_t *utf16_nbytes_ret)
247 if (utf8_nbytes == 0) {
248 *utf16_str_ret = NULL;
249 *utf16_nbytes_ret = 0;
254 int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
255 if (outs_nchars >= 0) {
256 *utf16_str_ret = outs;
257 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
261 ret = WIMLIB_ERR_NOMEM;
263 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
265 #else /* WITH_NTFS_3G */
266 static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
267 if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
268 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
269 if (cd_utf8_to_utf16 == (iconv_t)-1) {
270 ERROR_WITH_ERRNO("Failed to get conversion descriptor "
271 "for converting UTF-8 to UTF-16LE");
273 return WIMLIB_ERR_NOMEM;
275 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
279 ret = utf8_to_utf16_size(utf8_str);
281 size_t utf16_expected_nbytes;
283 size_t utf16_bytes_left;
284 size_t utf8_bytes_left;
285 size_t num_chars_converted;
286 const char *utf8_str_save;
287 char *utf16_str_save;
289 utf16_expected_nbytes = (size_t)ret * 2;
290 utf16_str = MALLOC(utf16_expected_nbytes + 2);
292 utf16_bytes_left = utf16_expected_nbytes;
293 utf8_bytes_left = utf8_nbytes;
294 utf8_str_save = utf8_str;
295 utf16_str_save = utf16_str;
296 num_chars_converted = iconv(cd_utf8_to_utf16,
301 utf8_str = utf8_str_save;
302 utf16_str = utf16_str_save;
303 if (utf16_bytes_left == 0 &&
304 utf8_bytes_left == 0 &&
305 num_chars_converted != (size_t)(-1))
307 utf16_str[utf16_expected_nbytes] = '\0';
308 utf16_str[utf16_expected_nbytes + 1] = '\0';
309 *utf16_str_ret = utf16_str;
310 *utf16_nbytes_ret = utf16_expected_nbytes;
314 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
317 ret = WIMLIB_ERR_NOMEM;
319 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
320 #endif /* WITH_NTFS_3G */
322 #ifdef ENABLE_ERROR_MESSAGES
324 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
325 ERROR("The failing string was:");
326 print_string(utf8_str, utf8_nbytes);
328 ERROR("Length: %zu bytes", utf8_nbytes);
330 #endif /* ENABLE_ERROR_MESSAGES */