]> wimlib.net Git - wimlib/blob - src/encoding.c
bddb18104fda798426d73893972ffc0ea1ec4c12
[wimlib] / src / encoding.c
1 /*
2  * encoding.c:  Convert UTF-8 to UTF-16LE strings and vice versa
3  */
4
5 /*
6  * Copyright (C) 2012, 2013 Eric Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #include "config.h"
25 #include "wimlib.h"
26 #include "util.h"
27 #include "endianness.h"
28
29 #include <errno.h>
30
31 #ifdef WITH_NTFS_3G
32 #  include <ntfs-3g/volume.h>
33 #  include <ntfs-3g/unistr.h>
34 #elif defined(__WIN32__)
35 #  include <wchar.h>
36 #  include <stdlib.h>
37 #else
38 #  include <iconv.h>
39 #endif
40
41 /*
42  * NOTE:
43  *
44  * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
45  * libntfs-3g/unistr.c in the NTFS-3g sources.  (Modified slightly to remove
46  * unneeded functionality.)
47  */
48 #if !defined(WITH_NTFS_3G) && !defined(__WIN32__)
49 /*
50  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
51  * null) to store a given UTF-16LE string.
52  *
53  * Return -1 with errno set if string has invalid byte sequence or too long.
54  */
55 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
56 {
57         int i, ret = -1;
58         int count = 0;
59         bool surrog;
60
61         surrog = false;
62         for (i = 0; i < ins_len && ins[i]; i++) {
63                 unsigned short c = le16_to_cpu(ins[i]);
64                 if (surrog) {
65                         if ((c >= 0xdc00) && (c < 0xe000)) {
66                                 surrog = false;
67                                 count += 4;
68                         } else
69                                 goto fail;
70                 } else
71                         if (c < 0x80)
72                                 count++;
73                         else if (c < 0x800)
74                                 count += 2;
75                         else if (c < 0xd800)
76                                 count += 3;
77                         else if (c < 0xdc00)
78                                 surrog = true;
79 #if NOREVBOM
80                         else if ((c >= 0xe000) && (c < 0xfffe))
81 #else
82                         else if (c >= 0xe000)
83 #endif
84                                 count += 3;
85                         else
86                                 goto fail;
87         }
88         if (surrog)
89                 goto fail;
90
91         ret = count;
92 out:
93         return ret;
94 fail:
95         errno = EILSEQ;
96         goto out;
97 }
98
/*
 * Compute how many 16-bit UTF-16 code units (terminating null excluded) are
 * needed to hold the null-terminated UTF-8 string @s.
 *
 * Every lead byte counts as one code unit; a lead byte of 0xF0 or above whose
 * three following bytes are all present counts as two (a surrogate pair).
 * A multi-byte sequence cut short by the terminating null ends the scan.
 *
 * Returns the count, or -1 with errno set to EILSEQ if a byte of 0xF5 or
 * above appears where a lead byte is expected.
 *
 * Note: the bytes following a lead byte are skipped without being inspected,
 *       so this is not a UTF-8 validity check; use it only where the string
 *       is validated by other means.
 */
static int utf8_to_utf16_size(const char *s)
{
        const unsigned char *p = (const unsigned char *)s;
        size_t nunits = 0;

        while (*p) {
                unsigned int lead = *p++;
                unsigned int ntrail;

                nunits++;
                if (lead < 0xc0)
                        continue;

                if (lead >= 0xF5) {
                        errno = EILSEQ;
                        return -1;
                }

                /* Number of bytes expected to follow this lead byte. */
                if (lead >= 0xF0)
                        ntrail = 3;
                else if (lead >= 0xE0)
                        ntrail = 2;
                else
                        ntrail = 1;

                for (; ntrail != 0; ntrail--) {
                        /* Truncated sequence: stop counting here. */
                        if (!*p)
                                return nunits;
                        p++;
                }

                /* A complete 4-byte sequence needs a surrogate pair. */
                if (lead >= 0xF0)
                        nunits++;
        }
        return nunits;
}
137
138 static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
139 static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
140
141 int iconv_global_init()
142 {
143         if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
144                 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
145                 if (cd_utf16_to_utf8 == (iconv_t)-1) {
146                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
147                                          "for converting UTF-16LE to UTF-8");
148                         if (errno == ENOMEM)
149                                 return WIMLIB_ERR_NOMEM;
150                         else
151                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
152                 }
153         }
154
155         if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
156                 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
157                 if (cd_utf8_to_utf16 == (iconv_t)-1) {
158                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
159                                          "for converting UTF-8 to UTF-16LE");
160                         if (errno == ENOMEM)
161                                 return WIMLIB_ERR_NOMEM;
162                         else
163                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
164                 }
165         }
166         return 0;
167 }
168
169 void iconv_global_cleanup()
170 {
171         if (cd_utf8_to_utf16 != (iconv_t)(-1))
172                 iconv_close(cd_utf8_to_utf16);
173         if (cd_utf16_to_utf8 != (iconv_t)(-1))
174                 iconv_close(cd_utf16_to_utf8);
175 }
176 #endif /* !WITH_NTFS_3G && !__WIN32__ */
177
178 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
179  * UTF-8 encoding.
180  *
181  * If available, do so by calling a similar function from libntfs-3g.
182  * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
183  */
184 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
185                   char **utf8_str_ret, size_t *utf8_nbytes_ret)
186 {
187         int ret;
188
189         if (utf16_nbytes == 0) {
190                 *utf8_str_ret = NULL;
191                 *utf8_nbytes_ret = 0;
192                 return 0;
193         }
194
195         if (utf16_nbytes & 1) {
196                 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
197                 return WIMLIB_ERR_INVALID_UTF16_STRING;
198         }
199 #ifdef WITH_NTFS_3G
200         char *outs = NULL;
201         int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
202                                      utf16_nbytes / 2, &outs, 0);
203         if (outs_len >= 0) {
204                 *utf8_str_ret = outs;
205                 *utf8_nbytes_ret = outs_len;
206                 ret = 0;
207         } else {
208                 if (errno == ENOMEM)
209                         ret = WIMLIB_ERR_NOMEM;
210                 else
211                         ret = WIMLIB_ERR_INVALID_UTF16_STRING;
212         }
213 #elif defined(__WIN32__)
214         char *utf8_str;
215         size_t utf8_nbytes;
216         utf8_nbytes = wcstombs(NULL, (const wchar_t*)utf16_str, 0);
217         if (utf8_nbytes == (size_t)(-1)) {
218                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
219         } else {
220                 utf8_str = MALLOC(utf8_nbytes + 1);
221                 if (!utf8_str) {
222                         ret = WIMLIB_ERR_NOMEM;
223                 } else {
224                         wcstombs(utf8_str, (const wchar_t*)utf16_str, utf8_nbytes + 1);
225                         *utf8_str_ret = utf8_str;
226                         *utf8_nbytes_ret = utf8_nbytes;
227                         ret = 0;
228                 }
229         }
230 #else
231         ret = iconv_global_init();
232         if (ret != 0)
233                 return ret;
234
235         ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
236         if (ret >= 0) {
237                 size_t utf8_expected_nbytes;
238                 char  *utf8_str;
239                 size_t utf8_bytes_left;
240                 size_t utf16_bytes_left;
241                 size_t num_chars_converted;
242                 char  *utf8_str_save;
243                 const char *utf16_str_save;
244
245                 utf8_expected_nbytes = ret;
246                 utf8_str = MALLOC(utf8_expected_nbytes + 1);
247                 if (utf8_str) {
248                         utf8_bytes_left = utf8_expected_nbytes;
249                         utf16_bytes_left = utf16_nbytes;
250                         utf8_str_save = utf8_str;
251                         utf16_str_save = utf16_str;
252                         num_chars_converted = iconv(cd_utf16_to_utf8,
253                                                     (char**)&utf16_str,
254                                                     &utf16_bytes_left,
255                                                     &utf8_str,
256                                                     &utf8_bytes_left);
257                         utf8_str = utf8_str_save;
258                         utf16_str = utf16_str_save;
259                         if (utf16_bytes_left == 0 &&
260                             utf8_bytes_left == 0 &&
261                             num_chars_converted != (size_t)(-1))
262                         {
263                                 utf8_str[utf8_expected_nbytes] = '\0';
264                                 *utf8_str_ret = utf8_str;
265                                 *utf8_nbytes_ret = utf8_expected_nbytes;
266                                 ret = 0;
267                         } else {
268                                 FREE(utf8_str);
269                                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
270                         }
271                 } else
272                         ret = WIMLIB_ERR_NOMEM;
273         } else
274                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
275 #endif /* WITH_NTFS_3G */
276
277 #ifdef ENABLE_ERROR_MESSAGES
278         if (ret != 0) {
279                 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
280                 ERROR("The failing string was:");
281                 print_string(utf16_str, utf16_nbytes);
282                 putchar('\n');
283         }
284 #endif /* ENABLE_ERROR_MESSAGES */
285         return ret;
286 }
287
288
289 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
290  * UTF-16 encoding.
291  *
292  * If available, do so by calling a similar function from libntfs-3g.
293  * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
294  */
295 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
296                   char **utf16_str_ret, size_t *utf16_nbytes_ret)
297 {
298         int ret;
299         if (utf8_nbytes == 0) {
300                 *utf16_str_ret = NULL;
301                 *utf16_nbytes_ret = 0;
302                 return 0;
303         }
304 #ifdef WITH_NTFS_3G
305         char *outs = NULL;
306         int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
307         if (outs_nchars >= 0) {
308                 *utf16_str_ret = outs;
309                 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
310                 ret = 0;
311         } else {
312                 if (errno == ENOMEM)
313                         ret = WIMLIB_ERR_NOMEM;
314                 else
315                         ret = WIMLIB_ERR_INVALID_UTF8_STRING;
316         }
317 #elif defined(__WIN32__)
318
319         char *utf16_str;
320         size_t utf16_nchars;
321         utf16_nchars = mbstowcs(NULL, utf8_str, 0);
322         if (utf16_nchars == (size_t)(-1)) {
323                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
324         } else {
325                 utf16_str = MALLOC((utf16_nchars + 1) * sizeof(wchar_t));
326                 if (!utf16_str) {
327                         ret = WIMLIB_ERR_NOMEM;
328                 } else {
329                         mbstowcs((wchar_t*)utf16_str, utf8_str,
330                                  utf16_nchars + 1);
331                         *utf16_str_ret = utf16_str;
332                         *utf16_nbytes_ret = utf16_nchars * sizeof(wchar_t);
333                         ret = 0;
334                 }
335         }
336         
337 #else
338         ret = iconv_global_init();
339         if (ret != 0)
340                 return ret;
341         ret = utf8_to_utf16_size(utf8_str);
342         if (ret >= 0) {
343                 size_t utf16_expected_nbytes;
344                 char  *utf16_str;
345                 size_t utf16_bytes_left;
346                 size_t utf8_bytes_left;
347                 size_t num_chars_converted;
348                 const char *utf8_str_save;
349                 char  *utf16_str_save;
350
351                 utf16_expected_nbytes = (size_t)ret * 2;
352                 utf16_str = MALLOC(utf16_expected_nbytes + 2);
353                 if (utf16_str) {
354                         utf16_bytes_left = utf16_expected_nbytes;
355                         utf8_bytes_left = utf8_nbytes;
356                         utf8_str_save = utf8_str;
357                         utf16_str_save = utf16_str;
358                         num_chars_converted = iconv(cd_utf8_to_utf16,
359                                                     (char**)&utf8_str,
360                                                     &utf8_bytes_left,
361                                                     &utf16_str,
362                                                     &utf16_bytes_left);
363                         utf8_str = utf8_str_save;
364                         utf16_str = utf16_str_save;
365                         if (utf16_bytes_left == 0 &&
366                             utf8_bytes_left == 0 &&
367                             num_chars_converted != (size_t)(-1))
368                         {
369                                 utf16_str[utf16_expected_nbytes] = '\0';
370                                 utf16_str[utf16_expected_nbytes + 1] = '\0';
371                                 *utf16_str_ret = utf16_str;
372                                 *utf16_nbytes_ret = utf16_expected_nbytes;
373                                 ret = 0;
374                         } else {
375                                 FREE(utf16_str);
376                                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
377                         }
378                 } else
379                         ret = WIMLIB_ERR_NOMEM;
380         } else
381                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
382 #endif /* WITH_NTFS_3G */
383
384 #ifdef ENABLE_ERROR_MESSAGES
385         if (ret != 0) {
386                 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
387                 ERROR("The failing string was:");
388                 print_string(utf8_str, utf8_nbytes);
389                 putchar('\n');
390                 ERROR("Length: %zu bytes", utf8_nbytes);
391         }
392 #endif /* ENABLE_ERROR_MESSAGES */
393         return ret;
394 }