96fb485a9b2bd60b5b7a7055f94f1c7618096e02
[wimlib] / src / encoding.c
1 /*
2  * encoding.c:  Convert UTF-8 to UTF-16LE strings and vice versa
3  */
4
5 /*
6  * Copyright (C) 2012, 2013 Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #include "wimlib.h"
25 #include "util.h"
26 #include "endianness.h"
27
28 #include <errno.h>
29
30 #ifdef WITH_NTFS_3G
31 #include <ntfs-3g/volume.h>
32 #include <ntfs-3g/unistr.h>
33 #else
34 #include <iconv.h>
35 #endif
36
37 /*
38  * NOTE:
39  *
40  * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
41  * libntfs-3g/unistr.c in the NTFS-3g sources.  (Modified slightly to remove
42  * unneeded functionality.)
43  */
44 #ifndef WITH_NTFS_3G
45 /*
46  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
47  * null) to store a given UTF-16LE string.
48  *
49  * Return -1 with errno set if string has invalid byte sequence or too long.
50  */
51 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
52 {
53         int i, ret = -1;
54         int count = 0;
55         bool surrog;
56
57         surrog = false;
58         for (i = 0; i < ins_len && ins[i]; i++) {
59                 unsigned short c = le16_to_cpu(ins[i]);
60                 if (surrog) {
61                         if ((c >= 0xdc00) && (c < 0xe000)) {
62                                 surrog = false;
63                                 count += 4;
64                         } else
65                                 goto fail;
66                 } else
67                         if (c < 0x80)
68                                 count++;
69                         else if (c < 0x800)
70                                 count += 2;
71                         else if (c < 0xd800)
72                                 count += 3;
73                         else if (c < 0xdc00)
74                                 surrog = true;
75 #if NOREVBOM
76                         else if ((c >= 0xe000) && (c < 0xfffe))
77 #else
78                         else if (c >= 0xe000)
79 #endif
80                                 count += 3;
81                         else
82                                 goto fail;
83         }
84         if (surrog)
85                 goto fail;
86
87         ret = count;
88 out:
89         return ret;
90 fail:
91         errno = EILSEQ;
92         goto out;
93 }
94
/*
 * Compute the number of 16-bit UTF-16 code units (not counting a terminating
 * null) needed to represent the null-terminated UTF-8 string @s.
 *
 * Returns the count on success.  Returns -1 with errno set to EILSEQ if a
 * lead byte of 0xF5 or greater is encountered.
 *
 * Note: apart from that one check, the input is NOT validated as UTF-8.  Use
 * this only where the string is validated separately (e.g. by the actual
 * conversion).  A multi-byte sequence cut short by the terminator is counted
 * as one code unit and ends the scan.
 */
static int utf8_to_utf16_size(const char *s)
{
        const unsigned char *p = (const unsigned char *)s;
        size_t nunits = 0;
        unsigned int lead;

        while ((lead = *p++) != 0) {
                int trail;

                nunits++;
                if (lead < 0xc0)
                        continue;       /* single byte (or stray continuation) */

                if (lead >= 0xF5) {
                        errno = EILSEQ;
                        return -1;
                }

                /* Number of continuation bytes implied by the lead byte. */
                if (lead >= 0xF0)
                        trail = 3;
                else if (lead >= 0xE0)
                        trail = 2;
                else
                        trail = 1;

                /* Skip them, but never step past the terminator. */
                while (trail > 0 && *p) {
                        p++;
                        trail--;
                }
                if (trail > 0)
                        break;          /* sequence truncated by end of string */

                /* 4-byte sequences become a surrogate pair: one extra unit. */
                if (lead >= 0xF0)
                        nunits++;
        }
        return nunits;
}
133 #endif /* !WITH_NTFS_3G */
134
135 #ifndef WITH_NTFS_3G
136 static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
137 static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
138
139 int iconv_global_init()
140 {
141         if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
142                 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
143                 if (cd_utf16_to_utf8 == (iconv_t)-1) {
144                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
145                                          "for converting UTF-16LE to UTF-8");
146                         if (errno == ENOMEM)
147                                 return WIMLIB_ERR_NOMEM;
148                         else
149                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
150                 }
151         }
152
153         if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
154                 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
155                 if (cd_utf8_to_utf16 == (iconv_t)-1) {
156                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
157                                          "for converting UTF-8 to UTF-16LE");
158                         if (errno == ENOMEM)
159                                 return WIMLIB_ERR_NOMEM;
160                         else
161                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
162                 }
163         }
164         return 0;
165 }
166
167 void iconv_global_cleanup()
168 {
169         if (cd_utf8_to_utf16 != (iconv_t)(-1))
170                 iconv_close(cd_utf8_to_utf16);
171         if (cd_utf16_to_utf8 != (iconv_t)(-1))
172                 iconv_close(cd_utf16_to_utf8);
173 }
174 #endif
175
176 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
177  * UTF-8 encoding.
178  *
179  * If available, do so by calling a similar function from libntfs-3g.
180  * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
181  */
182 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
183                   char **utf8_str_ret, size_t *utf8_nbytes_ret)
184 {
185         int ret;
186
187         if (utf16_nbytes == 0) {
188                 *utf8_str_ret = NULL;
189                 *utf8_nbytes_ret = 0;
190                 return 0;
191         }
192
193         if (utf16_nbytes & 1) {
194                 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
195                 return WIMLIB_ERR_INVALID_UTF16_STRING;
196         }
197 #ifdef WITH_NTFS_3G
198         char *outs = NULL;
199         int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
200                                      utf16_nbytes / 2, &outs, 0);
201         if (outs_len >= 0) {
202                 *utf8_str_ret = outs;
203                 *utf8_nbytes_ret = outs_len;
204                 ret = 0;
205         } else {
206                 if (errno == ENOMEM)
207                         ret = WIMLIB_ERR_NOMEM;
208                 else
209                         ret = WIMLIB_ERR_INVALID_UTF16_STRING;
210         }
211 #else /* !WITH_NTFS_3G */
212
213         ret = iconv_global_init();
214         if (ret != 0)
215                 return ret;
216
217         ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
218         if (ret >= 0) {
219                 size_t utf8_expected_nbytes;
220                 char  *utf8_str;
221                 size_t utf8_bytes_left;
222                 size_t utf16_bytes_left;
223                 size_t num_chars_converted;
224                 char  *utf8_str_save;
225                 const char *utf16_str_save;
226
227                 utf8_expected_nbytes = ret;
228                 utf8_str = MALLOC(utf8_expected_nbytes + 1);
229                 if (utf8_str) {
230                         utf8_bytes_left = utf8_expected_nbytes;
231                         utf16_bytes_left = utf16_nbytes;
232                         utf8_str_save = utf8_str;
233                         utf16_str_save = utf16_str;
234                         num_chars_converted = iconv(cd_utf16_to_utf8,
235                                                     (char**)&utf16_str,
236                                                     &utf16_bytes_left,
237                                                     &utf8_str,
238                                                     &utf8_bytes_left);
239                         utf8_str = utf8_str_save;
240                         utf16_str = utf16_str_save;
241                         if (utf16_bytes_left == 0 &&
242                             utf8_bytes_left == 0 &&
243                             num_chars_converted != (size_t)(-1))
244                         {
245                                 utf8_str[utf8_expected_nbytes] = '\0';
246                                 *utf8_str_ret = utf8_str;
247                                 *utf8_nbytes_ret = utf8_expected_nbytes;
248                                 ret = 0;
249                         } else {
250                                 FREE(utf8_str);
251                                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
252                         }
253                 } else
254                         ret = WIMLIB_ERR_NOMEM;
255         } else
256                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
257 #endif /* WITH_NTFS_3G */
258
259 #ifdef ENABLE_ERROR_MESSAGES
260         if (ret != 0) {
261                 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
262                 ERROR("The failing string was:");
263                 print_string(utf16_str, utf16_nbytes);
264                 putchar('\n');
265         }
266 #endif /* ENABLE_ERROR_MESSAGES */
267         return ret;
268 }
269
270
271 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
272  * UTF-16 encoding.
273  *
274  * If available, do so by calling a similar function from libntfs-3g.
275  * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
276  */
277 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
278                   char **utf16_str_ret, size_t *utf16_nbytes_ret)
279 {
280         int ret;
281         if (utf8_nbytes == 0) {
282                 *utf16_str_ret = NULL;
283                 *utf16_nbytes_ret = 0;
284                 return 0;
285         }
286 #ifdef WITH_NTFS_3G
287         char *outs = NULL;
288         int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
289         if (outs_nchars >= 0) {
290                 *utf16_str_ret = outs;
291                 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
292                 ret = 0;
293         } else {
294                 if (errno == ENOMEM)
295                         ret = WIMLIB_ERR_NOMEM;
296                 else
297                         ret = WIMLIB_ERR_INVALID_UTF8_STRING;
298         }
299 #else /* !WITH_NTFS_3G */
300
301         ret = iconv_global_init();
302         if (ret != 0)
303                 return ret;
304         ret = utf8_to_utf16_size(utf8_str);
305         if (ret >= 0) {
306                 size_t utf16_expected_nbytes;
307                 char  *utf16_str;
308                 size_t utf16_bytes_left;
309                 size_t utf8_bytes_left;
310                 size_t num_chars_converted;
311                 const char *utf8_str_save;
312                 char  *utf16_str_save;
313
314                 utf16_expected_nbytes = (size_t)ret * 2;
315                 utf16_str = MALLOC(utf16_expected_nbytes + 2);
316                 if (utf16_str) {
317                         utf16_bytes_left = utf16_expected_nbytes;
318                         utf8_bytes_left = utf8_nbytes;
319                         utf8_str_save = utf8_str;
320                         utf16_str_save = utf16_str;
321                         num_chars_converted = iconv(cd_utf8_to_utf16,
322                                                     (char**)&utf8_str,
323                                                     &utf8_bytes_left,
324                                                     &utf16_str,
325                                                     &utf16_bytes_left);
326                         utf8_str = utf8_str_save;
327                         utf16_str = utf16_str_save;
328                         if (utf16_bytes_left == 0 &&
329                             utf8_bytes_left == 0 &&
330                             num_chars_converted != (size_t)(-1))
331                         {
332                                 utf16_str[utf16_expected_nbytes] = '\0';
333                                 utf16_str[utf16_expected_nbytes + 1] = '\0';
334                                 *utf16_str_ret = utf16_str;
335                                 *utf16_nbytes_ret = utf16_expected_nbytes;
336                                 ret = 0;
337                         } else {
338                                 FREE(utf16_str);
339                                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
340                         }
341                 } else
342                         ret = WIMLIB_ERR_NOMEM;
343         } else
344                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
345 #endif /* WITH_NTFS_3G */
346
347 #ifdef ENABLE_ERROR_MESSAGES
348         if (ret != 0) {
349                 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
350                 ERROR("The failing string was:");
351                 print_string(utf8_str, utf8_nbytes);
352                 putchar('\n');
353                 ERROR("Length: %zu bytes", utf8_nbytes);
354         }
355 #endif /* ENABLE_ERROR_MESSAGES */
356         return ret;
357 }