edb24c70796ff3efc73b36d16aad812df2e12c9a
[wimlib] / src / encoding.c
1 /*
2  * encoding.c:  Convert UTF-8 to UTF-16LE strings and vice versa
3  */
4
5 /*
6  * Copyright (C) 2012 Eric Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #include "wimlib.h"
25 #include "util.h"
26 #include "endianness.h"
27
28 #include <errno.h>
29
30 #ifdef WITH_NTFS_3G
31 #include <ntfs-3g/volume.h>
32 #include <ntfs-3g/unistr.h>
33 #else
34 #include <iconv.h>
35 #endif
36
37 /*
38  * NOTE:
39  *
40  * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
41  * libntfs-3g/unistr.c in the NTFS-3g sources.  (Modified slightly to remove
42  * unneeded functionality.)
43  */
44 #ifndef WITH_NTFS_3G
45 /*
46  * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
47  * null) to store a given UTF-16LE string.
48  *
49  * Return -1 with errno set if string has invalid byte sequence or too long.
50  */
51 static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
52 {
53         int i, ret = -1;
54         int count = 0;
55         bool surrog;
56
57         surrog = false;
58         for (i = 0; i < ins_len && ins[i]; i++) {
59                 unsigned short c = le16_to_cpu(ins[i]);
60                 if (surrog) {
61                         if ((c >= 0xdc00) && (c < 0xe000)) {
62                                 surrog = false;
63                                 count += 4;
64                         } else
65                                 goto fail;
66                 } else
67                         if (c < 0x80)
68                                 count++;
69                         else if (c < 0x800)
70                                 count += 2;
71                         else if (c < 0xd800)
72                                 count += 3;
73                         else if (c < 0xdc00)
74                                 surrog = true;
75 #if NOREVBOM
76                         else if ((c >= 0xe000) && (c < 0xfffe))
77 #else
78                         else if (c >= 0xe000)
79 #endif
80                                 count += 3;
81                         else
82                                 goto fail;
83         }
84         if (surrog)
85                 goto fail;
86
87         ret = count;
88 out:
89         return ret;
90 fail:
91         errno = EILSEQ;
92         goto out;
93 }
94
/*
 * Compute how many 16-bit code units of UTF-16LE (not counting a terminating
 * null) are needed to hold the null-terminated UTF-8 string @s.
 *
 * Each sequence contributes one code unit, except that a sequence whose lead
 * byte is 0xF0 or above contributes two (a surrogate pair) when all three of
 * its trailing bytes are present.
 *
 * Returns -1 with errno set to EILSEQ if a lead byte of 0xF5 or above is
 * encountered.
 *
 * Note: only lead bytes are inspected; trailing bytes are skipped without
 *       being validated.  Use this only where the input is validated
 *       separately.
 */
static int utf8_to_utf16_size(const char *s)
{
        const unsigned char *p = (const unsigned char *)s;
        size_t nunits = 0;
        unsigned int lead;

        while ((lead = *p++) != 0) {
                unsigned int ntrail;
                unsigned int k;

                nunits++;

                /* ASCII bytes and stray continuation bytes count as one unit
                 * each and have nothing further to skip. */
                if (lead < 0xc0)
                        continue;

                if (lead >= 0xF5) {
                        errno = EILSEQ;
                        return -1;
                }

                /* Number of trailing bytes implied by the lead byte. */
                ntrail = 1;
                if (lead >= 0xE0)
                        ntrail = 2;
                if (lead >= 0xF0)
                        ntrail = 3;

                /* Skip the trailing bytes, but never step past the string's
                 * terminating null if the sequence is truncated. */
                for (k = 0; k < ntrail; k++) {
                        if (*p == 0)
                                return nunits;
                        p++;
                }

                /* A complete 4-byte sequence needs a second code unit. */
                if (lead >= 0xF0)
                        nunits++;
        }
        return nunits;
}
133 #endif /* !WITH_NTFS_3G */
134
135 /* Converts a string in the UTF-16LE encoding to a newly allocated string in the
136  * UTF-8 encoding.
137  *
138  * If available, do so by calling a similar function from libntfs-3g.
139  * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
140  */
141 int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
142                   char **utf8_str_ret, size_t *utf8_nbytes_ret)
143 {
144         int ret;
145
146         if (utf16_nbytes == 0) {
147                 *utf8_str_ret = NULL;
148                 *utf8_nbytes_ret = 0;
149                 return 0;
150         }
151
152         if (utf16_nbytes & 1) {
153                 ERROR("UTF-16LE string is invalid (odd number of bytes)!");
154                 return WIMLIB_ERR_INVALID_UTF16_STRING;
155         }
156 #ifdef WITH_NTFS_3G
157         char *outs = NULL;
158         int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
159                                      utf16_nbytes / 2, &outs, 0);
160         if (outs_len >= 0) {
161                 *utf8_str_ret = outs;
162                 *utf8_nbytes_ret = outs_len;
163                 ret = 0;
164         } else {
165                 if (errno == ENOMEM)
166                         ret = WIMLIB_ERR_NOMEM;
167                 else
168                         ret = WIMLIB_ERR_INVALID_UTF16_STRING;
169         }
170 #else /* WITH_NTFS_3G */
171         static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
172         if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
173                 cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
174                 if (cd_utf16_to_utf8 == (iconv_t)-1) {
175                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
176                                          "for converting UTF-16LE to UTF-8");
177                         if (errno == ENOMEM)
178                                 return WIMLIB_ERR_NOMEM;
179                         else
180                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
181                 }
182         }
183         ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
184         if (ret >= 0) {
185                 size_t utf8_expected_nbytes;
186                 char  *utf8_str;
187                 size_t utf8_bytes_left;
188                 size_t utf16_bytes_left;
189                 size_t num_chars_converted;
190                 char  *utf8_str_save;
191                 const char *utf16_str_save;
192
193                 utf8_expected_nbytes = ret;
194                 utf8_str = MALLOC(utf8_expected_nbytes + 1);
195                 if (utf8_str) {
196                         utf8_bytes_left = utf8_expected_nbytes;
197                         utf16_bytes_left = utf16_nbytes;
198                         utf8_str_save = utf8_str;
199                         utf16_str_save = utf16_str;
200                         num_chars_converted = iconv(cd_utf16_to_utf8,
201                                                     (char**)&utf16_str,
202                                                     &utf16_bytes_left,
203                                                     &utf8_str,
204                                                     &utf8_bytes_left);
205                         utf8_str = utf8_str_save;
206                         utf16_str = utf16_str_save;
207                         if (utf16_bytes_left == 0 &&
208                             utf8_bytes_left == 0 &&
209                             num_chars_converted != (size_t)(-1))
210                         {
211                                 utf8_str[utf8_expected_nbytes] = '\0';
212                                 *utf8_str_ret = utf8_str;
213                                 *utf8_nbytes_ret = utf8_expected_nbytes;
214                                 ret = 0;
215                         } else {
216                                 FREE(utf8_str);
217                                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
218                         }
219                 } else
220                         ret = WIMLIB_ERR_NOMEM;
221         } else
222                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
223 #endif /* WITH_NTFS_3G */
224
225 #ifdef ENABLE_ERROR_MESSAGES
226         if (ret != 0) {
227                 ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
228                 ERROR("The failing string was:");
229                 print_string(utf16_str, utf16_nbytes);
230                 putchar('\n');
231         }
232 #endif /* ENABLE_ERROR_MESSAGES */
233         return ret;
234 }
235
236
237 /* Converts a string in the UTF-8 encoding to a newly allocated string in the
238  * UTF-16 encoding.
239  *
240  * If available, do so by calling a similar function from libntfs-3g.
241  * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
242  */
243 int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
244                   char **utf16_str_ret, size_t *utf16_nbytes_ret)
245 {
246         int ret;
247         if (utf8_nbytes == 0) {
248                 *utf16_str_ret = NULL;
249                 *utf16_nbytes_ret = 0;
250                 return 0;
251         }
252 #ifdef WITH_NTFS_3G
253         char *outs = NULL;
254         int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
255         if (outs_nchars >= 0) {
256                 *utf16_str_ret = outs;
257                 *utf16_nbytes_ret = (size_t)outs_nchars * 2;
258                 ret = 0;
259         } else {
260                 if (errno == ENOMEM)
261                         ret = WIMLIB_ERR_NOMEM;
262                 else
263                         ret = WIMLIB_ERR_INVALID_UTF8_STRING;
264         }
265 #else /* WITH_NTFS_3G */
266         static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
267         if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
268                 cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
269                 if (cd_utf8_to_utf16 == (iconv_t)-1) {
270                         ERROR_WITH_ERRNO("Failed to get conversion descriptor "
271                                          "for converting UTF-8 to UTF-16LE");
272                         if (errno == ENOMEM)
273                                 return WIMLIB_ERR_NOMEM;
274                         else
275                                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
276                 }
277         }
278
279         ret = utf8_to_utf16_size(utf8_str);
280         if (ret >= 0) {
281                 size_t utf16_expected_nbytes;
282                 char  *utf16_str;
283                 size_t utf16_bytes_left;
284                 size_t utf8_bytes_left;
285                 size_t num_chars_converted;
286                 const char *utf8_str_save;
287                 char  *utf16_str_save;
288
289                 utf16_expected_nbytes = (size_t)ret * 2;
290                 utf16_str = MALLOC(utf16_expected_nbytes + 2);
291                 if (utf16_str) {
292                         utf16_bytes_left = utf16_expected_nbytes;
293                         utf8_bytes_left = utf8_nbytes;
294                         utf8_str_save = utf8_str;
295                         utf16_str_save = utf16_str;
296                         num_chars_converted = iconv(cd_utf8_to_utf16,
297                                                     (char**)&utf8_str,
298                                                     &utf8_bytes_left,
299                                                     &utf16_str,
300                                                     &utf16_bytes_left);
301                         utf8_str = utf8_str_save;
302                         utf16_str = utf16_str_save;
303                         if (utf16_bytes_left == 0 &&
304                             utf8_bytes_left == 0 &&
305                             num_chars_converted != (size_t)(-1))
306                         {
307                                 utf16_str[utf16_expected_nbytes] = '\0';
308                                 utf16_str[utf16_expected_nbytes + 1] = '\0';
309                                 *utf16_str_ret = utf16_str;
310                                 *utf16_nbytes_ret = utf16_expected_nbytes;
311                                 ret = 0;
312                         } else {
313                                 FREE(utf16_str);
314                                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
315                         }
316                 } else
317                         ret = WIMLIB_ERR_NOMEM;
318         } else
319                 ret = WIMLIB_ERR_INVALID_UTF8_STRING;
320 #endif /* WITH_NTFS_3G */
321
322 #ifdef ENABLE_ERROR_MESSAGES
323         if (ret != 0) {
324                 ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
325                 ERROR("The failing string was:");
326                 print_string(utf8_str, utf8_nbytes);
327                 putchar('\n');
328                 ERROR("Length: %zu bytes", utf8_nbytes);
329         }
330 #endif /* ENABLE_ERROR_MESSAGES */
331         return ret;
332 }