implement WIMLIB_INIT_FLAG_ASSUME_UTF8
[wimlib] / src / encoding.c
1 /*
2  * encoding.c
3  */
4
5 /*
6  * Copyright (C) 2012, 2013 Eric Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #include "wimlib_internal.h"
25
26 #include <errno.h>
27 #include <iconv.h>
28 #include <pthread.h>
29 #include <stdlib.h>
30 #include <string.h>
31
/* When true, the conversion functions in this file that were generated with
 * earlyreturn_on_utf8_locale set do not run an iconv conversion to or from
 * the "" encoding; they treat tchar strings as UTF-8 instead.
 *
 * The initial value is true exactly when tchar is not UTF-16LE.
 * NOTE(review): the variable is not static, so it is presumably assigned from
 * elsewhere in the library as well -- confirm against the callers. */
#if TCHAR_IS_UTF16LE
bool wimlib_mbs_is_utf8 = false;
#else
bool wimlib_mbs_is_utf8 = true;
#endif
33
34 /* List of iconv_t conversion descriptors for a specific character conversion.
35  * The idea is that it is not thread-safe to have just one conversion
36  * descriptor, but it also is inefficient to open a new conversion descriptor to
37  * convert every string.  Both these problems can be solved by maintaining a
38  * list of conversion descriptors; then, a thread can use an existing conversion
39  * descriptor if available. */
40 struct iconv_list_head {
41         const char *from_encoding;
42         const char *to_encoding;
43         struct list_head list;
44         pthread_mutex_t mutex;
45 };
46
47 struct iconv_node {
48         iconv_t cd;
49         struct list_head list;
50         struct iconv_list_head *head;
51 };
52
/* Define a struct iconv_list_head variable called 'listvar' that describes
 * conversions from encoding 'from_enc' to encoding 'to_enc'.  The pool starts
 * out with an empty list and a statically initialized mutex.  Any storage
 * class (e.g. 'static') is written by the user in front of the macro. */
#define ICONV_LIST(listvar, from_enc, to_enc)           \
struct iconv_list_head listvar = {                      \
        .from_encoding = from_enc,                      \
        .to_encoding = to_enc,                          \
        .list = LIST_HEAD_INIT(listvar.list),           \
        .mutex = PTHREAD_MUTEX_INITIALIZER,             \
}
60
61 static iconv_t *
62 get_iconv(struct iconv_list_head *head)
63 {
64         iconv_t cd;
65         iconv_t *cd_p;
66         struct iconv_node *i;
67
68         pthread_mutex_lock(&head->mutex);
69         if (list_empty(&head->list)) {
70                 cd = iconv_open(head->to_encoding, head->from_encoding);
71                 if (cd == (iconv_t)-1) {
72                         ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
73                                          head->from_encoding, head->to_encoding);
74                         cd_p = NULL;
75                 } else {
76                         i = MALLOC(sizeof(struct iconv_node));
77                         if (i) {
78                                 i->head = head;
79                                 i->cd = cd;
80                                 cd_p = &i->cd;
81                         } else {
82                                 iconv_close(cd);
83                                 cd_p = NULL;
84                         }
85                 }
86         } else {
87                 i = container_of(head->list.next, struct iconv_node, list);
88                 list_del(head->list.next);
89                 cd_p = &i->cd;
90         }
91         pthread_mutex_unlock(&head->mutex);
92         return cd_p;
93 }
94
95 static void
96 put_iconv(iconv_t *cd)
97 {
98         int errno_save = errno;
99         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
100         struct iconv_list_head *head = i->head;
101
102         pthread_mutex_lock(&head->mutex);
103         list_add(&i->list, &head->list);
104         pthread_mutex_unlock(&head->mutex);
105         errno = errno_save;
106 }
107
/* Prevent printing an error message if a character conversion error occurs
 * while printing an error message.  (This variable is not per-thread, but
 * that does not matter much since it only affects the error messages.) */
static bool error_message_being_printed = false;

/* Size in bytes of the on-stack scratch buffer that the generated *_nbytes()
 * functions convert into while measuring the output length.  The input is
 * converted one buffer-full at a time, so stack usage does not depend on the
 * length of the string being measured. */
enum { ICONV_SCRATCH_NBYTES = 4096 };

/* Put @cd back into its initial conversion state and return it to its pool.
 * Resetting means that state left behind by a failed or partial conversion
 * cannot affect the next string converted with the same descriptor.  errno is
 * preserved. */
static void
reset_and_put_iconv(iconv_t *cd)
{
        int errno_save = errno;

        iconv(*cd, NULL, NULL, NULL, NULL);
        errno = errno_save;
        put_iconv(cd);
}

/*
 * Generate a descriptor pool plus three conversion functions from encoding 1
 * to encoding 2.
 *
 * Macro parameters:
 *   varname1, varname2   Short names; used to build the identifiers
 *                        iconv_<varname1>_to_<varname2> (the pool) and
 *                        <varname1>_to_<varname2>[_nbytes|_buf].
 *   longname1, longname2 Encoding names passed to iconv_open().
 *   chartype1, chartype2 Character types of the input and output strings.
 *   earlyreturn_on_utf8_locale
 *                        If true and wimlib_mbs_is_utf8 is set,
 *                        <varname1>_to_<varname2>() executes
 *                        earlyreturn_expr and, if that did not return,
 *                        outputs a null-terminated copy of the input.
 *   earlyreturn_expr     Statement executed on that path (may be empty).
 *   worst_case_len_expr  Accepted so existing invocations keep compiling;
 *                        no longer used, since the output length is now
 *                        measured with a fixed-size scratch buffer.
 *   err_return           Error code returned when iconv() fails.
 *   err_msg              Statement(s) executed to report such a failure.
 *   modifier             Tokens placed in front of each generated function
 *                        (e.g. 'static', or nothing).
 *
 * Generated functions (all return 0 on success):
 *   <v1>_to_<v2>_nbytes(in, in_nbytes, out_nbytes_ret)
 *       Store in *out_nbytes_ret the number of bytes the converted string
 *       occupies, not counting a null terminator.
 *   <v1>_to_<v2>_buf(in, in_nbytes, out)
 *       Convert into @out and append a null chartype2.  The capacity of @out
 *       is not passed in, so it must be large enough for the converted string
 *       plus the terminator (<v1>_to_<v2>() sizes it with the _nbytes
 *       function).
 *   <v1>_to_<v2>(in, in_nbytes, out_ret, out_nbytes_ret)
 *       Convert into a buffer allocated with MALLOC(); on success store the
 *       buffer in *out_ret and its length in bytes, excluding the terminator,
 *       in *out_nbytes_ret.
 *
 * Errors: WIMLIB_ERR_ICONV_NOT_AVAILABLE if no descriptor could be obtained,
 * err_return if the conversion failed, WIMLIB_ERR_NOMEM if allocation failed.
 */
#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
                                         varname2, longname2, chartype2,\
                                         earlyreturn_on_utf8_locale,    \
                                         earlyreturn_expr,              \
                                         worst_case_len_expr,           \
                                         err_return,                    \
                                         err_msg,                       \
                                         modifier)                      \
static ICONV_LIST(iconv_##varname1##_to_##varname2,                     \
                  longname1, longname2);                                \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
                                  size_t *out_nbytes_ret)               \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        /* Bounded scratch space: only the output length is needed. */  \
        char scratch[ICONV_SCRATCH_NBYTES];                             \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        size_t total_nbytes = 0;                                        \
        int ret = 0;                                                    \
                                                                        \
        while (inbytesleft != 0) {                                      \
                char *outbuf = scratch;                                 \
                size_t outbytesleft = sizeof(scratch);                  \
                size_t len = iconv(*cd, &inbuf, &inbytesleft,           \
                                   &outbuf, &outbytesleft);             \
                                                                        \
                total_nbytes += sizeof(scratch) - outbytesleft;         \
                /* E2BIG just means the scratch buffer filled up,       \
                 * so keep converting -- unless nothing at all was      \
                 * written, which would otherwise loop forever. */      \
                if (len == (size_t)-1 &&                                \
                    (errno != E2BIG ||                                  \
                     outbytesleft == sizeof(scratch))) {                \
                        if (!error_message_being_printed) {             \
                                error_message_being_printed = true;     \
                                err_msg;                                \
                                error_message_being_printed = false;    \
                        }                                               \
                        ret = err_return;                               \
                        break;                                          \
                }                                                       \
        }                                                               \
        if (ret == 0)                                                   \
                *out_nbytes_ret = total_nbytes;                         \
        reset_and_put_iconv(cd);                                        \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,   \
                               chartype2 *out)                          \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)out;                                      \
        /* Capacity of 'out' is unknown here; see contract above. */    \
        const size_t LARGE_NUMBER = 1000000000;                         \
        size_t outbytesleft = LARGE_NUMBER;                             \
        size_t len;                                                     \
        int ret;                                                        \
                                                                        \
        len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
        if (len == (size_t)-1) {                                        \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Null-terminate right after the converted data. */    \
                out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
                ret = 0;                                                \
        }                                                               \
        reset_and_put_iconv(cd);                                        \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,         \
                         chartype2 **out_ret,                           \
                         size_t *out_nbytes_ret)                        \
{                                                                       \
        int ret;                                                        \
        chartype2 *out;                                                 \
        size_t out_nbytes;                                              \
                                                                        \
        if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) {         \
                earlyreturn_expr;                                       \
                /* Still here: the output is the same as the input. */  \
                out = MALLOC(in_nbytes + sizeof(chartype2));            \
                if (!out)                                               \
                        return WIMLIB_ERR_NOMEM;                        \
                memcpy(out, in, in_nbytes);                             \
                out[in_nbytes / sizeof(chartype2)] = 0;                 \
                *out_ret = out;                                         \
                *out_nbytes_ret = in_nbytes;                            \
                return 0;                                               \
        }                                                               \
                                                                        \
        /* Pass 1: measure.  Pass 2: convert into an exact-size buffer  \
         * with room for the null terminator. */                        \
        ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,          \
                                                &out_nbytes);           \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        out = MALLOC(out_nbytes + sizeof(chartype2));                   \
        if (!out)                                                       \
                return WIMLIB_ERR_NOMEM;                                \
                                                                        \
        ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);       \
        if (ret) {                                                      \
                FREE(out);                                              \
        } else {                                                        \
                *out_ret = out;                                         \
                *out_nbytes_ret = out_nbytes;                           \
        }                                                               \
        return ret;                                                     \
}
229
230 #if !TCHAR_IS_UTF16LE
231
232 /* UNIX */
233
234 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar,
235                                  utf16le, "UTF-16LE", utf16lechar,
236                                  false,
237                                  ,
238                                  in_nbytes * 2,
239                                  WIMLIB_ERR_INVALID_UTF8_STRING,
240                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
241                                                   "to UTF-16LE string!"),
242                                  static)
243
244 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
245                                  utf8, "UTF-8", tchar,
246                                  false,
247                                  ,
248                                  in_nbytes * 2,
249                                  WIMLIB_ERR_INVALID_UTF16_STRING,
250                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE string "
251                                                   "to UTF-8 string!"),
252                                  static)
253
254 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
255                                  utf16le, "UTF-16LE", utf16lechar,
256                                  true,
257                                  return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret),
258                                  in_nbytes * 2,
259                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
260                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
261                                                   "string \"%"TS"\" to UTF-16LE string!", in);
262                                  ERROR("If the data you provided was UTF-8, please make sure "
263                                        "the character encoding\n"
264                                        "        of your current locale is UTF-8."),
265                                  )
266
267 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
268                                  tstr, "", tchar,
269                                  true,
270                                  return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret),
271                                  in_nbytes * 2,
272                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
273                                  ERROR("Failed to convert UTF-16LE string to "
274                                        "multibyte string!");
275                                  ERROR("This may be because the UTF-16LE string "
276                                        "could not be represented\n"
277                                        "        in your locale's character encoding."),
278                                  )
279 #endif
280
281 /* tchar to UTF-8 and back */
282 #if TCHAR_IS_UTF16LE
283
284 /* Windows */
285 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar,
286                                  utf8, "UTF-8", char,
287                                  false,
288                                  ,
289                                  in_nbytes * 2,
290                                  WIMLIB_ERR_INVALID_UTF16_STRING,
291                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE "
292                                                   "string \"%"TS"\" to UTF-8 string!", in),
293                                  static)
294
295 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
296                                  tstr, "UTF-16LE", tchar,
297                                  false,
298                                  ,
299                                  in_nbytes * 2,
300                                  WIMLIB_ERR_INVALID_UTF8_STRING,
301                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
302                                                   "to UTF-16LE string!"),
303                                  static)
304 #else
305
306 /* UNIX */
307
308 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
309                                  utf8, "UTF-8", char,
310                                  true,
311                                  ,
312                                  in_nbytes * 4,
313                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
314                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
315                                                   "string \"%"TS"\" to UTF-8 string!", in);
316                                  ERROR("If the data you provided was UTF-8, please make sure "
317                                        "the character\n"
318                                        "        encoding of your current locale is UTF-8."),
319                                  static)
320
321 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
322                                  tstr, "", tchar,
323                                  true,
324                                  ,
325                                  in_nbytes * 4,
326                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
327                                  ERROR("Failed to convert UTF-8 string to "
328                                        "multibyte string!");
329                                  ERROR("This may be because the UTF-8 data "
330                                        "could not be represented\n"
331                                        "        in your locale's character encoding."),
332                                  static)
333 #endif
334
335 int
336 tstr_to_utf8_simple(const tchar *tstr, char **out)
337 {
338         size_t out_nbytes;
339         return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar),
340                             out, &out_nbytes);
341 }
342
343 int
344 utf8_to_tstr_simple(const char *utf8str, tchar **out)
345 {
346         size_t out_nbytes;
347         return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes);
348 }
349
350 static void
351 iconv_cleanup(struct iconv_list_head *head)
352 {
353         pthread_mutex_destroy(&head->mutex);
354         while (!list_empty(&head->list)) {
355                 struct iconv_node *i;
356
357                 i = container_of(head->list.next, struct iconv_node, list);
358                 list_del(&i->list);
359                 iconv_close(i->cd);
360                 FREE(i);
361         }
362 }
363
364 void
365 iconv_global_cleanup()
366 {
367         iconv_cleanup(&iconv_utf8_to_tstr);
368         iconv_cleanup(&iconv_tstr_to_utf8);
369 #if !TCHAR_IS_UTF16LE
370         iconv_cleanup(&iconv_utf16le_to_tstr);
371         iconv_cleanup(&iconv_tstr_to_utf16le);
372 #endif
373 }