Refactor headers
[wimlib] / src / encoding.c
1 /*
2  * encoding.c
3  */
4
5 /*
6  * Copyright (C) 2012, 2013 Eric Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #ifdef HAVE_CONFIG_H
25 #  include "config.h"
26 #endif
27
28 #include "wimlib.h"
29 #include "wimlib/encoding.h"
30 #include "wimlib/error.h"
31 #include "wimlib/list.h"
32 #include "wimlib/util.h"
33
34 #include <errno.h>
35 #include <iconv.h>
36 #include <pthread.h>
37 #include <stdlib.h>
38 #include <string.h>
39
40 bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE;
41
42 /* List of iconv_t conversion descriptors for a specific character conversion.
43  * The idea is that it is not thread-safe to have just one conversion
44  * descriptor, but it also is inefficient to open a new conversion descriptor to
45  * convert every string.  Both these problems can be solved by maintaining a
46  * list of conversion descriptors; then, a thread can use an existing conversion
47  * descriptor if available. */
48 struct iconv_list_head {
49         const char *from_encoding;
50         const char *to_encoding;
51         struct list_head list;
52         pthread_mutex_t mutex;
53 };
54
55 struct iconv_node {
56         iconv_t cd;
57         struct list_head list;
58         struct iconv_list_head *head;
59 };
60
/* Define an initially empty descriptor pool called 'name' that converts from
 * the encoding 'from' to the encoding 'to'.  A storage-class specifier such as
 * 'static' may be written in front of the macro invocation. */
#define ICONV_LIST(name, from, to)                      \
struct iconv_list_head name = {                         \
        .list = LIST_HEAD_INIT(name.list),              \
        .mutex = PTHREAD_MUTEX_INITIALIZER,             \
        .from_encoding = from,                          \
        .to_encoding = to,                              \
}
68
69 static iconv_t *
70 get_iconv(struct iconv_list_head *head)
71 {
72         iconv_t cd;
73         iconv_t *cd_p;
74         struct iconv_node *i;
75
76         pthread_mutex_lock(&head->mutex);
77         if (list_empty(&head->list)) {
78                 cd = iconv_open(head->to_encoding, head->from_encoding);
79                 if (cd == (iconv_t)-1) {
80                         ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
81                                          head->from_encoding, head->to_encoding);
82                         cd_p = NULL;
83                 } else {
84                         i = MALLOC(sizeof(struct iconv_node));
85                         if (i) {
86                                 i->head = head;
87                                 i->cd = cd;
88                                 cd_p = &i->cd;
89                         } else {
90                                 iconv_close(cd);
91                                 cd_p = NULL;
92                         }
93                 }
94         } else {
95                 i = container_of(head->list.next, struct iconv_node, list);
96                 list_del(head->list.next);
97                 cd_p = &i->cd;
98         }
99         pthread_mutex_unlock(&head->mutex);
100         return cd_p;
101 }
102
103 static void
104 put_iconv(iconv_t *cd)
105 {
106         int errno_save = errno;
107         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
108         struct iconv_list_head *head = i->head;
109
110         pthread_mutex_lock(&head->mutex);
111         list_add(&i->list, &head->list);
112         pthread_mutex_unlock(&head->mutex);
113         errno = errno_save;
114 }
115
/* Guard flag: set while a conversion-failure message is being printed, so that
 * a conversion error raised while printing that message does not print
 * another one.  The flag is shared by all threads rather than being
 * per-thread; this is tolerated because it only affects error messages.
 * Static storage starts out zeroed, i.e. false. */
static bool error_message_being_printed;
120
/*
 * DEFINE_CHAR_CONVERSION_FUNCTIONS() - generate the conversion functions for
 * one direction of one pair of encodings.
 *
 * The expansion defines:
 *
 *   iconv_<varname1>_to_<varname2>
 *      A static pool of iconv descriptors converting from the iconv encoding
 *      name longname1 to longname2.
 *
 *   <varname1>_to_<varname2>_nbytes(in, in_nbytes, out_nbytes_ret)
 *      Converts 'in' into a scratch buffer on the stack just to learn how many
 *      bytes the output occupies (not counting any terminator) and stores that
 *      count in *out_nbytes_ret.
 *
 *   <varname1>_to_<varname2>_buf(in, in_nbytes, out)
 *      Converts 'in' into the caller-provided buffer 'out' and appends one
 *      zero chartype2 element.  No output size is passed in, so the caller
 *      must supply a buffer that is large enough (e.g. sized from the
 *      _nbytes() result plus one chartype2).
 *
 *   <varname1>_to_<varname2>(in, in_nbytes, out_ret, out_nbytes_ret)
 *      Allocates a buffer with MALLOC(), converts into it, and returns the
 *      buffer and its length in bytes (terminator excluded).  The caller owns
 *      the returned buffer.
 *
 * All three functions return 0 on success.  On failure they return
 * WIMLIB_ERR_ICONV_NOT_AVAILABLE if no descriptor could be obtained,
 * err_return if iconv() failed, or (allocating variant only) WIMLIB_ERR_NOMEM.
 *
 * Macro parameters:
 *   varname1, varname2  name fragments for the source / destination encoding
 *   longname1, longname2  encoding names handed to iconv_open()
 *   chartype1, chartype2  element types of the input / output strings
 *   earlyreturn_on_utf8_locale
 *                       if true and wimlib_mbs_is_utf8 is set, the allocating
 *                       variant skips iconv: it first executes
 *                       earlyreturn_expr and, if that does not return, hands
 *                       back a terminated copy of the input
 *   earlyreturn_expr    statement executed first on that shortcut path (may
 *                       be empty, or may itself return)
 *   worst_case_len_expr number of chartype2 elements in the scratch buffer of
 *                       the _nbytes() variant; may refer to in_nbytes
 *   err_return          error code returned when iconv() fails
 *   err_msg             statement(s) that report an iconv() failure; may refer
 *                       to 'in'
 *   modifier            storage class of the generated functions, e.g. static
 *                       (may be empty)
 *
 * NOTE(review): the scratch buffer is a variable-length array whose size grows
 * with in_nbytes; very long inputs could exhaust the stack -- confirm that
 * callers bound the input length.
 */
#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
                                         varname2, longname2, chartype2,\
                                         earlyreturn_on_utf8_locale,    \
                                         earlyreturn_expr,              \
                                         worst_case_len_expr,           \
                                         err_return,                    \
                                         err_msg,                       \
                                         modifier)                      \
static ICONV_LIST(iconv_##varname1##_to_##varname2,                     \
                  longname1, longname2);                                \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
                                  size_t *out_nbytes_ret)               \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        /* Scratch buffer sized for the worst case; the extra element */\
        /* keeps the array length nonzero when in_nbytes == 0.        */\
        chartype2 buf[(worst_case_len_expr) + 1];                       \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)buf;                                      \
        size_t outbytesleft = sizeof(buf);                              \
        size_t len;                                                     \
        int ret;                                                        \
                                                                        \
        len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
        if (len == (size_t)-1) {                                        \
                /* Report the failure unless a report is in progress. */\
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Bytes written = capacity minus what is left over. */ \
                *out_nbytes_ret = sizeof(buf) - outbytesleft;           \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,   \
                               chartype2 *out)                          \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)out;                                      \
        /* The real capacity of 'out' is unknown here, so iconv() is */ \
        /* simply told that a very large amount of space remains.    */ \
        const size_t LARGE_NUMBER = 1000000000;                         \
        size_t outbytesleft = LARGE_NUMBER;                             \
        size_t len;                                                     \
        int ret;                                                        \
                                                                        \
        len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
        if (len == (size_t)-1) {                                        \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Terminate right after the last element written. */   \
                out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,         \
                         chartype2 **out_ret,                           \
                         size_t *out_nbytes_ret)                        \
{                                                                       \
        int ret;                                                        \
        chartype2 *out;                                                 \
        size_t out_nbytes;                                              \
                                                                        \
        if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) {         \
                earlyreturn_expr;                                       \
                /* Out same as in */                                    \
                out = MALLOC(in_nbytes + sizeof(chartype2));            \
                if (!out)                                               \
                        return WIMLIB_ERR_NOMEM;                        \
                memcpy(out, in, in_nbytes);                             \
                out[in_nbytes / sizeof(chartype2)] = 0;                 \
                *out_ret = out;                                         \
                *out_nbytes_ret = in_nbytes;                            \
                return 0;                                               \
        }                                                               \
                                                                        \
        /* First pass: learn the output size. */                        \
        ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,          \
                                                &out_nbytes);           \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        /* Room for the converted data plus one terminating element. */ \
        out = MALLOC(out_nbytes + sizeof(chartype2));                   \
        if (!out)                                                       \
                return WIMLIB_ERR_NOMEM;                                \
                                                                        \
        /* Second pass: convert into the allocated buffer. */           \
        ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);       \
        if (ret) {                                                      \
                FREE(out);                                              \
        } else {                                                        \
                *out_ret = out;                                         \
                *out_nbytes_ret = out_nbytes;                           \
        }                                                               \
        return ret;                                                     \
}
237
238 #if !TCHAR_IS_UTF16LE
239
240 /* UNIX */
241
242 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar,
243                                  utf16le, "UTF-16LE", utf16lechar,
244                                  false,
245                                  ,
246                                  in_nbytes * 2,
247                                  WIMLIB_ERR_INVALID_UTF8_STRING,
248                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
249                                                   "to UTF-16LE string!"),
250                                  static)
251
252 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
253                                  utf8, "UTF-8", tchar,
254                                  false,
255                                  ,
256                                  in_nbytes * 2,
257                                  WIMLIB_ERR_INVALID_UTF16_STRING,
258                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE string "
259                                                   "to UTF-8 string!"),
260                                  static)
261
262 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
263                                  utf16le, "UTF-16LE", utf16lechar,
264                                  true,
265                                  return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret),
266                                  in_nbytes * 2,
267                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
268                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
269                                                   "string \"%"TS"\" to UTF-16LE string!", in);
270                                  ERROR("If the data you provided was UTF-8, please make sure "
271                                        "the character encoding\n"
272                                        "        of your current locale is UTF-8."),
273                                  )
274
275 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
276                                  tstr, "", tchar,
277                                  true,
278                                  return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret),
279                                  in_nbytes * 2,
280                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
281                                  ERROR("Failed to convert UTF-16LE string to "
282                                        "multibyte string!");
283                                  ERROR("This may be because the UTF-16LE string "
284                                        "could not be represented\n"
285                                        "        in your locale's character encoding."),
286                                  )
287 #endif
288
289 /* tchar to UTF-8 and back */
290 #if TCHAR_IS_UTF16LE
291
292 /* Windows */
293 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar,
294                                  utf8, "UTF-8", char,
295                                  false,
296                                  ,
297                                  in_nbytes * 2,
298                                  WIMLIB_ERR_INVALID_UTF16_STRING,
299                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE "
300                                                   "string \"%"TS"\" to UTF-8 string!", in),
301                                  static)
302
303 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
304                                  tstr, "UTF-16LE", tchar,
305                                  false,
306                                  ,
307                                  in_nbytes * 2,
308                                  WIMLIB_ERR_INVALID_UTF8_STRING,
309                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
310                                                   "to UTF-16LE string!"),
311                                  static)
312 #else
313
314 /* UNIX */
315
316 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
317                                  utf8, "UTF-8", char,
318                                  true,
319                                  ,
320                                  in_nbytes * 4,
321                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
322                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
323                                                   "string \"%"TS"\" to UTF-8 string!", in);
324                                  ERROR("If the data you provided was UTF-8, please make sure "
325                                        "the character\n"
326                                        "        encoding of your current locale is UTF-8."),
327                                  static)
328
329 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
330                                  tstr, "", tchar,
331                                  true,
332                                  ,
333                                  in_nbytes * 4,
334                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
335                                  ERROR("Failed to convert UTF-8 string to "
336                                        "multibyte string!");
337                                  ERROR("This may be because the UTF-8 data "
338                                        "could not be represented\n"
339                                        "        in your locale's character encoding."),
340                                  static)
341 #endif
342
343 int
344 tstr_to_utf8_simple(const tchar *tstr, char **out)
345 {
346         size_t out_nbytes;
347         return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar),
348                             out, &out_nbytes);
349 }
350
351 int
352 utf8_to_tstr_simple(const char *utf8str, tchar **out)
353 {
354         size_t out_nbytes;
355         return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes);
356 }
357
358 static void
359 iconv_cleanup(struct iconv_list_head *head)
360 {
361         pthread_mutex_destroy(&head->mutex);
362         while (!list_empty(&head->list)) {
363                 struct iconv_node *i;
364
365                 i = container_of(head->list.next, struct iconv_node, list);
366                 list_del(&i->list);
367                 iconv_close(i->cd);
368                 FREE(i);
369         }
370 }
371
372 void
373 iconv_global_cleanup(void)
374 {
375         iconv_cleanup(&iconv_utf8_to_tstr);
376         iconv_cleanup(&iconv_tstr_to_utf8);
377 #if !TCHAR_IS_UTF16LE
378         iconv_cleanup(&iconv_utf16le_to_tstr);
379         iconv_cleanup(&iconv_tstr_to_utf16le);
380         iconv_cleanup(&iconv_utf16le_to_utf8);
381         iconv_cleanup(&iconv_utf8_to_utf16le);
382 #endif
383 }