822e9f53659ec780d654c4f9168094a2d5b03bcc
[wimlib] / src / encoding.c
1 /*
2  * encoding.c
3  */
4
5 /*
6  * Copyright (C) 2012, 2013 Eric Biggers
7  *
8  * This file is part of wimlib, a library for working with WIM files.
9  *
10  * wimlib is free software; you can redistribute it and/or modify it under the
11  * terms of the GNU General Public License as published by the Free
12  * Software Foundation; either version 3 of the License, or (at your option)
13  * any later version.
14  *
15  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
16  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
17  * A PARTICULAR PURPOSE. See the GNU General Public License for more
18  * details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with wimlib; if not, see http://www.gnu.org/licenses/.
22  */
23
24 #include "wimlib_internal.h"
25
26 #include <errno.h>
27 #include <iconv.h>
28 #include <pthread.h>
29 #include <stdlib.h>
30 #include <string.h>
31
/* Passed as the `earlyreturn' argument of the tstr <-> UTF-8 converters that
 * are instantiated below when TCHAR_IS_UTF16LE is 0: while it is true those
 * converters copy the input bytes unchanged instead of calling iconv.  Its
 * initial value is true exactly when tchar is not UTF-16LE.
 * NOTE(review): not static, so presumably updated from elsewhere once the
 * locale's encoding is known -- confirm against the callers. */
32 bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE;
33
34 /* List of iconv_t conversion descriptors for a specific character conversion.
35  * The idea is that it is not thread-safe to have just one conversion
36  * descriptor, but it also is inefficient to open a new conversion descriptor to
37  * convert every string.  Both these problems can be solved by maintaining a
38  * list of conversion descriptors; then, a thread can use an existing conversion
39  * descriptor if available. */
40 struct iconv_list_head {
        /* Encoding names given to iconv_open() (as its second and first
         * argument respectively) when get_iconv() has to open a descriptor. */
41         const char *from_encoding;
42         const char *to_encoding;
        /* Descriptors that are currently idle; entries are struct iconv_node
         * linked through their `list' member. */
43         struct list_head list;
        /* Locked by get_iconv() and put_iconv() around accesses to `list'. */
44         pthread_mutex_t mutex;
45 };
46
/* One pooled conversion descriptor.  get_iconv() returns the address of the
 * `cd' member and put_iconv() maps that address back to the enclosing node
 * with container_of(). */
47 struct iconv_node {
        /* The descriptor returned by iconv_open(). */
48         iconv_t cd;
        /* Link into head->list while the descriptor is idle. */
49         struct list_head list;
        /* Pool this node belongs to; put_iconv() re-inserts the node there. */
50         struct iconv_list_head *head;
51 };
52
/* Define and statically initialize a descriptor pool called `listvar' for the
 * conversion from iconv encoding `src_enc' to iconv encoding `dst_enc'.  The
 * pool starts out empty with an unlocked mutex.  A storage-class specifier
 * (e.g. `static') may be written in front of the macro invocation. */
#define ICONV_LIST(listvar, src_enc, dst_enc)           \
struct iconv_list_head listvar = {                      \
        .mutex = PTHREAD_MUTEX_INITIALIZER,             \
        .list = LIST_HEAD_INIT(listvar.list),           \
        .from_encoding = src_enc,                       \
        .to_encoding = dst_enc,                         \
}
60
61 static iconv_t *
62 get_iconv(struct iconv_list_head *head)
63 {
64         iconv_t cd;
65         iconv_t *cd_p;
66         struct iconv_node *i;
67
68         pthread_mutex_lock(&head->mutex);
69         if (list_empty(&head->list)) {
70                 cd = iconv_open(head->to_encoding, head->from_encoding);
71                 if (cd == (iconv_t)-1) {
72                         ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
73                                          head->from_encoding, head->to_encoding);
74                         cd_p = NULL;
75                 } else {
76                         i = MALLOC(sizeof(struct iconv_node));
77                         if (i) {
78                                 i->head = head;
79                                 i->cd = cd;
80                                 cd_p = &i->cd;
81                         } else {
82                                 iconv_close(cd);
83                                 cd_p = NULL;
84                         }
85                 }
86         } else {
87                 i = container_of(head->list.next, struct iconv_node, list);
88                 list_del(head->list.next);
89                 cd_p = &i->cd;
90         }
91         pthread_mutex_unlock(&head->mutex);
92         return cd_p;
93 }
94
95 static void
96 put_iconv(iconv_t *cd)
97 {
98         int errno_save = errno;
99         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
100         struct iconv_list_head *head = i->head;
101
102         pthread_mutex_lock(&head->mutex);
103         list_add(&i->list, &head->list);
104         pthread_mutex_unlock(&head->mutex);
105         errno = errno_save;
106 }
107
108 /* Prevent printing an error message if a character conversion error occurs
109  * while printing an error message.  (This variable is not per-thread but it
110  * doesn't matter too much since it's just the error messages.)
 *
 * The generated conversion functions set this flag immediately before running
 * their err_msg statements and clear it afterwards; a conversion that fails
 * while the flag is set returns its error code without printing anything.
 * Because the flag is a plain bool shared by all threads, a failure in one
 * thread can suppress the message of a concurrent failure in another. */
111 static bool error_message_being_printed = false;
112
/*
 * Generate a family of iconv-based conversion functions from encoding 1
 * (strings of chartype1, iconv name longname1) to encoding 2 (strings of
 * chartype2, iconv name longname2), plus the descriptor pool they share:
 *
 *   <varname1>_to_<varname2>_nbytes(in, in_nbytes, out_nbytes_ret)
 *       Convert into a temporary buffer and store the size of the converted
 *       string in bytes (terminator not included) in *out_nbytes_ret.
 *
 *   <varname1>_to_<varname2>_buf(in, in_nbytes, out)
 *       Convert into the caller's buffer and append one chartype2 zero.  The
 *       capacity of `out' is NOT checked; the caller must have sized it for
 *       the converted string plus the terminator.
 *
 *   <varname1>_to_<varname2>(in, in_nbytes, out_ret, out_nbytes_ret)
 *       Return the converted, zero-terminated string in a buffer obtained
 *       from MALLOC.  If `earlyreturn' evaluates true the input is copied
 *       unchanged and no conversion takes place.
 *
 * in_nbytes is always a byte count.  worst_case_len_expr is an expression in
 * in_nbytes bounding the output length in chartype2 elements.  err_msg is
 * one or more statements (they may refer to `in') executed when iconv()
 * fails, unless error_message_being_printed is already set; err_return is
 * the value returned in that case.  `modifier' is placed in front of each
 * function's return type (e.g. `static', or empty for external linkage).
 *
 * Every function returns 0 on success.  WIMLIB_ERR_ICONV_NOT_AVAILABLE means
 * no conversion descriptor could be obtained and WIMLIB_ERR_NOMEM means an
 * allocation failed.
 */
#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
                                         varname2, longname2, chartype2,\
                                         earlyreturn,                   \
                                         worst_case_len_expr,           \
                                         err_return,                    \
                                         err_msg,                       \
                                         modifier)                      \
static ICONV_LIST(iconv_##varname1##_to_##varname2,                     \
                  longname1, longname2);                                \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
                                  size_t *out_nbytes_ret)               \
{                                                                       \
        size_t scratch_len = (worst_case_len_expr);                     \
        size_t scratch_nbytes;                                          \
        chartype2 *scratch;                                             \
        iconv_t *cd;                                                    \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf;                                                   \
        size_t outbytesleft;                                            \
        int errno_save;                                                 \
        int ret;                                                        \
                                                                        \
        /* The worst-case buffer is heap-allocated: as a variable       \
         * length array it would have zero elements (undefined          \
         * behaviour) for empty input and could exhaust the stack       \
         * for long input. */                                           \
        if (scratch_len == 0)                                           \
                scratch_len = 1;                                        \
        if (scratch_len > (size_t)-1 / sizeof(chartype2))               \
                return WIMLIB_ERR_NOMEM;                                \
        scratch_nbytes = scratch_len * sizeof(chartype2);               \
        scratch = MALLOC(scratch_nbytes);                               \
        if (scratch == NULL)                                            \
                return WIMLIB_ERR_NOMEM;                                \
                                                                        \
        cd = get_iconv(&iconv_##varname1##_to_##varname2);              \
        if (cd == NULL) {                                               \
                FREE(scratch);                                          \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
        }                                                               \
                                                                        \
        outbuf = (char*)scratch;                                        \
        outbytesleft = scratch_nbytes;                                  \
        if (iconv(*cd, &inbuf, &inbytesleft,                            \
                  &outbuf, &outbytesleft) == (size_t)-1) {              \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                *out_nbytes_ret = scratch_nbytes - outbytesleft;        \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        /* Keep the errno set by iconv() visible to the caller. */      \
        errno_save = errno;                                             \
        FREE(scratch);                                                  \
        errno = errno_save;                                             \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,   \
                               chartype2 *out)                          \
{                                                                       \
        /* The real capacity of `out' is unknown here, so iconv() is    \
         * simply told that plenty of room is available. */             \
        const size_t LARGE_NUMBER = 1000000000;                         \
        iconv_t *cd;                                                    \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)out;                                      \
        size_t outbytesleft = LARGE_NUMBER;                             \
        int ret;                                                        \
                                                                        \
        cd = get_iconv(&iconv_##varname1##_to_##varname2);              \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        if (iconv(*cd, &inbuf, &inbytesleft,                            \
                  &outbuf, &outbytesleft) == (size_t)-1) {              \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Terminate right after the last converted element. */ \
                out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        return ret;                                                     \
}                                                                       \
                                                                        \
modifier int                                                            \
varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,         \
                         chartype2 **out_ret,                           \
                         size_t *out_nbytes_ret)                        \
{                                                                       \
        int ret;                                                        \
        chartype2 *out;                                                 \
        size_t out_nbytes;                                              \
                                                                        \
        if (earlyreturn) {                                              \
                /* Out same as in */                                    \
                out = MALLOC(in_nbytes + sizeof(chartype2));            \
                if (!out)                                               \
                        return WIMLIB_ERR_NOMEM;                        \
                memcpy(out, in, in_nbytes);                             \
                out[in_nbytes / sizeof(chartype2)] = 0;                 \
                *out_ret = out;                                         \
                *out_nbytes_ret = in_nbytes;                            \
                return 0;                                               \
        }                                                               \
                                                                        \
        /* First pass: learn the exact size of the converted string. */ \
        ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,          \
                                                &out_nbytes);           \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        out = MALLOC(out_nbytes + sizeof(chartype2));                   \
        if (!out)                                                       \
                return WIMLIB_ERR_NOMEM;                                \
                                                                        \
        /* Second pass: convert into the exactly-sized buffer. */       \
        ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);       \
        if (ret) {                                                      \
                int errno_save = errno;                                 \
                FREE(out);                                              \
                errno = errno_save;                                     \
        } else {                                                        \
                *out_ret = out;                                         \
                *out_nbytes_ret = out_nbytes;                           \
        }                                                               \
        return ret;                                                     \
}
229
/* Converters between tchar strings (iconv encoding name "", which the error
 * text below refers to as the locale's character encoding) and UTF-16LE.
 * They are only generated when tchar itself is not UTF-16LE.  Each invocation
 * defines X_to_Y_nbytes(), X_to_Y_buf() and X_to_Y(); the empty last argument
 * leaves them without `static', i.e. with external linkage.  earlyreturn is
 * `false', so iconv is always used. */
230 #if !TCHAR_IS_UTF16LE
231 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
232                                  utf16le, "UTF-16LE", utf16lechar,
233                                  false,
                                 /* worst-case output length, counted in
                                  * utf16lechar elements */
234                                  in_nbytes * 4,
235                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
236                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
237                                                   "string \"%"TS"\" to UTF-16LE string!", in);
238                                  ERROR("If the data you provided was UTF-8, please make sure "
239                                        "the character encoding of your current locale is UTF-8."),
240                                  )
241
242 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
243                                  tstr, "", tchar,
244                                  false,
                                 /* worst-case output length, counted in
                                  * tchar elements */
245                                  in_nbytes * 2,
246                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
247                                  ERROR("Failed to convert UTF-16LE string to "
248                                        "multibyte string!");
249                                  ERROR("This may be because the UTF-16LE string "
250                                        "could not be represented in your "
251                                        "locale's character encoding."),
252                                  )
253 #endif
254
255 /* tchar to UTF-8 and back */
/* Both variants generate static tstr_to_utf8*() and utf8_to_tstr*() functions,
 * which tstr_to_utf8_simple() and utf8_to_tstr_simple() below call.
 *  - TCHAR_IS_UTF16LE: tchar strings are converted as "UTF-16LE" and iconv is
 *    always used (earlyreturn is `false').
 *  - otherwise: tchar strings use iconv encoding name "" and
 *    wimlib_mbs_is_utf8 is the earlyreturn flag, so while it is true the input
 *    is copied unchanged instead of being passed through iconv. */
256 #if TCHAR_IS_UTF16LE
257 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar,
258                                  utf8, "UTF-8", char,
259                                  false,
260                                  in_nbytes * 2,
261                                  WIMLIB_ERR_INVALID_UTF16_STRING,
262                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE "
263                                                   "string \"%"TS"\" to UTF-8 string!", in),
264                                  static)
265
266 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
267                                  tstr, "UTF-16LE", tchar,
268                                  false,
269                                  in_nbytes * 2,
270                                  WIMLIB_ERR_INVALID_UTF8_STRING,
271                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
272                                                   "to UTF-16LE string!"),
273                                  static)
274 #else
275 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
276                                  utf8, "UTF-8", char,
277                                  wimlib_mbs_is_utf8,
278                                  in_nbytes * 4,
279                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
280                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
281                                                   "string \"%"TS"\" to UTF-8 string!", in);
282                                  ERROR("If the data you provided was UTF-8, please make sure "
283                                        "the character encoding of your current locale is UTF-8."),
284                                  static)
285
286 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
287                                  tstr, "", tchar,
288                                  wimlib_mbs_is_utf8,
289                                  in_nbytes * 4,
290                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
291                                  ERROR("Failed to convert UTF-8 string to "
292                                        "multibyte string!");
293                                  ERROR("This may be because the UTF-8 data "
294                                        "could not be represented in your "
295                                        "locale's character encoding."),
296                                  static)
297 #endif
298
299 int
300 tstr_to_utf8_simple(const tchar *tstr, char **out)
301 {
302         size_t out_nbytes;
303         return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar),
304                             out, &out_nbytes);
305 }
306
307 int
308 utf8_to_tstr_simple(const char *utf8str, tchar **out)
309 {
310         size_t out_nbytes;
311         return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes);
312 }
313
314 static void
315 iconv_cleanup(struct iconv_list_head *head)
316 {
317         pthread_mutex_destroy(&head->mutex);
318         while (!list_empty(&head->list)) {
319                 struct iconv_node *i;
320
321                 i = container_of(head->list.next, struct iconv_node, list);
322                 list_del(&i->list);
323                 iconv_close(i->cd);
324                 FREE(i);
325         }
326 }
327
328 void
329 iconv_global_cleanup()
330 {
331         iconv_cleanup(&iconv_utf8_to_tstr);
332         iconv_cleanup(&iconv_tstr_to_utf8);
333 #if !TCHAR_IS_UTF16LE
334         iconv_cleanup(&iconv_utf16le_to_tstr);
335         iconv_cleanup(&iconv_tstr_to_utf16le);
336 #endif
337 }