Fix glob()
[wimlib] / src / encoding.c
1 /*
2  * encoding.c:  Convert "multibyte" strings (the locale-default encoding---
3  * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice
4  * versa.  Also, convert UTF-8 strings to multibyte strings.
5  */
6
7 /*
8  * Copyright (C) 2012, 2013 Eric Biggers
9  *
10  * This file is part of wimlib, a library for working with WIM files.
11  *
12  * wimlib is free software; you can redistribute it and/or modify it under the
13  * terms of the GNU General Public License as published by the Free
14  * Software Foundation; either version 3 of the License, or (at your option)
15  * any later version.
16  *
17  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19  * A PARTICULAR PURPOSE. See the GNU General Public License for more
20  * details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with wimlib; if not, see http://www.gnu.org/licenses/.
24  */
25
26 #include "wimlib_internal.h"
27
28 #include <errno.h>
29 #include <iconv.h>
30 #include <pthread.h>
31 #include <stdlib.h>
32
/* Flag recording whether the "multibyte" (locale-default) encoding is UTF-8.
 * It is initialized to true and is not modified by any code visible in this
 * file.
 * NOTE(review): presumably it is updated elsewhere once the locale is known
 * and consulted to skip needless conversions --- confirm against the callers. */
bool wimlib_mbs_is_utf8 = true;
34
/* List of iconv_t conversion descriptors for a specific character conversion.
 * The idea is that it is not thread-safe to have just one conversion
 * descriptor, but it also is inefficient to open a new conversion descriptor to
 * convert every string.  Both these problems can be solved by maintaining a
 * list of conversion descriptors; then, a thread can use an existing conversion
 * descriptor if available.
 *
 * get_iconv() takes a descriptor off the list (opening a new one when the list
 * is empty) and put_iconv() puts it back on the list. */
struct iconv_list_head {
        /* Name of the source encoding; passed as the second argument of
         * iconv_open(). */
        const char *from_encoding;

        /* Name of the destination encoding; passed as the first argument of
         * iconv_open(). */
        const char *to_encoding;

        /* Head of the list of currently unused `struct iconv_node's, linked
         * through their `list' members. */
        struct list_head list;

        /* Held by get_iconv() and put_iconv() while they examine or modify
         * `list'. */
        pthread_mutex_t mutex;
};
47
/* One conversion descriptor belonging to a `struct iconv_list_head'.
 * get_iconv() hands out a pointer to the `cd' member, and put_iconv() recovers
 * the enclosing node from that pointer with container_of(). */
struct iconv_node {
        /* The open conversion descriptor itself. */
        iconv_t cd;

        /* Links this node into head->list while the descriptor is unused. */
        struct list_head list;

        /* The list this descriptor was created for, so that put_iconv() knows
         * where to return it. */
        struct iconv_list_head *head;
};
53
/* Define an initially empty list of conversion descriptors named `name' for
 * converting from the encoding `from' to the encoding `to'.  A storage-class
 * specifier such as `static' may be written in front of the invocation, and
 * the invocation must be followed by a semicolon. */
#define ICONV_LIST(name, from, to)                      \
struct iconv_list_head name = {                         \
        .list = LIST_HEAD_INIT(name.list),              \
        .mutex = PTHREAD_MUTEX_INITIALIZER,             \
        .from_encoding = from,                          \
        .to_encoding = to,                              \
}
61
62 static iconv_t *
63 get_iconv(struct iconv_list_head *head)
64 {
65         iconv_t cd;
66         iconv_t *cd_p;
67         struct iconv_node *i;
68
69         pthread_mutex_lock(&head->mutex);
70         if (list_empty(&head->list)) {
71                 cd = iconv_open(head->to_encoding, head->from_encoding);
72                 if (cd == (iconv_t)-1) {
73                         ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
74                                          head->from_encoding, head->to_encoding);
75                         cd_p = NULL;
76                 } else {
77                         i = MALLOC(sizeof(struct iconv_node));
78                         if (i) {
79                                 i->head = head;
80                                 i->cd = cd;
81                                 cd_p = &i->cd;
82                         } else {
83                                 iconv_close(cd);
84                                 cd_p = NULL;
85                         }
86                 }
87         } else {
88                 i = container_of(head->list.next, struct iconv_node, list);
89                 list_del(head->list.next);
90                 cd_p = &i->cd;
91         }
92         pthread_mutex_unlock(&head->mutex);
93         return cd_p;
94 }
95
96 static void
97 put_iconv(iconv_t *cd)
98 {
99         int errno_save = errno;
100         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
101         struct iconv_list_head *head = i->head;
102         
103         pthread_mutex_lock(&head->mutex);
104         list_add(&i->list, &head->list);
105         pthread_mutex_unlock(&head->mutex);
106         errno = errno_save;
107 }
108
/* Prevent printing an error message if a character conversion error occurs
 * while printing an error message.  The functions generated by
 * DEFINE_CHAR_CONVERSION_FUNCTIONS() set this flag while their error-reporting
 * statements run and skip those statements when it is already set.  (This
 * variable is not per-thread but it doesn't matter too much since it's just
 * the error messages.) */
static bool error_message_being_printed = false;
113
/*
 * Define the list of conversion descriptors and the three functions that
 * implement one direction of character conversion:
 *
 *   int <varname1>_to_<varname2>_nbytes(in, in_nbytes, out_nbytes_ret)
 *      Convert the @in_nbytes bytes at @in into a temporary stack buffer and
 *      store the number of bytes produced (no null terminator is counted) in
 *      *@out_nbytes_ret.
 *
 *   int <varname1>_to_<varname2>_buf(in, in_nbytes, out)
 *      Convert into the caller-provided buffer @out and append a null
 *      terminator of type chartype2.  The capacity of @out is NOT checked; it
 *      must hold the size reported by the _nbytes() function plus one
 *      chartype2.
 *
 *   int <varname1>_to_<varname2>(in, in_nbytes, out_ret, out_nbytes_ret)
 *      Convert into a buffer obtained from MALLOC().  On success the buffer
 *      (null-terminated) is stored in *@out_ret and its length in bytes,
 *      excluding the terminator, in *@out_nbytes_ret.  The function does not
 *      free the buffer on success; presumably the caller releases it with
 *      FREE().
 *
 * Each function returns 0 on success, WIMLIB_ERR_ICONV_NOT_AVAILABLE if no
 * conversion descriptor could be obtained, or @err_return if iconv() fails.
 * The allocating variant can also return WIMLIB_ERR_NOMEM.
 *
 * Macro parameters:
 *   varname1, varname2    identifier fragments naming the source and
 *                         destination encodings in the generated names
 *   longname1, longname2  encoding names given to iconv_open()
 *   chartype1, chartype2  character types of the input and output strings
 *   worst_case_len_expr   expression evaluated inside the _nbytes() function
 *                         (it may refer to in_nbytes) giving an upper bound
 *                         on the number of chartype2 elements of output
 *   err_return            value returned when iconv() fails
 *   err_msg               statement(s) run to report an iconv() failure; they
 *                         may refer to the parameter `in'
 */
#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
                                         varname2, longname2, chartype2,\
                                         worst_case_len_expr,           \
                                         err_return,                    \
                                         err_msg)                       \
static ICONV_LIST(iconv_##varname1##_to_##varname2,                     \
                  longname1, longname2);                                \
                                                                        \
int                                                                     \
varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
                                  size_t *out_nbytes_ret)               \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        /* Worst case length, plus one element so that the array        \
         * size is never zero (a zero-length variable-length array      \
         * is undefined behavior) when the input is empty. */           \
        chartype2 buf[(worst_case_len_expr) + 1];                       \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)buf;                                      \
        size_t outbytesleft = sizeof(buf);                              \
        size_t len;                                                     \
        int ret;                                                        \
                                                                        \
        len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
        if (len == (size_t)-1) {                                        \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Bytes written == space offered - space left over. */ \
                *out_nbytes_ret = sizeof(buf) - outbytesleft;           \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        return ret;                                                     \
}                                                                       \
                                                                        \
int                                                                     \
varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,   \
                               chartype2 *out)                          \
{                                                                       \
        iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
        if (cd == NULL)                                                 \
                return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
                                                                        \
        char *inbuf = (char*)in;                                        \
        size_t inbytesleft = in_nbytes;                                 \
        char *outbuf = (char*)out;                                      \
        /* The capacity of @out is not passed in, so iconv() is told    \
         * that a very large amount of space is available. */           \
        const size_t LARGE_NUMBER = 1000000000;                         \
        size_t outbytesleft = LARGE_NUMBER;                             \
        size_t len;                                                     \
        int ret;                                                        \
                                                                        \
        len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
        if (len == (size_t)-1) {                                        \
                if (!error_message_being_printed) {                     \
                        error_message_being_printed = true;             \
                        err_msg;                                        \
                        error_message_being_printed = false;            \
                }                                                       \
                ret = err_return;                                       \
        } else {                                                        \
                /* Null-terminate right after the converted data. */    \
                out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
                ret = 0;                                                \
        }                                                               \
        put_iconv(cd);                                                  \
        return ret;                                                     \
}                                                                       \
                                                                        \
int                                                                     \
varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,         \
                         chartype2 **out_ret,                           \
                         size_t *out_nbytes_ret)                        \
{                                                                       \
        int ret;                                                        \
        chartype2 *out;                                                 \
        size_t out_nbytes;                                              \
                                                                        \
        /* First pass: learn how much space the result needs. */        \
        ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,          \
                                                &out_nbytes);           \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
        /* Room for the converted data plus the null terminator. */     \
        out = MALLOC(out_nbytes + sizeof(chartype2));                   \
        if (!out)                                                       \
                return WIMLIB_ERR_NOMEM;                                \
                                                                        \
        /* Second pass: convert into the allocated buffer. */           \
        ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);       \
        if (ret) {                                                      \
                /* Keep errno from the failed conversion intact. */     \
                int errno_save = errno;                                 \
                FREE(out);                                              \
                errno = errno_save;                                     \
        } else {                                                        \
                *out_ret = out;                                         \
                *out_nbytes_ret = out_nbytes;                           \
        }                                                               \
        return ret;                                                     \
}
216
217 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
218                                  mbs, "", mbchar,
219                                  in_nbytes / 2 * MB_CUR_MAX,
220                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
221                                  ERROR("Failed to convert UTF-16LE string "
222                                        "to multibyte string!");
223                                  ERROR("This may be because the UTF-16LE data "
224                                        "could not be represented in your "
225                                        "locale's character encoding."))
226
227 DEFINE_CHAR_CONVERSION_FUNCTIONS(mbs, "", mbchar,
228                                  utf16le, "UTF-16LE", utf16lechar,
229                                  in_nbytes * 2,
230                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
231                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
232                                                   "string \"%s\" to UTF-16LE string!", in);
233                                  ERROR("If the data you provided was UTF-8, please make sure "
234                                        "the character encoding of your current locale is UTF-8."))
235
236 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", utf8char,
237                                  mbs, "", mbchar,
238                                  in_nbytes,
239                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
240                                  ERROR("Failed to convert UTF-8 string to multibyte string!");
241                                  ERROR("This may be because the UTF-8 data could not be represented "
242                                        "in your locale's character encoding."))
243
244 static void
245 iconv_cleanup(struct iconv_list_head *head)
246 {
247         pthread_mutex_destroy(&head->mutex);
248         while (!list_empty(&head->list)) {
249                 struct iconv_node *i;
250                 
251                 i = container_of(head->list.next, struct iconv_node, list);
252                 list_del(&i->list);
253                 iconv_close(i->cd);
254                 FREE(i);
255         }
256 }
257
258 void
259 iconv_global_cleanup()
260 {
261         iconv_cleanup(&iconv_utf16le_to_mbs);
262         iconv_cleanup(&iconv_mbs_to_utf16le);
263         iconv_cleanup(&iconv_utf8_to_mbs);
264 }
265
/* Return true if the null-terminated string @utf8_str contains at least one
 * byte with a value above 127 (i.e. a byte outside the ASCII range), and false
 * otherwise.  An empty string yields false.
 *
 * The terminator is tested before the pointer is advanced, so the scan never
 * reads past the end of the string, not even for an empty string. */
bool
utf8_str_contains_nonascii_chars(const utf8char *utf8_str)
{
        for (; *utf8_str != '\0'; utf8_str++) {
                /* Compare as unsigned: plain char may be signed. */
                if ((unsigned char)*utf8_str > 127)
                        return true;
        }
        return false;
}