]> wimlib.net Git - wimlib/blob - src/encoding.c
Improve char encoding support (IN PROGRESS)
[wimlib] / src / encoding.c
1 /*
2  * encoding.c:  Convert "multibyte" strings (the locale-default encoding---
3  * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice
4  * versa.
5  */
6
7 /*
8  * Copyright (C) 2012, 2013 Eric Biggers
9  *
10  * This file is part of wimlib, a library for working with WIM files.
11  *
12  * wimlib is free software; you can redistribute it and/or modify it under the
13  * terms of the GNU General Public License as published by the Free
14  * Software Foundation; either version 3 of the License, or (at your option)
15  * any later version.
16  *
17  * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
18  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
19  * A PARTICULAR PURPOSE. See the GNU General Public License for more
20  * details.
21  *
22  * You should have received a copy of the GNU General Public License
23  * along with wimlib; if not, see http://www.gnu.org/licenses/.
24  */
25
#include "config.h"
#include "wimlib_internal.h"
#include <pthread.h>
#include "list.h"

#include <errno.h>
#include <iconv.h>
#include <stdlib.h>
33
/* Global flag, initially false.  Presumably records whether the locale's
 * multibyte encoding is UTF-8 (going by the name only; nothing in the
 * visible part of this file reads or writes it) -- TODO confirm. */
bool wimlib_mbs_is_utf8 = false;
35
36 struct iconv_list_head {
37         const char *from_encoding;
38         const char *to_encoding;
39         struct list_head list;
40         pthread_mutex_t mutex;
41 };
42
43 struct iconv_node {
44         iconv_t cd;
45         struct list_head list;
46         struct iconv_list_head *head;
47 };
48
49 #define ICONV_LIST(name, from, to)                      \
50 struct iconv_list_head name = {                         \
51         .from_encoding = from,                          \
52         .to_encoding = to,                              \
53         .list = LIST_HEAD_INIT(name.list),              \
54         .mutex = PTHREAD_MUTEX_INITIALIZER,             \
55 }
56
57 static ICONV_LIST(iconv_mbs_to_utf16le, "", "UTF-16LE");
58 static ICONV_LIST(iconv_utf16le_to_mbs, "UTF-16LE", "");
59
60 static iconv_t *
61 get_iconv(struct iconv_list_head *head)
62 {
63         iconv_t cd;
64         struct iconv_node *i;
65
66         pthread_mutex_lock(&head->mutex);
67         if (list_empty(&head->list)) {
68                 cd = iconv_open(head->to_encoding, head->from_encoding);
69                 if (cd == (iconv_t)-1) {
70                         goto out_unlock;
71                 } else {
72                         i = MALLOC(sizeof(struct iconv_node));
73                         if (!i) {
74                                 iconv_close(cd);
75                                 cd = (iconv_t)-1;
76                                 goto out_unlock;
77                         }
78                         i->head = head;
79                 }
80         } else {
81                 i = container_of(head->list.next, struct iconv_node, list);
82                 list_del(head->list.next);
83         }
84         cd = i->cd;
85 out_unlock:
86         pthread_mutex_unlock(&head->mutex);
87         return cd;
88 }
89
90 static void
91 put_iconv(iconv_t *cd)
92 {
93         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
94         struct iconv_list_head *head = i->head;
95         
96         pthread_mutex_lock(&head->mutex);
97         list_add(&i->list, &head->list);
98         pthread_mutex_unlock(&head->mutex);
99 }
100
101 int
102 mbs_to_utf16le_nbytes(const mbchar *mbs, size_t mbs_nbytes,
103                       size_t *utf16le_nbytes_ret)
104 {
105         iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le);
106         if (*cd == (iconv_t)-1)
107                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
108
109         /* Worst case length */
110         utf16lechar buf[mbs_nbytes * 2];
111         char *inbuf = (char*)mbs;
112         char *outbuf = (char*)buf;
113         size_t outbytesleft = sizeof(buf);
114         size_t inbytesleft = mbs_nbytes;
115         size_t len;
116         int ret;
117
118         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
119         if (len == (size_t)-1) {
120                 ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING;
121         } else {
122                 *utf16le_nbytes_ret = sizeof(buf) - outbytesleft;
123                 ret = 0;
124         }
125         put_iconv(cd);
126         return ret;
127 }
128
129
130 int
131 utf16le_to_mbs_nbytes(const utf16lechar *utf16le_str, size_t utf16le_nbytes,
132                       size_t *mbs_nbytes_ret)
133 {
134         iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs);
135         if (*cd == (iconv_t)-1)
136                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
137
138         /* Worst case length */
139         mbchar buf[utf16le_nbytes / 2 * MB_CUR_MAX];
140         char *inbuf = (char*)utf16le_str;
141         char *outbuf = (char*)buf;
142         size_t outbytesleft = sizeof(buf);
143         size_t inbytesleft = utf16le_nbytes;
144         size_t len;
145         int ret;
146
147         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
148         if (len == (size_t)-1) {
149                 ERROR("Could not convert \"%W\" to encoding of current locale",
150                       utf16le_str);
151                 /* EILSEQ is supposed to mean that the *input* is invalid, but
152                  * it's also returned if any input characters are not
153                  * representable in the output encoding.  (The actual behavior
154                  * in this case is undefined for some reason...).  Assume it's
155                  * the latter error case. */
156                 ret = WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE;
157         } else {
158                 *mbs_nbytes_ret  = sizeof(buf) - outbytesleft;
159                 ret = 0;
160         }
161         put_iconv(cd);
162         return ret;
163 }
164
165 int
166 mbs_to_utf16le_buf(const mbchar *mbs, size_t mbs_nbytes,
167                    utf16lechar *utf16le_str)
168 {
169         iconv_t *cd = get_iconv(&iconv_mbs_to_utf16le);
170         if (*cd == (iconv_t)-1)
171                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
172
173         char *inbuf = (char*)mbs;
174         size_t inbytesleft = mbs_nbytes;
175         char *outbuf = (char*)utf16le_str;
176         size_t outbytesleft = SIZE_MAX;
177         size_t len;
178         int ret;
179
180         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
181         if (len == (size_t)-1) {
182                 ret = WIMLIB_ERR_INVALID_MULTIBYTE_STRING;
183         } else {
184                 ret = 0;
185         }
186         put_iconv(cd);
187         return ret;
188 }
189
190 int
191 utf16le_to_mbs_buf(const utf16lechar *utf16le_str, size_t utf16le_nbytes,
192                    mbchar *mbs)
193 {
194         int ret;
195         iconv_t *cd = get_iconv(&iconv_utf16le_to_mbs);
196         if (*cd == (iconv_t)-1)
197                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
198
199         char *inbuf = (char*)utf16le_str;
200         size_t inbytesleft;
201         char *outbuf = (char*)mbs;
202         size_t outbytesleft = SIZE_MAX;
203         size_t len;
204
205         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
206         if (len == (size_t)-1) {
207                 ret = WIMLIB_ERR_INVALID_UTF16_STRING;
208         } else {
209                 ret = 0;
210         }
211         mbs[SIZE_MAX - inbytesleft] = '\0';
212         put_iconv(cd);
213         return ret;
214 }
215
216 int
217 mbs_to_utf16le(const mbchar *mbs, size_t mbs_nbytes,
218                utf16lechar **utf16le_ret, size_t *utf16le_nbytes_ret)
219 {
220         int ret;
221         utf16lechar *utf16le_str;
222         size_t utf16le_nbytes;
223
224         ret = mbs_to_utf16le_nbytes(mbs, mbs_nbytes,
225                                     &utf16le_nbytes);
226         if (ret)
227                 return ret;
228
229         utf16le_str = MALLOC(utf16le_nbytes + 1);
230         if (!utf16le_str)
231                 return WIMLIB_ERR_NOMEM;
232
233         ret = mbs_to_utf16le_buf(mbs, mbs_nbytes, utf16le_str);
234         if (ret) {
235                 FREE(utf16le_str);
236         } else {
237                 *utf16le_ret = utf16le_str;
238                 *utf16le_nbytes_ret = utf16le_nbytes;
239         }
240         return ret;
241 }
242
243
244 int
245 utf16le_to_mbs(const utf16lechar *utf16le_str, size_t utf16le_nbytes,
246                mbchar **mbs_ret, size_t *mbs_nbytes_ret)
247 {
248         int ret;
249         mbchar *mbs;
250         size_t mbs_nbytes;
251
252         ret = utf16le_to_mbs_nbytes(utf16le_str, utf16le_nbytes,
253                                     &mbs_nbytes);
254         if (ret)
255                 return ret;
256
257         mbs = MALLOC(mbs_nbytes + 1);
258         if (!mbs)
259                 return WIMLIB_ERR_NOMEM;
260
261         ret = utf16le_to_mbs_buf(utf16le_str, utf16le_nbytes, mbs);
262         if (ret) {
263                 FREE(mbs);
264         } else {
265                 *mbs_ret = mbs;
266                 *mbs_nbytes_ret = mbs_nbytes;
267         }
268         return ret;
269 }
270
271 bool
272 utf8_str_contains_nonascii_chars(const utf8char *utf8_str)
273 {
274         do {
275                 if ((unsigned char)*utf8_str > 127)
276                         return false;
277         } while (*++utf8_str);
278         return true;
279 }