6 * Copyright (C) 2012, 2013 Eric Biggers
8 * This file is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU Lesser General Public License as published by the Free
10 * Software Foundation; either version 3 of the License, or (at your option) any
13 * This file is distributed in the hope that it will be useful, but WITHOUT
14 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this file; if not, see http://www.gnu.org/licenses/.
32 #include "wimlib/alloca.h"
33 #include "wimlib/assert.h"
34 #include "wimlib/encoding.h"
35 #include "wimlib/endianness.h"
36 #include "wimlib/error.h"
37 #include "wimlib/list.h"
38 #include "wimlib/util.h"
/* Consulted by the conversion functions generated by
 * DEFINE_CHAR_CONVERSION_FUNCTIONS(): when this flag is set (and the
 * instantiation opts in), the input is copied to the output instead of being
 * run through iconv.  Initialized to the negation of TCHAR_IS_UTF16LE.
 * NOTE(review): presumably updated elsewhere once the locale's encoding is
 * known -- not visible here. */
41 bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE;
43 /* List of iconv_t conversion descriptors for a specific character conversion.
44 * The idea is that it is not thread-safe to have just one conversion
45 * descriptor, but it also is inefficient to open a new conversion descriptor to
46 * convert every string. Both these problems can be solved by maintaining a
47 * list of conversion descriptors; then, a thread can use an existing conversion
48 * descriptor if available. */
49 struct iconv_list_head {
/* Encoding names handed to iconv_open() when a new descriptor is needed. */
50 const char *from_encoding;
51 const char *to_encoding;
/* Idle conversion descriptors (struct iconv_node) available for reuse. */
52 struct list_head list;
/* Held while the list above is inspected or modified. */
53 pthread_mutex_t mutex;
/* NOTE(review): these two fields appear to belong to struct iconv_node, whose
 * opening lines are outside this view. */
/* Links the node into the owning iconv_list_head's list of idle descriptors. */
58 struct list_head list;
/* Back-pointer to the owning list; put_iconv() follows it to return the node. */
59 struct iconv_list_head *head;
/* Define an iconv_list_head variable called @name whose from_encoding is
 * @from.  NOTE(review): only the from_encoding initializer is visible here;
 * @to is presumably stored in to_encoding on a following line. */
62 #define ICONV_LIST(name, from, to) \
63 struct iconv_list_head name = { \
64 .from_encoding = from, \
/* Get a conversion descriptor for the conversion described by @head.
 *
 * An idle descriptor is taken from @head->list if there is one; otherwise a new
 * descriptor is opened with iconv_open().  @head->mutex is held while the list
 * is examined.  Callers in this file store the result in an iconv_t pointer.
 * NOTE(review): the value returned on failure is not visible in this view
 * (presumably NULL). */
69 get_iconv(struct iconv_list_head *head)
75 pthread_mutex_lock(&head->mutex);
76 if (list_empty(&head->list)) {
/* Nothing to reuse: open a fresh descriptor for this conversion. */
77 cd = iconv_open(head->to_encoding, head->from_encoding);
78 if (cd == (iconv_t)-1) {
79 ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
80 head->from_encoding, head->to_encoding);
/* Allocate the list node for the new descriptor (the lines that fill it in are
 * not visible here). */
83 i = MALLOC(sizeof(struct iconv_node));
/* Reuse the first idle descriptor and unlink it from the list. */
94 i = container_of(head->list.next, struct iconv_node, list);
95 list_del(head->list.next);
98 pthread_mutex_unlock(&head->mutex);
/* Hand a conversion descriptor back to its owning list so that a later
 * get_iconv() can reuse it.  @cd must point at the cd member of a
 * struct iconv_node, because the node is recovered from it with
 * container_of(). */
103 put_iconv(iconv_t *cd)
/* Remember errno on entry.  NOTE(review): presumably restored before
 * returning; that line is not visible here. */
105 int errno_save = errno;
106 struct iconv_node *i = container_of(cd, struct iconv_node, cd);
107 struct iconv_list_head *head = i->head;
/* Put the node back on the idle list under the list's mutex. */
109 pthread_mutex_lock(&head->mutex);
110 list_add(&i->list, &head->list);
111 pthread_mutex_unlock(&head->mutex);
/* Generate the conversion routines from encoding @longname1 (code units of
 * type @chartype1) to encoding @longname2 (code units of type @chartype2).
 *
 * The expansion declares a static iconv list iconv_<varname1>_to_<varname2>
 * and defines three functions:
 *
 * <varname1>_to_<varname2>_nbytes():  converts @in into a scratch buffer of
 *	(@worst_case_len_expr) * sizeof(chartype2) bytes and reports the number
 *	of output bytes produced in *out_nbytes_ret.  The scratch buffer comes
 *	from alloca() when it is at most STACK_MAX bytes, otherwise from
 *	MALLOC().
 *
 * <varname1>_to_<varname2>_buf():  converts @in into the caller-provided
 *	buffer @out and appends a null code unit.  iconv() is told that
 *	LARGE_NUMBER bytes of space are available, so the caller is responsible
 *	for making @out large enough.
 *
 * <varname1>_to_<varname2>():  allocates the output.  If
 *	@earlyreturn_on_utf8_locale is true and wimlib_mbs_is_utf8 is set, the
 *	input bytes are copied unchanged; otherwise the size is computed with
 *	the _nbytes() function, a buffer of that size plus one null code unit
 *	is allocated, and the _buf() function fills it.  The byte count is
 *	returned through *out_nbytes_ret (and the buffer presumably through
 *	*out_ret).
 *
 * Failure returns visible here are WIMLIB_ERR_ICONV_NOT_AVAILABLE (presumably
 * when get_iconv() fails) and WIMLIB_ERR_NOMEM (allocation failure).
 * NOTE(review): part of the parameter list and the handling of iconv() errors
 * are outside this view. */
115 #define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
116 varname2, longname2, chartype2,\
117 earlyreturn_on_utf8_locale, \
119 worst_case_len_expr, \
123 static ICONV_LIST(iconv_##varname1##_to_##varname2, \
124 longname1, longname2); \
127 varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
128 size_t *out_nbytes_ret) \
130 iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \
132 return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \
137 bufsize = (worst_case_len_expr) * sizeof(chartype2); \
138 /* Worst case length */ \
139 if (bufsize <= STACK_MAX) { \
140 buf = alloca(bufsize); \
141 buf_onheap = false; \
143 buf = MALLOC(bufsize); \
145 return WIMLIB_ERR_NOMEM; \
149 char *inbuf = (char*)in; \
150 size_t inbytesleft = in_nbytes; \
151 char *outbuf = (char*)buf; \
152 size_t outbytesleft = bufsize; \
156 len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
157 if (len == (size_t)-1) { \
161 *out_nbytes_ret = bufsize - outbytesleft; \
171 varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes, \
174 iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2); \
176 return WIMLIB_ERR_ICONV_NOT_AVAILABLE; \
178 char *inbuf = (char*)in; \
179 size_t inbytesleft = in_nbytes; \
180 char *outbuf = (char*)out; \
181 const size_t LARGE_NUMBER = 1000000000; \
182 size_t outbytesleft = LARGE_NUMBER; \
186 len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
187 if (len == (size_t)-1) { \
191 out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
199 varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes, \
200 chartype2 **out_ret, \
201 size_t *out_nbytes_ret) \
207 if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) { \
209 /* Out same as in */ \
210 out = MALLOC(in_nbytes + sizeof(chartype2)); \
212 return WIMLIB_ERR_NOMEM; \
213 memcpy(out, in, in_nbytes); \
214 out[in_nbytes / sizeof(chartype2)] = 0; \
216 *out_nbytes_ret = in_nbytes; \
220 ret = varname1##_to_##varname2##_nbytes(in, in_nbytes, \
225 out = MALLOC(out_nbytes + sizeof(chartype2)); \
227 return WIMLIB_ERR_NOMEM; \
229 ret = varname1##_to_##varname2##_buf(in, in_nbytes, out); \
234 *out_nbytes_ret = out_nbytes; \
239 #if !TCHAR_IS_UTF16LE
/* utf8_to_utf16le{,_nbytes,_buf}(): "UTF-8" (tchar units) to "UTF-16LE"
 * (utf16lechar units).  WIMLIB_ERR_INVALID_UTF8_STRING is the error code
 * supplied for this conversion. */
243 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar,
244 utf16le, "UTF-16LE", utf16lechar,
248 WIMLIB_ERR_INVALID_UTF8_STRING,
249 ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
250 "to UTF-16LE string!"),
/* utf16le_to_utf8{,_nbytes,_buf}(): "UTF-16LE" (utf16lechar units) to "UTF-8"
 * (tchar units).  WIMLIB_ERR_INVALID_UTF16_STRING is the error code supplied
 * for this conversion. */
253 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
254 utf8, "UTF-8", tchar,
258 WIMLIB_ERR_INVALID_UTF16_STRING,
259 ERROR_WITH_ERRNO("Failed to convert UTF-16LE string "
/* tstr_to_utf16le{,_nbytes,_buf}(): tchar string to "UTF-16LE".  The source
 * encoding name is the empty string; NOTE(review): presumably this selects the
 * current locale's encoding (the error text below refers to it) -- confirm
 * against the iconv_open() documentation.  A "return utf8_to_utf16le(...)"
 * statement is passed as a macro argument, presumably to delegate to the UTF-8
 * routine on the early-return path.  WIMLIB_ERR_INVALID_MULTIBYTE_STRING is
 * the error code supplied for this conversion. */
263 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
264 utf16le, "UTF-16LE", utf16lechar,
266 return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret),
268 WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
269 ERROR_WITH_ERRNO("Failed to convert multibyte "
270 "string \"%"TS"\" to UTF-16LE string!", in);
271 ERROR("If the data you provided was UTF-8, please make sure "
272 "the character encoding\n"
273 " of your current locale is UTF-8."),
/* Conversion from "UTF-16LE" (utf16lechar units).  NOTE(review): the line
 * naming the destination is not visible here; the error text refers to a
 * multibyte string in the locale's encoding.  A "return utf16le_to_utf8(...)"
 * statement is passed as a macro argument, presumably to delegate to the UTF-8
 * routine on the early-return path.  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE
 * is the error code supplied for this conversion. */
276 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
279 return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret),
281 WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
282 ERROR("Failed to convert UTF-16LE string to "
283 "multibyte string!");
284 ERROR("This may be because the UTF-16LE string "
285 "could not be represented\n"
286 " in your locale's character encoding."),
290 /* tchar to UTF-8 and back */
/* Conversion from a tchar string whose encoding is named "UTF-16LE".
 * NOTE(review): the line naming the destination is not visible here; the
 * error text says UTF-8.  WIMLIB_ERR_INVALID_UTF16_STRING is the error code
 * supplied for this conversion. */
294 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar,
299 WIMLIB_ERR_INVALID_UTF16_STRING,
300 ERROR_WITH_ERRNO("Failed to convert UTF-16LE "
301 "string \"%"TS"\" to UTF-8 string!", in),
/* utf8_to_tstr{,_nbytes,_buf}(): "UTF-8" (char units) to a tchar string whose
 * encoding is named "UTF-16LE".  WIMLIB_ERR_INVALID_UTF8_STRING is the error
 * code supplied for this conversion. */
304 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
305 tstr, "UTF-16LE", tchar,
309 WIMLIB_ERR_INVALID_UTF8_STRING,
310 ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
311 "to UTF-16LE string!"),
/* Conversion from a tchar string whose encoding name is the empty string.
 * NOTE(review): presumably the current locale's encoding (the error text
 * below refers to it); the line naming the destination is not visible here,
 * but the error text says UTF-8.  WIMLIB_ERR_INVALID_MULTIBYTE_STRING is the
 * error code supplied for this conversion. */
317 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
322 WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
323 ERROR_WITH_ERRNO("Failed to convert multibyte "
324 "string \"%"TS"\" to UTF-8 string!", in);
325 ERROR("If the data you provided was UTF-8, please make sure "
327 " encoding of your current locale is UTF-8."),
/* Conversion from "UTF-8" (char units).  NOTE(review): the line naming the
 * destination is not visible here; the error text refers to a multibyte
 * string in the locale's encoding.
 * WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE is the error code supplied for
 * this conversion. */
330 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
335 WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
336 ERROR("Failed to convert UTF-8 string to "
337 "multibyte string!");
338 ERROR("This may be because the UTF-8 data "
339 "could not be represented\n"
340 " in your locale's character encoding."),
/* Convenience wrapper around tstr_to_utf8() for a string whose length is found
 * with tstrlen(); the length is passed on in bytes.  The result of
 * tstr_to_utf8() is returned as-is.  NOTE(review): the remaining arguments of
 * the call are not visible here; the converted string is presumably returned
 * through @out. */
345 tstr_to_utf8_simple(const tchar *tstr, char **out)
348 return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar),
/* Convenience wrapper around utf8_to_tstr() for a null-terminated UTF-8
 * string: the input length comes from strlen() and @out is passed through as
 * the output pointer.  The result of utf8_to_tstr() is returned as-is.  The
 * output byte count goes to out_nbytes, which is not part of this function's
 * interface (its declaration is not visible here). */
353 utf8_to_tstr_simple(const char *utf8str, tchar **out)
356 return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes);
/* Prepare a conversion list for use: initialize its mutex with default
 * attributes and make its list of idle descriptors empty. */
360 iconv_init(struct iconv_list_head *head)
362 pthread_mutex_init(&head->mutex, NULL);
363 INIT_LIST_HEAD(&head->list);
/* Tear down a conversion list: destroy its mutex, then process the idle nodes
 * one at a time, always taking the node at the front of the list.
 * NOTE(review): the statements that unlink and release each node are not
 * visible here. */
367 iconv_cleanup(struct iconv_list_head *head)
369 pthread_mutex_destroy(&head->mutex);
370 while (!list_empty(&head->list)) {
371 struct iconv_node *i;
/* First node remaining on the list. */
373 i = container_of(head->list.next, struct iconv_node, list);
/* Initialize the conversion lists defined in this file.  The four lists
 * following the #if are initialized only when TCHAR_IS_UTF16LE is false. */
381 iconv_global_init(void)
383 iconv_init(&iconv_utf8_to_tstr);
384 iconv_init(&iconv_tstr_to_utf8);
385 #if !TCHAR_IS_UTF16LE
386 iconv_init(&iconv_utf16le_to_tstr);
387 iconv_init(&iconv_tstr_to_utf16le);
388 iconv_init(&iconv_utf16le_to_utf8);
389 iconv_init(&iconv_utf8_to_utf16le);
/* Clean up the same conversion lists that iconv_global_init() initializes.
 * The four lists following the #if are cleaned up only when TCHAR_IS_UTF16LE
 * is false. */
394 iconv_global_cleanup(void)
396 iconv_cleanup(&iconv_utf8_to_tstr);
397 iconv_cleanup(&iconv_tstr_to_utf8);
398 #if !TCHAR_IS_UTF16LE
399 iconv_cleanup(&iconv_utf16le_to_tstr);
400 iconv_cleanup(&iconv_tstr_to_utf16le);
401 iconv_cleanup(&iconv_utf16le_to_utf8);
402 iconv_cleanup(&iconv_utf8_to_utf16le);
406 /* A table that maps from UCS-2 characters to their upper case equivalents.
407 * Index and array values are both CPU endian.
408 * Note: this is only an *approximation* of real UTF-16 case folding.
415 /* This is the table used in NTFS volumes formatted by Windows 10.
416 * It was compressed by tools/compress_upcase_table.c. */
/* The decoder in this file reads this array two u16 values at a time: the
 * first value of each pair is taken as `length`, the second as `src_pos`. */
417 static const u16 upcase_compressed[] = {
418 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
419 0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
420 0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
421 0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
422 0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
423 0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
424 0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
425 0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
426 0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
427 0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
428 0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
429 0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
430 0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
431 0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
432 0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
433 0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
434 0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
435 0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
436 0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
437 0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
438 0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
439 0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
440 0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
441 0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
442 0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
443 0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
444 0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
445 0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
446 0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
447 0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
448 0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
449 0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
450 0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
451 0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
452 0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
453 0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
454 0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
455 0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
456 0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
457 0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
458 0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
459 0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
460 0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
461 0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
462 0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
463 0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
464 0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
468 /* Simple LZ decoder */
469 const u16 *in_next = upcase_compressed;
/* The loop has no increment clause: i advances only as entries of upcase[] are
 * written, so the loop ends once the whole table has been filled. */
470 for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
/* Each step consumes one (length, src_pos) pair from the compressed data. */
471 u16 length = *in_next++;
472 u16 src_pos = *in_next++;
/* NOTE(review): presumably the literal case -- src_pos is stored directly as
 * the next table entry; the condition selecting it is not visible here. */
475 upcase[i++] = src_pos;
/* Copy an entry from position src_pos of the table being built.
 * NOTE(review): the loop that repeats this per `length` is not visible here. */
479 upcase[i++] = upcase[src_pos++];
/* Second pass over every entry of upcase[]; its body is not visible here. */
485 for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
/* Spot checks of the decoded table: lower-case letters map to their upper-case
 * forms, while upper-case letters, digits and caseless symbols map to
 * themselves. */
490 wimlib_assert(upcase['a'] == 'A');
491 wimlib_assert(upcase['A'] == 'A');
492 wimlib_assert(upcase['z'] == 'Z');
493 wimlib_assert(upcase['Z'] == 'Z');
494 wimlib_assert(upcase['1'] == '1');
495 wimlib_assert(upcase[0x00e9] == 0x00c9); /* Latin letter e, with acute accent */
496 wimlib_assert(upcase[0x00c9] == 0x00c9);
497 wimlib_assert(upcase[0x03c1] == 0x03a1); /* Greek letter rho */
498 wimlib_assert(upcase[0x03a1] == 0x03a1);
499 wimlib_assert(upcase[0x0436] == 0x0416); /* Cyrillic letter zhe */
500 wimlib_assert(upcase[0x0416] == 0x0416);
501 wimlib_assert(upcase[0x0567] == 0x0537); /* Armenian letter eh */
502 wimlib_assert(upcase[0x0537] == 0x0537);
503 wimlib_assert(upcase[0x24d0] == 0x24b6); /* Circled Latin letter A
504 (is that a real character???) */
505 wimlib_assert(upcase[0x24b6] == 0x24b6);
506 wimlib_assert(upcase[0x2603] == 0x2603); /* Note to self: Upper case
507 snowman symbol does not
512 /* Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
513 * case-insensitively (%ignore_case == true).
515 * This is implemented using the default upper-case table used by NTFS. It does
516 * not handle all possible cases allowed by UTF-16LE. For example, different
517 * normalizations of the same sequence of "characters" are not considered equal.
518 * It hopefully does the right thing most of the time though. */
/* @n1 and @n2 are the string lengths in UTF-16 code units.  Returns -1 or 1
 * according to the first position at which the strings differ within their
 * common length; if there is none, the lengths decide.
 * NOTE(review): the return for equal strings is not visible here
 * (presumably 0). */
520 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
521 const utf16lechar *s2, size_t n2,
/* Only the common prefix can be compared code unit by code unit. */
524 size_t n = min(n1, n2);
/* Case-insensitive path: compare code units after mapping through upcase[]. */
527 for (size_t i = 0; i < n; i++) {
528 u16 c1 = upcase[le16_to_cpu(s1[i])];
529 u16 c2 = upcase[le16_to_cpu(s2[i])];
531 return (c1 < c2) ? -1 : 1;
/* Case-sensitive path: compare the code units in CPU byte order. */
534 for (size_t i = 0; i < n; i++) {
535 u16 c1 = le16_to_cpu(s1[i]);
536 u16 c2 = le16_to_cpu(s2[i]);
538 return (c1 < c2) ? -1 : 1;
/* Common prefix is identical: the shorter string sorts first. */
543 return (n1 < n2) ? -1 : 1;
546 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated. */
/* Returns -1 or 1 according to the first differing code unit.
 * NOTE(review): the return for equal strings and the loop around the
 * case-insensitive comparison are not visible here. */
548 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
/* Case-insensitive path: compare the current code units via upcase[]. */
553 u16 c1 = upcase[le16_to_cpu(*s1)];
554 u16 c2 = upcase[le16_to_cpu(*s2)];
556 return (c1 < c2) ? -1 : 1;
/* Case-sensitive path: walk the common prefix (stopping at s1's terminator),
 * then order by the first differing code unit in CPU byte order. */
562 while (*s1 && *s1 == *s2)
566 return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
570 /* Duplicate a UTF-16LE string. The input string might not be null terminated
571 * and might be misaligned, but the returned string is guaranteed to be null
572 * terminated and properly aligned. */
/* @usize is the length of @ustr in bytes. */
574 utf16le_dupz(const void *ustr, size_t usize)
/* Room for the copied bytes plus one null code unit. */
576 utf16lechar *dup = MALLOC(usize + sizeof(utf16lechar));
/* NOTE(review): the line between the allocation and the copy is not visible
 * here; confirm that the allocation result is checked before use. */
578 memcpy(dup, ustr, usize);
/* Terminate right after the copied code units. */
579 dup[usize / sizeof(utf16lechar)] = 0;
584 /* Duplicate a null-terminated UTF-16LE string. */
/* The copy is made with memdup() and its result is returned directly. */
586 utf16le_dup(const utf16lechar *ustr)
/* Cursor used to find the end of the string.  NOTE(review): the scanning loop
 * is not visible here, so whether p ends on or past the terminator (and thus
 * whether the terminator is part of the copy) should be confirmed. */
588 const utf16lechar *p = ustr;
/* Copy as many bytes as lie between the start of the string and p. */
591 return memdup(ustr, (const u8 *)p - (const u8 *)ustr);
594 /* Return the length, in bytes, of a null-terminated UTF-16LE string,
595 * excluding the null terminator. */
597 utf16le_len_bytes(const utf16lechar *s)
/* Cursor advanced to the terminator; the scanning loop is not visible here. */
599 const utf16lechar *p = s;
/* Code units scanned, converted to a byte count. */
602 return (p - s) * sizeof(utf16lechar);
605 /* Return the length, in UTF-16 code units, of a null-terminated UTF-16LE
606 * string, excluding the null terminator. */
608 utf16le_len_chars(const utf16lechar *s)
/* Derived from the byte length: each code unit is sizeof(utf16lechar) bytes. */
610 return utf16le_len_bytes(s) / sizeof(utf16lechar);