wimlib.net Git - wimlib/blob - src/encoding.c

   1 /*
   2  * encoding.c
   3  */
   4
   5 /*
   6  * Copyright (C) 2012, 2013 Eric Biggers
   7  *
   8  * This file is free software; you can redistribute it and/or modify it under
   9  * the terms of the GNU Lesser General Public License as published by the Free
  10  * Software Foundation; either version 3 of the License, or (at your option) any
  11  * later version.
  12  *
  13  * This file is distributed in the hope that it will be useful, but WITHOUT
  14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  15  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  16  * details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public License
  19  * along with this file; if not, see http://www.gnu.org/licenses/.
  20  */
  21
  22 #ifdef HAVE_CONFIG_H
  23 #  include "config.h"
  24 #endif
  25
  26 #include <errno.h>
  27 #include <iconv.h>
  28 #include <pthread.h>
  29 #include <string.h>
  30
  31 #include "wimlib.h"
  32 #include "wimlib/alloca.h"
  33 #include "wimlib/assert.h"
  34 #include "wimlib/encoding.h"
  35 #include "wimlib/endianness.h"
  36 #include "wimlib/error.h"
  37 #include "wimlib/list.h"
  38 #include "wimlib/util.h"
  39
  40
  41 bool wimlib_mbs_is_utf8 = !TCHAR_IS_UTF16LE;
  42
  43 /* List of iconv_t conversion descriptors for a specific character conversion.
  44  * The idea is that it is not thread-safe to have just one conversion
  45  * descriptor, but it also is inefficient to open a new conversion descriptor to
  46  * convert every string.  Both these problems can be solved by maintaining a
  47  * list of conversion descriptors; then, a thread can use an existing conversion
  48  * descriptor if available. */
  49 struct iconv_list_head {
  50         const char *from_encoding;
  51         const char *to_encoding;
  52         struct list_head list;
  53         pthread_mutex_t mutex;
  54 };
  55
  56 struct iconv_node {
  57         iconv_t cd;
  58         struct list_head list;
  59         struct iconv_list_head *head;
  60 };
  61
  62 #define ICONV_LIST(name, from, to)                      \
  63 struct iconv_list_head name = {                         \
  64         .from_encoding = from,                          \
  65         .to_encoding = to,                              \
  66 }
  67
  68 static iconv_t *
  69 get_iconv(struct iconv_list_head *head)
  70 {
  71         iconv_t cd;
  72         iconv_t *cd_p;
  73         struct iconv_node *i;
  74
  75         pthread_mutex_lock(&head->mutex);
  76         if (list_empty(&head->list)) {
  77                 cd = iconv_open(head->to_encoding, head->from_encoding);
  78                 if (cd == (iconv_t)-1) {
  79                         ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
  80                                          head->from_encoding, head->to_encoding);
  81                         cd_p = NULL;
  82                 } else {
  83                         i = MALLOC(sizeof(struct iconv_node));
  84                         if (i) {
  85                                 i->head = head;
  86                                 i->cd = cd;
  87                                 cd_p = &i->cd;
  88                         } else {
  89                                 iconv_close(cd);
  90                                 cd_p = NULL;
  91                         }
  92                 }
  93         } else {
  94                 i = container_of(head->list.next, struct iconv_node, list);
  95                 list_del(head->list.next);
  96                 cd_p = &i->cd;
  97         }
  98         pthread_mutex_unlock(&head->mutex);
  99         return cd_p;
 100 }
 101
 102 static void
 103 put_iconv(iconv_t *cd)
 104 {
 105         int errno_save = errno;
 106         struct iconv_node *i = container_of(cd, struct iconv_node, cd);
 107         struct iconv_list_head *head = i->head;
 108
 109         pthread_mutex_lock(&head->mutex);
 110         list_add(&i->list, &head->list);
 111         pthread_mutex_unlock(&head->mutex);
 112         errno = errno_save;
 113 }
 114
 115 #define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
 116                                          varname2, longname2, chartype2,\
 117                                          earlyreturn_on_utf8_locale,    \
 118                                          earlyreturn_expr,              \
 119                                          worst_case_len_expr,           \
 120                                          err_return,                    \
 121                                          err_msg,                       \
 122                                          modifier)                      \
 123 static ICONV_LIST(iconv_##varname1##_to_##varname2,                     \
 124                   longname1, longname2);                                \
 125                                                                         \
 126 modifier int                                                            \
 127 varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
 128                                   size_t *out_nbytes_ret)               \
 129 {                                                                       \
 130         iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
 131         if (cd == NULL)                                                 \
 132                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
 133                                                                         \
 134         chartype2 *buf;                                                 \
 135         size_t bufsize;                                                 \
 136         bool buf_onheap;                                                \
 137         bufsize = (worst_case_len_expr) * sizeof(chartype2);            \
 138         /* Worst case length */                                         \
 139         if (bufsize <= STACK_MAX) {                                     \
 140                 buf = alloca(bufsize);                                  \
 141                 buf_onheap = false;                                     \
 142         } else {                                                        \
 143                 buf = MALLOC(bufsize);                                  \
 144                 if (!buf)                                               \
 145                         return WIMLIB_ERR_NOMEM;                        \
 146                 buf_onheap = true;                                      \
 147         }                                                               \
 148                                                                         \
 149         char *inbuf = (char*)in;                                        \
 150         size_t inbytesleft = in_nbytes;                                 \
 151         char *outbuf = (char*)buf;                                      \
 152         size_t outbytesleft = bufsize;                                  \
 153         size_t len;                                                     \
 154         int ret;                                                        \
 155                                                                         \
 156         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
 157         if (len == (size_t)-1) {                                        \
 158                 err_msg;                                                \
 159                 ret = err_return;                                       \
 160         } else {                                                        \
 161                 *out_nbytes_ret = bufsize - outbytesleft;               \
 162                 ret = 0;                                                \
 163         }                                                               \
 164         put_iconv(cd);                                                  \
 165         if (buf_onheap)                                                 \
 166                 FREE(buf);                                              \
 167         return ret;                                                     \
 168 }                                                                       \
 169                                                                         \
 170 modifier int                                                            \
 171 varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,   \
 172                                chartype2 *out)                          \
 173 {                                                                       \
 174         iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);     \
 175         if (cd == NULL)                                                 \
 176                 return WIMLIB_ERR_ICONV_NOT_AVAILABLE;                  \
 177                                                                         \
 178         char *inbuf = (char*)in;                                        \
 179         size_t inbytesleft = in_nbytes;                                 \
 180         char *outbuf = (char*)out;                                      \
 181         const size_t LARGE_NUMBER = 1000000000;                         \
 182         size_t outbytesleft = LARGE_NUMBER;                             \
 183         size_t len;                                                     \
 184         int ret;                                                        \
 185                                                                         \
 186         len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); \
 187         if (len == (size_t)-1) {                                        \
 188                 err_msg;                                                \
 189                 ret = err_return;                                       \
 190         } else {                                                        \
 191                 out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0; \
 192                 ret = 0;                                                \
 193         }                                                               \
 194         put_iconv(cd);                                                  \
 195         return ret;                                                     \
 196 }                                                                       \
 197                                                                         \
 198 modifier int                                                            \
 199 varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,         \
 200                          chartype2 **out_ret,                           \
 201                          size_t *out_nbytes_ret)                        \
 202 {                                                                       \
 203         int ret;                                                        \
 204         chartype2 *out;                                                 \
 205         size_t out_nbytes;                                              \
 206                                                                         \
 207         if (earlyreturn_on_utf8_locale && wimlib_mbs_is_utf8) {         \
 208                 earlyreturn_expr;                                       \
 209                 /* Out same as in */                                    \
 210                 out = MALLOC(in_nbytes + sizeof(chartype2));            \
 211                 if (!out)                                               \
 212                         return WIMLIB_ERR_NOMEM;                        \
 213                 memcpy(out, in, in_nbytes);                             \
 214                 out[in_nbytes / sizeof(chartype2)] = 0;                 \
 215                 *out_ret = out;                                         \
 216                 *out_nbytes_ret = in_nbytes;                            \
 217                 return 0;                                               \
 218         }                                                               \
 219                                                                         \
 220         ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,          \
 221                                                 &out_nbytes);           \
 222         if (ret)                                                        \
 223                 return ret;                                             \
 224                                                                         \
 225         out = MALLOC(out_nbytes + sizeof(chartype2));                   \
 226         if (!out)                                                       \
 227                 return WIMLIB_ERR_NOMEM;                                \
 228                                                                         \
 229         ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);       \
 230         if (ret) {                                                      \
 231                 FREE(out);                                              \
 232         } else {                                                        \
 233                 *out_ret = out;                                         \
 234                 *out_nbytes_ret = out_nbytes;                           \
 235         }                                                               \
 236         return ret;                                                     \
 237 }
 238
 239 #if !TCHAR_IS_UTF16LE
 240
 241 /* UNIX */
 242
 243 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", tchar,
 244                                  utf16le, "UTF-16LE", utf16lechar,
 245                                  false,
 246                                  ,
 247                                  in_nbytes * 2,
 248                                  WIMLIB_ERR_INVALID_UTF8_STRING,
 249                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
 250                                                   "to UTF-16LE string!"),
 251                                  static)
 252
 253 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
 254                                  utf8, "UTF-8", tchar,
 255                                  false,
 256                                  ,
 257                                  in_nbytes * 2,
 258                                  WIMLIB_ERR_INVALID_UTF16_STRING,
 259                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE string "
 260                                                   "to UTF-8 string!"),
 261                                  static)
 262
 263 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
 264                                  utf16le, "UTF-16LE", utf16lechar,
 265                                  true,
 266                                  return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret),
 267                                  in_nbytes * 2,
 268                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
 269                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
 270                                                   "string \"%"TS"\" to UTF-16LE string!", in);
 271                                  ERROR("If the data you provided was UTF-8, please make sure "
 272                                        "the character encoding\n"
 273                                        "        of your current locale is UTF-8."),
 274                                  )
 275
 276 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,
 277                                  tstr, "", tchar,
 278                                  true,
 279                                  return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret),
 280                                  in_nbytes * 2,
 281                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
 282                                  ERROR("Failed to convert UTF-16LE string to "
 283                                        "multibyte string!");
 284                                  ERROR("This may be because the UTF-16LE string "
 285                                        "could not be represented\n"
 286                                        "        in your locale's character encoding."),
 287                                  )
 288 #endif
 289
 290 /* tchar to UTF-8 and back */
 291 #if TCHAR_IS_UTF16LE
 292
 293 /* Windows */
 294 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "UTF-16LE", tchar,
 295                                  utf8, "UTF-8", char,
 296                                  false,
 297                                  ,
 298                                  in_nbytes * 2,
 299                                  WIMLIB_ERR_INVALID_UTF16_STRING,
 300                                  ERROR_WITH_ERRNO("Failed to convert UTF-16LE "
 301                                                   "string \"%"TS"\" to UTF-8 string!", in),
 302                                  )
 303
 304 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
 305                                  tstr, "UTF-16LE", tchar,
 306                                  false,
 307                                  ,
 308                                  in_nbytes * 2,
 309                                  WIMLIB_ERR_INVALID_UTF8_STRING,
 310                                  ERROR_WITH_ERRNO("Failed to convert UTF-8 string "
 311                                                   "to UTF-16LE string!"),
 312                                  )
 313 #else
 314
 315 /* UNIX */
 316
 317 DEFINE_CHAR_CONVERSION_FUNCTIONS(tstr, "", tchar,
 318                                  utf8, "UTF-8", char,
 319                                  true,
 320                                  ,
 321                                  in_nbytes * 4,
 322                                  WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
 323                                  ERROR_WITH_ERRNO("Failed to convert multibyte "
 324                                                   "string \"%"TS"\" to UTF-8 string!", in);
 325                                  ERROR("If the data you provided was UTF-8, please make sure "
 326                                        "the character\n"
 327                                        "        encoding of your current locale is UTF-8."),
 328                                  )
 329
 330 DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", char,
 331                                  tstr, "", tchar,
 332                                  true,
 333                                  ,
 334                                  in_nbytes * 4,
 335                                  WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
 336                                  ERROR("Failed to convert UTF-8 string to "
 337                                        "multibyte string!");
 338                                  ERROR("This may be because the UTF-8 data "
 339                                        "could not be represented\n"
 340                                        "        in your locale's character encoding."),
 341                                  )
 342 #endif
 343
 344 int
 345 tstr_to_utf8_simple(const tchar *tstr, char **out)
 346 {
 347         size_t out_nbytes;
 348         return tstr_to_utf8(tstr, tstrlen(tstr) * sizeof(tchar),
 349                             out, &out_nbytes);
 350 }
 351
 352 int
 353 utf8_to_tstr_simple(const char *utf8str, tchar **out)
 354 {
 355         size_t out_nbytes;
 356         return utf8_to_tstr(utf8str, strlen(utf8str), out, &out_nbytes);
 357 }
 358
 359 static void
 360 iconv_init(struct iconv_list_head *head)
 361 {
 362         pthread_mutex_init(&head->mutex, NULL);
 363         INIT_LIST_HEAD(&head->list);
 364 }
 365
 366 static void
 367 iconv_cleanup(struct iconv_list_head *head)
 368 {
 369         pthread_mutex_destroy(&head->mutex);
 370         while (!list_empty(&head->list)) {
 371                 struct iconv_node *i;
 372
 373                 i = container_of(head->list.next, struct iconv_node, list);
 374                 list_del(&i->list);
 375                 iconv_close(i->cd);
 376                 FREE(i);
 377         }
 378 }
 379
 380 void
 381 iconv_global_init(void)
 382 {
 383         iconv_init(&iconv_utf8_to_tstr);
 384         iconv_init(&iconv_tstr_to_utf8);
 385 #if !TCHAR_IS_UTF16LE
 386         iconv_init(&iconv_utf16le_to_tstr);
 387         iconv_init(&iconv_tstr_to_utf16le);
 388         iconv_init(&iconv_utf16le_to_utf8);
 389         iconv_init(&iconv_utf8_to_utf16le);
 390 #endif
 391 }
 392
 393 void
 394 iconv_global_cleanup(void)
 395 {
 396         iconv_cleanup(&iconv_utf8_to_tstr);
 397         iconv_cleanup(&iconv_tstr_to_utf8);
 398 #if !TCHAR_IS_UTF16LE
 399         iconv_cleanup(&iconv_utf16le_to_tstr);
 400         iconv_cleanup(&iconv_tstr_to_utf16le);
 401         iconv_cleanup(&iconv_utf16le_to_utf8);
 402         iconv_cleanup(&iconv_utf8_to_utf16le);
 403 #endif
 404 }
 405
 406 /* A table that maps from UCS-2 characters to their upper case equivalents.
 407  * Index and array values are both CPU endian.
 408  * Note: this is only an *approximation* of real UTF-16 case folding.
 409  */
 410 u16 upcase[65536];
 411
 412 void
 413 init_upcase(void)
 414 {
 415         /* This is the table used in NTFS volumes formatted by Windows 10.
 416          * It was compressed by tools/compress_upcase_table.c.  */
 417         static const u16 upcase_compressed[] = {
 418                 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
 419                 0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
 420                 0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
 421                 0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
 422                 0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
 423                 0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
 424                 0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
 425                 0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
 426                 0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
 427                 0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
 428                 0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
 429                 0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
 430                 0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
 431                 0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
 432                 0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
 433                 0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
 434                 0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
 435                 0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
 436                 0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
 437                 0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
 438                 0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
 439                 0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
 440                 0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
 441                 0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
 442                 0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
 443                 0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
 444                 0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
 445                 0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
 446                 0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
 447                 0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
 448                 0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
 449                 0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
 450                 0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
 451                 0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
 452                 0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
 453                 0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
 454                 0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
 455                 0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
 456                 0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
 457                 0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
 458                 0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
 459                 0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
 460                 0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
 461                 0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
 462                 0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
 463                 0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
 464                 0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
 465                 0x0040, 0x0000,
 466         };
 467
 468         /* Simple LZ decoder  */
 469         const u16 *in_next = upcase_compressed;
 470         for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
 471                 u16 length = *in_next++;
 472                 u16 src_pos = *in_next++;
 473                 if (length == 0) {
 474                         /* Literal */
 475                         upcase[i++] = src_pos;
 476                 } else {
 477                         /* Match */
 478                         do {
 479                                 upcase[i++] = upcase[src_pos++];
 480                         } while (--length);
 481                 }
 482         }
 483
 484         /* Delta filter  */
 485         for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
 486                 upcase[i] += i;
 487
 488 #if 0
 489         /* Sanity checks  */
 490         wimlib_assert(upcase['a'] == 'A');
 491         wimlib_assert(upcase['A'] == 'A');
 492         wimlib_assert(upcase['z'] == 'Z');
 493         wimlib_assert(upcase['Z'] == 'Z');
 494         wimlib_assert(upcase['1'] == '1');
 495         wimlib_assert(upcase[0x00e9] == 0x00c9); /* Latin letter e, with acute accent  */
 496         wimlib_assert(upcase[0x00c9] == 0x00c9);
 497         wimlib_assert(upcase[0x03c1] == 0x03a1); /* Greek letter rho  */
 498         wimlib_assert(upcase[0x03a1] == 0x03a1);
 499         wimlib_assert(upcase[0x0436] == 0x0416); /* Cyrillic letter zhe  */
 500         wimlib_assert(upcase[0x0416] == 0x0416);
 501         wimlib_assert(upcase[0x0567] == 0x0537); /* Armenian letter eh  */
 502         wimlib_assert(upcase[0x0537] == 0x0537);
 503         wimlib_assert(upcase[0x24d0] == 0x24b6); /* Circled Latin letter A
 504                                                     (is that a real character???)  */
 505         wimlib_assert(upcase[0x24b6] == 0x24b6);
 506         wimlib_assert(upcase[0x2603] == 0x2603); /* Note to self: Upper case
 507                                                     snowman symbol does not
 508                                                     exist.  */
 509 #endif
 510 }
 511
 512 /* Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
 513  * case-insensitively (%ignore_case == true).
 514  *
 515  * This is implemented using the default upper-case table used by NTFS.  It does
 516  * not handle all possible cases allowed by UTF-16LE.  For example, different
 517  * normalizations of the same sequence of "characters" are not considered equal.
 518  * It hopefully does the right thing most of the time though.  */
 519 int
 520 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
 521                     const utf16lechar *s2, size_t n2,
 522                     bool ignore_case)
 523 {
 524         size_t n = min(n1, n2);
 525
 526         if (ignore_case) {
 527                 for (size_t i = 0; i < n; i++) {
 528                         u16 c1 = upcase[le16_to_cpu(s1[i])];
 529                         u16 c2 = upcase[le16_to_cpu(s2[i])];
 530                         if (c1 != c2)
 531                                 return (c1 < c2) ? -1 : 1;
 532                 }
 533         } else {
 534                 for (size_t i = 0; i < n; i++) {
 535                         u16 c1 = le16_to_cpu(s1[i]);
 536                         u16 c2 = le16_to_cpu(s2[i]);
 537                         if (c1 != c2)
 538                                 return (c1 < c2) ? -1 : 1;
 539                 }
 540         }
 541         if (n1 == n2)
 542                 return 0;
 543         return (n1 < n2) ? -1 : 1;
 544 }
 545
 546 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated.  */
 547 int
 548 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
 549                       bool ignore_case)
 550 {
 551         if (ignore_case) {
 552                 for (;;) {
 553                         u16 c1 = upcase[le16_to_cpu(*s1)];
 554                         u16 c2 = upcase[le16_to_cpu(*s2)];
 555                         if (c1 != c2)
 556                                 return (c1 < c2) ? -1 : 1;
 557                         if (c1 == 0)
 558                                 return 0;
 559                         s1++, s2++;
 560                 }
 561         } else {
 562                 while (*s1 && *s1 == *s2)
 563                         s1++, s2++;
 564                 if (*s1 == *s2)
 565                         return 0;
 566                 return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
 567         }
 568 }
 569
 570 /* Duplicate a UTF-16LE string.  The input string might not be null terminated
 571  * and might be misaligned, but the returned string is guaranteed to be null
 572  * terminated and properly aligned.  */
 573 utf16lechar *
 574 utf16le_dupz(const void *ustr, size_t usize)
 575 {
 576         utf16lechar *dup = MALLOC(usize + sizeof(utf16lechar));
 577         if (dup) {
 578                 memcpy(dup, ustr, usize);
 579                 dup[usize / sizeof(utf16lechar)] = 0;
 580         }
 581         return dup;
 582 }
 583
 584 /* Duplicate a null-terminated UTF-16LE string.  */
 585 utf16lechar *
 586 utf16le_dup(const utf16lechar *ustr)
 587 {
 588         const utf16lechar *p = ustr;
 589         while (*p++)
 590                 ;
 591         return memdup(ustr, (const u8 *)p - (const u8 *)ustr);
 592 }
 593
 594 /* Return the length, in bytes, of a UTF-null terminated UTF-16 string,
 595  * excluding the null terminator.  */
 596 size_t
 597 utf16le_len_bytes(const utf16lechar *s)
 598 {
 599         const utf16lechar *p = s;
 600         while (*p)
 601                 p++;
 602         return (p - s) * sizeof(utf16lechar);
 603 }
 604
 605 /* Return the length, in UTF-16 coding units, of a UTF-null terminated UTF-16
 606  * string, excluding the null terminator.  */
 607 size_t
 608 utf16le_len_chars(const utf16lechar *s)
 609 {
 610         return utf16le_len_bytes(s) / sizeof(utf16lechar);
 611 }