wimlib.net Git - wimlib/blob - src/encoding.c

   1 /*
   2  * encoding.c - UTF-8 and UTF-16LE codecs and utility functions
   3  *
   4  * Copyright 2012-2023 Eric Biggers
   5  *
   6  * This file is free software; you can redistribute it and/or modify it under
   7  * the terms of the GNU Lesser General Public License as published by the Free
   8  * Software Foundation; either version 3 of the License, or (at your option) any
   9  * later version.
  10  *
  11  * This file is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  13  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  14  * details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public License
  17  * along with this file; if not, see https://www.gnu.org/licenses/.
  18  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 #  include "config.h"
  22 #endif
  23
  24 #include <errno.h>
  25 #include <string.h>
  26
  27 #include "wimlib/encoding.h"
  28 #include "wimlib/endianness.h"
  29 #include "wimlib/error.h"
  30 #include "wimlib/unaligned.h"
  31 #include "wimlib/util.h"
  32
  33 /*
  34  * Allow unpaired surrogates, such as might exist in Windows-style filenames ---
  35  * which are normally valid UTF-16LE, but are actually treated as opaque
  36  * sequences of 16-bit WCHARs by Windows.  When decoding "UTF-16LE", unpaired
  37  * surrogates will be decoded as their surrogate codepoints; and when encoding
  38  * to and from "UTF-8", the encoding will actually be WTF-8 ("Wobbly
  39  * Transformation Format - 8-bit"), a superset of UTF-8 which permits the
  40  * surrogate codepoints.
  41  *
  42  * In combination with also allowing the "non-character" codepoints U+FFFE and
  43  * U+FFFF, the result is that every Windows-style filename can be translated to
  44  * a UNIX-style filename.
  45  *
  46  * Unfortunately, the converse is not true: not every UNIX filename can be
  47  * translated to a Windows filename.  Only UNIX filenames that are valid "WTF-8"
  48  * can be translated.  I considered ways to define a bijective mapping, but
  49  * there did not seem to be a straightforward way.  The "UTF-8b" scheme, for
  50  * example, would map each invalid byte 'b' to a surrogate "escape code" 'U+DC00
  51  * + b'.  The problem with this was that surrogate escape codes can be combined
  52  * to create a valid UTF-8 sequence, thus breaking the bijection by mapping
  53  * multiple Windows filenames to a single UNIX filename.
  54  */
  55 #define ALLOW_UNPAIRED_SURROGATES       1
  56
  57 #define INVALID_CODEPOINT       0xFFFFFFFF
  58 #define VALIDATE(expr)          if (validate && unlikely(!(expr))) goto invalid
  59 #define IS_SURROGATE(c)         ((c) >= 0xD800 && (c) < 0xE000)
  60 #define IS_HIGH_SURROGATE(c)    ((c) >= 0xD800 && (c) < 0xDC00)
  61 #define IS_LOW_SURROGATE(c)     ((c) >= 0xDC00 && (c) < 0xE000)
  62 #define IS_UTF8_TAIL(c)         (((c) & 0xC0) == 0x80)
  63
  64 /*
  65  * Decode the next Unicode codepoint from the string at @in, which has
  66  * @remaining >= 1 bytes remaining.  Return the number of bytes consumed and
  67  * write the decoded codepoint to *c_ret.
  68  *
  69  * If the input might not be a valid string in the source encoding, then
  70  * @validate must be specified as %true, and then on invalid input the function
  71  * consumes at least one byte and sets *c_ret to INVALID_CODEPOINT.  If the
  72  * input is guaranteed to be valid, then @validate may be specified as %false.
  73  */
  74 typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining,
  75                                         bool validate, u32 *c_ret);
  76
  77 /* Encode the Unicode codepoint @c and return the number of bytes used. */
  78 typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out);
  79
  80 static forceinline unsigned
  81 utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret)
  82 {
  83         if (likely(in[0] < 0x80)) { /* U+0...U+7F */
  84                 *c_ret = in[0];
  85                 return 1;
  86         }
  87
  88         if (in[0] < 0xE0) { /* U+80...U+7FF */
  89                 VALIDATE(in[0] >= 0xC2 && remaining >= 2 &&
  90                          IS_UTF8_TAIL(in[1]));
  91                 *c_ret = ((u32)(in[0] & 0x1F) << 6) |
  92                          ((u32)(in[1] & 0x3F) << 0);
  93                 return 2;
  94         }
  95
  96         if (in[0] < 0xF0) { /* U+800...U+FFFF, possibly excluding surrogates */
  97                 VALIDATE(remaining >= 3 &&
  98                          IS_UTF8_TAIL(in[1]) &&
  99                          IS_UTF8_TAIL(in[2]));
 100                 *c_ret = ((u32)(in[0] & 0x0F) << 12) |
 101                          ((u32)(in[1] & 0x3F) << 6) |
 102                          ((u32)(in[2] & 0x3F) << 0);
 103                 VALIDATE(*c_ret >= 0x800);
 104         #if !ALLOW_UNPAIRED_SURROGATES
 105                 VALIDATE(!IS_SURROGATE(*c_ret));
 106         #endif
 107                 return 3;
 108         }
 109
 110         /* U+10000...U+10FFFF */
 111         VALIDATE(in[0] < 0xF8 && remaining >= 4 &&
 112                  IS_UTF8_TAIL(in[1]) &&
 113                  IS_UTF8_TAIL(in[2]) &&
 114                  IS_UTF8_TAIL(in[3]));
 115         *c_ret = ((u32)(in[0] & 0x07) << 18) |
 116                  ((u32)(in[1] & 0x3F) << 12) |
 117                  ((u32)(in[2] & 0x3F) << 6) |
 118                  ((u32)(in[3] & 0x3F) << 0);
 119         VALIDATE(*c_ret >= 0x10000 && *c_ret <= 0x10FFFF);
 120         return 4;
 121
 122 invalid:
 123         *c_ret = INVALID_CODEPOINT;
 124         return 1;
 125 }
 126
 127 static forceinline unsigned
 128 utf8_encode_codepoint(u32 c, u8 *out)
 129 {
 130         if (likely(c < 0x80)) {
 131                 out[0] = c;
 132                 return 1;
 133         }
 134
 135         if (c < 0x800) {
 136                 out[0] = 0xC0 | (c >> 6);
 137                 out[1] = 0x80 | (c & 0x3F);
 138                 return 2;
 139         }
 140
 141         if (c < 0x10000) {
 142                 out[0] = 0xE0 | (c >> 12);
 143                 out[1] = 0x80 | ((c >> 6) & 0x3F);
 144                 out[2] = 0x80 | (c & 0x3F);
 145                 return 3;
 146         }
 147
 148         out[0] = 0xF0 | (c >> 18);
 149         out[1] = 0x80 | ((c >> 12) & 0x3F);
 150         out[2] = 0x80 | ((c >> 6) & 0x3F);
 151         out[3] = 0x80 | (c & 0x3F);
 152         return 4;
 153 }
 154
 155 static forceinline unsigned
 156 utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate,
 157                          u32 *c_ret)
 158 {
 159         u32 h, l;
 160
 161         VALIDATE(remaining >= 2);
 162         h = get_unaligned_le16(in);
 163         if (unlikely(IS_SURROGATE(h))) {
 164                 /* Surrogate pairs are U+10000...U+10FFFF.
 165                  * Unpaired surrogates are U+D800...U+DFFF. */
 166         #if ALLOW_UNPAIRED_SURROGATES
 167                 if (unlikely(!IS_HIGH_SURROGATE(h) || remaining < 4))
 168                         goto unpaired;
 169                 l = get_unaligned_le16(in + 2);
 170                 if (unlikely(!IS_LOW_SURROGATE(l)))
 171                         goto unpaired;
 172         #else
 173                 VALIDATE(IS_HIGH_SURROGATE(h) && remaining >= 4);
 174                 l = get_unaligned_le16(in + 2);
 175                 VALIDATE(IS_LOW_SURROGATE(l));
 176         #endif
 177                 *c_ret = 0x10000 + ((h - 0xD800) << 10) + (l - 0xDC00);
 178                 return 4;
 179         }
 180 #if ALLOW_UNPAIRED_SURROGATES
 181 unpaired:
 182 #endif
 183         *c_ret = h;
 184         return 2;
 185
 186 invalid:
 187         *c_ret = INVALID_CODEPOINT;
 188         return min(remaining, 2);
 189 }
 190
 191 static forceinline unsigned
 192 utf16le_encode_codepoint(u32 c, u8 *out)
 193 {
 194         if (likely(c < 0x10000)) {
 195                 put_unaligned_le16(c, out);
 196                 return 2;
 197         }
 198         c -= 0x10000;
 199         put_unaligned_le16(0xD800 + (c >> 10), out);
 200         put_unaligned_le16(0xDC00 + (c & 0x3FF), out + 2);
 201         return 4;
 202 }
 203
 204 /*
 205  * Convert the string @in of size @in_nbytes from the encoding given by the
 206  * @decode_codepoint function to the encoding given by the @encode_codepoint
 207  * function.  @in does not need to be null-terminated, but a null terminator
 208  * will be added to the output string.
 209  *
 210  * On success, write the allocated output string to @out_ret (must not be NULL)
 211  * and its size excluding the null terminator to @out_nbytes_ret (may be NULL).
 212  *
 213  * If the input string is malformed, return @ilseq_err with errno set to EILSEQ.
 214  * If out of memory, return WIMLIB_ERR_NOMEM with errno set to ENOMEM.
 215  */
 216 static forceinline int
 217 convert_string(const u8 * const in, const size_t in_nbytes,
 218                u8 **out_ret, size_t *out_nbytes_ret,
 219                int ilseq_err,
 220                decode_codepoint_fn decode_codepoint,
 221                encode_codepoint_fn encode_codepoint)
 222 {
 223         size_t i;
 224         u8 *p_out;
 225         size_t out_nbytes = 0;
 226         u8 *out;
 227         u8 tmp[8]; /* assuming no codepoint requires > 8 bytes to encode */
 228         u32 c;
 229
 230         /* Validate the input string and compute the output size. */
 231         for (i = 0; i < in_nbytes; ) {
 232                 i += (*decode_codepoint)(&in[i], in_nbytes - i, true, &c);
 233                 if (unlikely(c == INVALID_CODEPOINT)) {
 234                         errno = EILSEQ;
 235                         return ilseq_err;
 236                 }
 237                 out_nbytes += (*encode_codepoint)(c, tmp);
 238         }
 239
 240         /* Allocate the output string, including space for a null terminator. */
 241         out = MALLOC(out_nbytes + (*encode_codepoint)(0, tmp));
 242         if (unlikely(!out))
 243                 return WIMLIB_ERR_NOMEM;
 244
 245         /* Do the conversion. */
 246         p_out = out;
 247         for (i = 0; i < in_nbytes; ) {
 248                 i += (*decode_codepoint)(&in[i], in_nbytes - i, false, &c);
 249                 p_out += (*encode_codepoint)(c, p_out);
 250         }
 251
 252         /* Add a null terminator. */
 253         (*encode_codepoint)(0, p_out);
 254
 255         /* Return the output string and its size (by reference). */
 256         *out_ret = out;
 257         if (out_nbytes_ret)
 258                 *out_nbytes_ret = out_nbytes;
 259         return 0;
 260 }
 261
 262 int
 263 utf8_to_utf16le(const char *in, size_t in_nbytes,
 264                 utf16lechar **out_ret, size_t *out_nbytes_ret)
 265 {
 266         return convert_string((const u8 *)in, in_nbytes,
 267                               (u8 **)out_ret, out_nbytes_ret,
 268                               WIMLIB_ERR_INVALID_UTF8_STRING,
 269                               utf8_decode_codepoint, utf16le_encode_codepoint);
 270 }
 271
 272 int
 273 utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
 274                 char **out_ret, size_t *out_nbytes_ret)
 275 {
 276         return convert_string((const u8 *)in, in_nbytes,
 277                               (u8 **)out_ret, out_nbytes_ret,
 278                               WIMLIB_ERR_INVALID_UTF16_STRING,
 279                               utf16le_decode_codepoint, utf8_encode_codepoint);
 280 }
 281
 282 /*
 283  * A table that maps from UCS-2 characters to their upper case equivalents.
 284  * Index and array values are both CPU endian.
 285  * Note: this is only an *approximation* of real UTF-16 case folding.
 286  */
 287 u16 upcase[65536];
 288
 289 void
 290 init_upcase(void)
 291 {
 292         /* This is the table used in NTFS volumes formatted by Windows 10.
 293          * It was compressed by tools/compress_upcase_table.c.  */
 294         static const u16 upcase_compressed[] = {
 295                 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
 296                 0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
 297                 0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
 298                 0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
 299                 0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
 300                 0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
 301                 0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
 302                 0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
 303                 0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
 304                 0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
 305                 0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
 306                 0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
 307                 0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
 308                 0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
 309                 0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
 310                 0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
 311                 0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
 312                 0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
 313                 0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
 314                 0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
 315                 0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
 316                 0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
 317                 0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
 318                 0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
 319                 0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
 320                 0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
 321                 0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
 322                 0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
 323                 0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
 324                 0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
 325                 0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
 326                 0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
 327                 0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
 328                 0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
 329                 0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
 330                 0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
 331                 0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
 332                 0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
 333                 0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
 334                 0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
 335                 0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
 336                 0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
 337                 0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
 338                 0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
 339                 0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
 340                 0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
 341                 0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
 342                 0x0040, 0x0000,
 343         };
 344
 345         /* Simple LZ decoder  */
 346         const u16 *in_next = upcase_compressed;
 347         for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
 348                 u16 length = *in_next++;
 349                 u16 src_pos = *in_next++;
 350                 if (length == 0) {
 351                         /* Literal */
 352                         upcase[i++] = src_pos;
 353                 } else {
 354                         /* Match */
 355                         do {
 356                                 upcase[i++] = upcase[src_pos++];
 357                         } while (--length);
 358                 }
 359         }
 360
 361         /* Delta filter  */
 362         for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
 363                 upcase[i] += i;
 364 }
 365
 366 /*
 367  * Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
 368  * case-insensitively (%ignore_case == true).
 369  *
 370  * This is implemented using the default upper-case table used by NTFS.  It does
 371  * not handle all possible cases allowed by UTF-16LE.  For example, different
 372  * normalizations of the same sequence of "characters" are not considered equal.
 373  * It hopefully does the right thing most of the time though.
 374  */
 375 int
 376 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
 377                     const utf16lechar *s2, size_t n2,
 378                     bool ignore_case)
 379 {
 380         size_t n = min(n1, n2);
 381
 382         if (ignore_case) {
 383                 for (size_t i = 0; i < n; i++) {
 384                         u16 c1 = upcase[le16_to_cpu(s1[i])];
 385                         u16 c2 = upcase[le16_to_cpu(s2[i])];
 386                         if (c1 != c2)
 387                                 return (c1 < c2) ? -1 : 1;
 388                 }
 389         } else {
 390                 for (size_t i = 0; i < n; i++) {
 391                         u16 c1 = le16_to_cpu(s1[i]);
 392                         u16 c2 = le16_to_cpu(s2[i]);
 393                         if (c1 != c2)
 394                                 return (c1 < c2) ? -1 : 1;
 395                 }
 396         }
 397         if (n1 == n2)
 398                 return 0;
 399         return (n1 < n2) ? -1 : 1;
 400 }
 401
 402 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated.  */
 403 int
 404 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
 405                       bool ignore_case)
 406 {
 407         if (ignore_case) {
 408                 for (;;) {
 409                         u16 c1 = upcase[le16_to_cpu(*s1)];
 410                         u16 c2 = upcase[le16_to_cpu(*s2)];
 411                         if (c1 != c2)
 412                                 return (c1 < c2) ? -1 : 1;
 413                         if (c1 == 0)
 414                                 return 0;
 415                         s1++, s2++;
 416                 }
 417         } else {
 418                 while (*s1 && *s1 == *s2)
 419                         s1++, s2++;
 420                 if (*s1 == *s2)
 421                         return 0;
 422                 return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
 423         }
 424 }
 425
 426 /* Duplicate a UTF-16 string.  The input string might not be null terminated and
 427  * might be misaligned, but the returned string is guaranteed to be null
 428  * terminated and properly aligned.  */
 429 utf16lechar *
 430 utf16le_dupz(const void *s, size_t size)
 431 {
 432         utf16lechar *dup = MALLOC(size + sizeof(utf16lechar));
 433         if (dup) {
 434                 memcpy(dup, s, size);
 435                 dup[size / sizeof(utf16lechar)] = 0;
 436         }
 437         return dup;
 438 }
 439
 440 /* Duplicate a null-terminated UTF-16 string.  */
 441 utf16lechar *
 442 utf16le_dup(const utf16lechar *s)
 443 {
 444         return memdup(s, utf16le_len_bytes(s) + sizeof(utf16lechar));
 445 }
 446
 447 /* Return the length, in bytes, of a null terminated UTF-16 string, excluding
 448  * the null terminator.  */
 449 size_t
 450 utf16le_len_bytes(const utf16lechar *s)
 451 {
 452         const utf16lechar *p = s;
 453         while (*p)
 454                 p++;
 455         return (p - s) * sizeof(utf16lechar);
 456 }
 457
 458 /* Return the length, in UTF-16 coding units, of a null terminated UTF-16
 459  * string, excluding the null terminator.  */
 460 size_t
 461 utf16le_len_chars(const utf16lechar *s)
 462 {
 463         return utf16le_len_bytes(s) / sizeof(utf16lechar);
 464 }
 465
 466 #ifdef ENABLE_TEST_SUPPORT
 467
 468 #include "wimlib/test_support.h"
 469
 470 WIMLIBAPI int
 471 wimlib_utf8_to_utf16le(const char *in, size_t in_nbytes,
 472                        utf16lechar **out_ret, size_t *out_nbytes_ret)
 473 {
 474         return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret);
 475 }
 476
 477 WIMLIBAPI int
 478 wimlib_utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
 479                        char **out_ret, size_t *out_nbytes_ret)
 480 {
 481         return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret);
 482 }
 483 #endif /* ENABLE_TEST_SUPPORT */