2 * encoding.c - UTF-8 and UTF-16LE codecs and utility functions
4 * Copyright 2012-2023 Eric Biggers
6 * This file is free software; you can redistribute it and/or modify it under
7 * the terms of the GNU Lesser General Public License as published by the Free
8 * Software Foundation; either version 3 of the License, or (at your option) any
11 * This file is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this file; if not, see https://www.gnu.org/licenses/.
27 #include "wimlib/encoding.h"
28 #include "wimlib/endianness.h"
29 #include "wimlib/error.h"
30 #include "wimlib/unaligned.h"
31 #include "wimlib/util.h"
34 * Allow unpaired surrogates, such as might exist in Windows-style filenames ---
35 * which are normally valid UTF-16LE, but are actually treated as opaque
36 * sequences of 16-bit WCHARs by Windows. When decoding "UTF-16LE", unpaired
37 * surrogates will be decoded as their surrogate codepoints; and when encoding
38 * to and from "UTF-8", the encoding will actually be WTF-8 ("Wobbly
39 * Transformation Format - 8-bit"), a superset of UTF-8 which permits the
40 * surrogate codepoints.
42 * In combination with also allowing the "non-character" codepoints U+FFFE and
43 * U+FFFF, the result is that every Windows-style filename can be translated to
44 * a UNIX-style filename.
46 * Unfortunately, the converse is not true: not every UNIX filename can be
47 * translated to a Windows filename. Only UNIX filenames that are valid "WTF-8"
48 * can be translated. I considered ways to define a bijective mapping, but
49 * there did not seem to be a straightforward way. The "UTF-8b" scheme, for
50 * example, would map each invalid byte 'b' to a surrogate "escape code" 'U+DC00
51 * + b'. The problem with this was that surrogate escape codes can be combined
52 * to create a valid UTF-8 sequence, thus breaking the bijection by mapping
53 * multiple Windows filenames to a single UNIX filename.
55 #define ALLOW_UNPAIRED_SURROGATES 1
57 #define INVALID_CODEPOINT 0xFFFFFFFF
58 #define VALIDATE(expr) if (validate && unlikely(!(expr))) goto invalid
59 #define IS_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xE000)
60 #define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xDC00)
61 #define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) < 0xE000)
62 #define IS_UTF8_TAIL(c) (((c) & 0xC0) == 0x80)
65 * Decode the next Unicode codepoint from the string at @in, which has
66 * @remaining >= 1 bytes remaining. Return the number of bytes consumed and
67 * write the decoded codepoint to *c_ret.
69 * If the input might not be a valid string in the source encoding, then
70 * @validate must be specified as %true, and then on invalid input the function
71 * consumes at least one byte and sets *c_ret to INVALID_CODEPOINT. If the
72 * input is guaranteed to be valid, then @validate may be specified as %false.
74 typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining,
75 bool validate, u32 *c_ret);
77 /* Encode the Unicode codepoint @c and return the number of bytes used. */
78 typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out);
80 static forceinline unsigned
81 utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret)
83 if (likely(in[0] < 0x80)) { /* U+0...U+7F */
88 if (in[0] < 0xE0) { /* U+80...U+7FF */
89 VALIDATE(in[0] >= 0xC2 && remaining >= 2 &&
91 *c_ret = ((u32)(in[0] & 0x1F) << 6) |
92 ((u32)(in[1] & 0x3F) << 0);
96 if (in[0] < 0xF0) { /* U+800...U+FFFF, possibly excluding surrogates */
97 VALIDATE(remaining >= 3 &&
98 IS_UTF8_TAIL(in[1]) &&
100 *c_ret = ((u32)(in[0] & 0x0F) << 12) |
101 ((u32)(in[1] & 0x3F) << 6) |
102 ((u32)(in[2] & 0x3F) << 0);
103 VALIDATE(*c_ret >= 0x800);
104 #if !ALLOW_UNPAIRED_SURROGATES
105 VALIDATE(!IS_SURROGATE(*c_ret));
110 /* U+10000...U+10FFFF */
111 VALIDATE(in[0] < 0xF8 && remaining >= 4 &&
112 IS_UTF8_TAIL(in[1]) &&
113 IS_UTF8_TAIL(in[2]) &&
114 IS_UTF8_TAIL(in[3]));
115 *c_ret = ((u32)(in[0] & 0x07) << 18) |
116 ((u32)(in[1] & 0x3F) << 12) |
117 ((u32)(in[2] & 0x3F) << 6) |
118 ((u32)(in[3] & 0x3F) << 0);
119 VALIDATE(*c_ret >= 0x10000 && *c_ret <= 0x10FFFF);
123 *c_ret = INVALID_CODEPOINT;
127 static forceinline unsigned
128 utf8_encode_codepoint(u32 c, u8 *out)
130 if (likely(c < 0x80)) {
136 out[0] = 0xC0 | (c >> 6);
137 out[1] = 0x80 | (c & 0x3F);
142 out[0] = 0xE0 | (c >> 12);
143 out[1] = 0x80 | ((c >> 6) & 0x3F);
144 out[2] = 0x80 | (c & 0x3F);
148 out[0] = 0xF0 | (c >> 18);
149 out[1] = 0x80 | ((c >> 12) & 0x3F);
150 out[2] = 0x80 | ((c >> 6) & 0x3F);
151 out[3] = 0x80 | (c & 0x3F);
155 static forceinline unsigned
156 utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate,
161 VALIDATE(remaining >= 2);
162 h = get_unaligned_le16(in);
163 if (unlikely(IS_SURROGATE(h))) {
164 /* Surrogate pairs are U+10000...U+10FFFF.
165 * Unpaired surrogates are U+D800...U+DFFF. */
166 #if ALLOW_UNPAIRED_SURROGATES
167 if (unlikely(!IS_HIGH_SURROGATE(h) || remaining < 4))
169 l = get_unaligned_le16(in + 2);
170 if (unlikely(!IS_LOW_SURROGATE(l)))
173 VALIDATE(IS_HIGH_SURROGATE(h) && remaining >= 4);
174 l = get_unaligned_le16(in + 2);
175 VALIDATE(IS_LOW_SURROGATE(l));
177 *c_ret = 0x10000 + ((h - 0xD800) << 10) + (l - 0xDC00);
180 #if ALLOW_UNPAIRED_SURROGATES
187 *c_ret = INVALID_CODEPOINT;
188 return min(remaining, 2);
191 static forceinline unsigned
192 utf16le_encode_codepoint(u32 c, u8 *out)
194 if (likely(c < 0x10000)) {
195 put_unaligned_le16(c, out);
199 put_unaligned_le16(0xD800 + (c >> 10), out);
200 put_unaligned_le16(0xDC00 + (c & 0x3FF), out + 2);
205 * Convert the string @in of size @in_nbytes from the encoding given by the
206 * @decode_codepoint function to the encoding given by the @encode_codepoint
207 * function. @in does not need to be null-terminated, but a null terminator
208 * will be added to the output string.
210 * On success, write the allocated output string to @out_ret (must not be NULL)
211 * and its size excluding the null terminator to @out_nbytes_ret (may be NULL).
213 * If the input string is malformed, return @ilseq_err with errno set to EILSEQ.
214 * If out of memory, return WIMLIB_ERR_NOMEM with errno set to ENOMEM.
216 static forceinline int
217 convert_string(const u8 * const in, const size_t in_nbytes,
218 u8 **out_ret, size_t *out_nbytes_ret,
220 decode_codepoint_fn decode_codepoint,
221 encode_codepoint_fn encode_codepoint)
225 size_t out_nbytes = 0;
227 u8 tmp[8]; /* assuming no codepoint requires > 8 bytes to encode */
230 /* Validate the input string and compute the output size. */
231 for (i = 0; i < in_nbytes; ) {
232 i += (*decode_codepoint)(&in[i], in_nbytes - i, true, &c);
233 if (unlikely(c == INVALID_CODEPOINT)) {
237 out_nbytes += (*encode_codepoint)(c, tmp);
240 /* Allocate the output string, including space for a null terminator. */
241 out = MALLOC(out_nbytes + (*encode_codepoint)(0, tmp));
243 return WIMLIB_ERR_NOMEM;
245 /* Do the conversion. */
247 for (i = 0; i < in_nbytes; ) {
248 i += (*decode_codepoint)(&in[i], in_nbytes - i, false, &c);
249 p_out += (*encode_codepoint)(c, p_out);
252 /* Add a null terminator. */
253 (*encode_codepoint)(0, p_out);
255 /* Return the output string and its size (by reference). */
258 *out_nbytes_ret = out_nbytes;
263 utf8_to_utf16le(const char *in, size_t in_nbytes,
264 utf16lechar **out_ret, size_t *out_nbytes_ret)
266 return convert_string((const u8 *)in, in_nbytes,
267 (u8 **)out_ret, out_nbytes_ret,
268 WIMLIB_ERR_INVALID_UTF8_STRING,
269 utf8_decode_codepoint, utf16le_encode_codepoint);
273 utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
274 char **out_ret, size_t *out_nbytes_ret)
276 return convert_string((const u8 *)in, in_nbytes,
277 (u8 **)out_ret, out_nbytes_ret,
278 WIMLIB_ERR_INVALID_UTF16_STRING,
279 utf16le_decode_codepoint, utf8_encode_codepoint);
283 * A table that maps from UCS-2 characters to their upper case equivalents.
284 * Index and array values are both CPU endian.
285 * Note: this is only an *approximation* of real UTF-16 case folding.
292 /* This is the table used in NTFS volumes formatted by Windows 10.
293 * It was compressed by tools/compress_upcase_table.c. */
294 static const u16 upcase_compressed[] = {
295 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
296 0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
297 0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
298 0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
299 0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
300 0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
301 0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
302 0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
303 0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
304 0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
305 0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
306 0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
307 0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
308 0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
309 0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
310 0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
311 0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
312 0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
313 0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
314 0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
315 0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
316 0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
317 0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
318 0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
319 0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
320 0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
321 0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
322 0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
323 0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
324 0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
325 0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
326 0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
327 0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
328 0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
329 0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
330 0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
331 0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
332 0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
333 0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
334 0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
335 0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
336 0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
337 0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
338 0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
339 0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
340 0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
341 0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
345 /* Simple LZ decoder */
346 const u16 *in_next = upcase_compressed;
347 for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
348 u16 length = *in_next++;
349 u16 src_pos = *in_next++;
352 upcase[i++] = src_pos;
356 upcase[i++] = upcase[src_pos++];
362 for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
367 * Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
368 * case-insensitively (%ignore_case == true).
370 * This is implemented using the default upper-case table used by NTFS. It does
371 * not handle all possible cases allowed by UTF-16LE. For example, different
372 * normalizations of the same sequence of "characters" are not considered equal.
373 * It hopefully does the right thing most of the time though.
376 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
377 const utf16lechar *s2, size_t n2,
380 size_t n = min(n1, n2);
383 for (size_t i = 0; i < n; i++) {
384 u16 c1 = upcase[le16_to_cpu(s1[i])];
385 u16 c2 = upcase[le16_to_cpu(s2[i])];
387 return (c1 < c2) ? -1 : 1;
390 for (size_t i = 0; i < n; i++) {
391 u16 c1 = le16_to_cpu(s1[i]);
392 u16 c2 = le16_to_cpu(s2[i]);
394 return (c1 < c2) ? -1 : 1;
399 return (n1 < n2) ? -1 : 1;
402 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated. */
404 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
409 u16 c1 = upcase[le16_to_cpu(*s1)];
410 u16 c2 = upcase[le16_to_cpu(*s2)];
412 return (c1 < c2) ? -1 : 1;
418 while (*s1 && *s1 == *s2)
422 return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
426 /* Duplicate a UTF-16 string. The input string might not be null terminated and
427 * might be misaligned, but the returned string is guaranteed to be null
428 * terminated and properly aligned. */
430 utf16le_dupz(const void *s, size_t size)
432 utf16lechar *dup = MALLOC(size + sizeof(utf16lechar));
434 memcpy(dup, s, size);
435 dup[size / sizeof(utf16lechar)] = 0;
440 /* Duplicate a null-terminated UTF-16 string. */
442 utf16le_dup(const utf16lechar *s)
444 return memdup(s, utf16le_len_bytes(s) + sizeof(utf16lechar));
447 /* Return the length, in bytes, of a null terminated UTF-16 string, excluding
448 * the null terminator. */
450 utf16le_len_bytes(const utf16lechar *s)
452 const utf16lechar *p = s;
455 return (p - s) * sizeof(utf16lechar);
458 /* Return the length, in UTF-16 coding units, of a null terminated UTF-16
459 * string, excluding the null terminator. */
461 utf16le_len_chars(const utf16lechar *s)
463 return utf16le_len_bytes(s) / sizeof(utf16lechar);
466 #ifdef ENABLE_TEST_SUPPORT
468 #include "wimlib/test_support.h"
471 wimlib_utf8_to_utf16le(const char *in, size_t in_nbytes,
472 utf16lechar **out_ret, size_t *out_nbytes_ret)
474 return utf8_to_utf16le(in, in_nbytes, out_ret, out_nbytes_ret);
478 wimlib_utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
479 char **out_ret, size_t *out_nbytes_ret)
481 return utf16le_to_utf8(in, in_nbytes, out_ret, out_nbytes_ret);
483 #endif /* ENABLE_TEST_SUPPORT */