2 * encoding.c - UTF-8 and UTF-16LE codecs and utility functions
4 * Copyright (C) 2012-2016 Eric Biggers
6 * This file is free software; you can redistribute it and/or modify it under
7 * the terms of the GNU Lesser General Public License as published by the Free
8 * Software Foundation; either version 3 of the License, or (at your option) any
11 * This file is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this file; if not, see http://www.gnu.org/licenses/.
27 #include "wimlib/encoding.h"
28 #include "wimlib/endianness.h"
29 #include "wimlib/error.h"
30 #include "wimlib/unaligned.h"
31 #include "wimlib/util.h"
34 * Allow unpaired surrogates, such as might exist in Windows-style filenames ---
35 * which are normally valid UTF-16LE, but are actually treated as opaque
36 * sequences of 16-bit WCHARs by Windows. When decoding "UTF-16LE", unpaired
37 * surrogates will be decoded as their surrogate codepoints; and when encoding
38 * to and from "UTF-8", the encoding will actually be WTF-8 ("Wobbly
39 * Transformation Format - 8-bit"), a superset of UTF-8 which permits the
40 * surrogate codepoints.
42 #define ALLOW_UNPAIRED_SURROGATES 1
44 #define INVALID_CODEPOINT 0xFFFFFFFF
45 #define VALIDATE(expr) if (validate && unlikely(!(expr))) goto invalid
46 #define IS_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xE000)
47 #define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) < 0xDC00)
48 #define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) < 0xE000)
49 #define IS_UTF8_TAIL(c) (((c) & 0xC0) == 0x80)
52 * Decode the next Unicode codepoint from the string at @in, which has
53 * @remaining >= 1 bytes remaining. Return the number of bytes consumed and
54 * write the decoded codepoint to *c_ret.
56 * If the input might not be a valid string in the source encoding, then
57 * @validate must be specified as %true, and then on invalid input the function
58 * consumes at least one byte and sets *c_ret to INVALID_CODEPOINT. If the
59 * input is guaranteed to be valid, then @validate may be specified as %false.
61 typedef unsigned (*decode_codepoint_fn)(const u8 *in, size_t remaining,
62 bool validate, u32 *c_ret);
64 /* Encode the Unicode codepoint @c and return the number of bytes used. */
65 typedef unsigned (*encode_codepoint_fn)(u32 c, u8 *out);
67 static inline unsigned
68 utf8_decode_codepoint(const u8 *in, size_t remaining, bool validate, u32 *c_ret)
70 if (likely(in[0] < 0x80)) { /* U+0...U+7F */
75 if (in[0] < 0xE0) { /* U+80...U+7FF */
76 VALIDATE(in[0] >= 0xC2 && remaining >= 2 &&
78 *c_ret = ((u32)(in[0] & 0x1F) << 6) |
79 ((u32)(in[1] & 0x3F) << 0);
83 if (in[0] < 0xF0) { /* U+800...U+FFFF, possibly excluding surrogates */
84 VALIDATE(remaining >= 3 &&
85 IS_UTF8_TAIL(in[1]) &&
87 *c_ret = ((u32)(in[0] & 0x0F) << 12) |
88 ((u32)(in[1] & 0x3F) << 6) |
89 ((u32)(in[2] & 0x3F) << 0);
90 VALIDATE(*c_ret >= 0x800);
91 #if !ALLOW_UNPAIRED_SURROGATES
92 VALIDATE(!IS_SURROGATE(*c_ret));
97 /* U+10000...U+10FFFF */
98 VALIDATE(in[0] < 0xF8 && remaining >= 4 &&
99 IS_UTF8_TAIL(in[1]) &&
100 IS_UTF8_TAIL(in[2]) &&
101 IS_UTF8_TAIL(in[3]));
102 *c_ret = ((u32)(in[0] & 0x07) << 18) |
103 ((u32)(in[1] & 0x3F) << 12) |
104 ((u32)(in[2] & 0x3F) << 6) |
105 ((u32)(in[3] & 0x3F) << 0);
106 VALIDATE(*c_ret >= 0x10000 && *c_ret <= 0x10FFFF);
110 *c_ret = INVALID_CODEPOINT;
114 static inline unsigned
115 utf8_encode_codepoint(u32 c, u8 *out)
117 if (likely(c < 0x80)) {
123 out[0] = 0xC0 | (c >> 6);
124 out[1] = 0x80 | (c & 0x3F);
129 out[0] = 0xE0 | (c >> 12);
130 out[1] = 0x80 | ((c >> 6) & 0x3F);
131 out[2] = 0x80 | (c & 0x3F);
135 out[0] = 0xF0 | (c >> 18);
136 out[1] = 0x80 | ((c >> 12) & 0x3F);
137 out[2] = 0x80 | ((c >> 6) & 0x3F);
138 out[3] = 0x80 | (c & 0x3F);
142 static inline unsigned
143 utf16le_decode_codepoint(const u8 *in, size_t remaining, bool validate,
148 VALIDATE(remaining >= 2);
149 h = get_unaligned_le16(in);
150 if (unlikely(IS_SURROGATE(h))) {
151 /* Surrogate pairs are U+10000...U+10FFFF.
152 * Unpaired surrogates are U+D800...U+DFFF. */
153 #if ALLOW_UNPAIRED_SURROGATES
154 if (unlikely(!IS_HIGH_SURROGATE(h) || remaining < 4))
156 l = get_unaligned_le16(in + 2);
157 if (unlikely(!IS_LOW_SURROGATE(l)))
160 VALIDATE(IS_HIGH_SURROGATE(h) && remaining >= 4);
161 l = get_unaligned_le16(in + 2);
162 VALIDATE(IS_LOW_SURROGATE(l));
164 *c_ret = 0x10000 + ((h - 0xD800) << 10) + (l - 0xDC00);
167 #if ALLOW_UNPAIRED_SURROGATES
174 *c_ret = INVALID_CODEPOINT;
175 return min(remaining, 2);
178 static inline unsigned
179 utf16le_encode_codepoint(u32 c, u8 *out)
181 if (likely(c < 0x10000)) {
182 put_unaligned_le16(c, out);
186 put_unaligned_le16(0xD800 + (c >> 10), out);
187 put_unaligned_le16(0xDC00 + (c & 0x3FF), out + 2);
192 * Convert the string @in of size @in_nbytes from the encoding given by the
193 * @decode_codepoint function to the encoding given by the @encode_codepoint
194 * function. @in does not need to be null-terminated, but a null terminator
195 * will be added to the output string.
197 * On success, write the allocated output string to @out_ret (must not be NULL)
198 * and its size excluding the null terminator to @out_nbytes_ret (may be NULL).
200 * If the input string is malformed, return @ilseq_err with errno set to EILSEQ.
201 * If out of memory, return WIMLIB_ERR_NOMEM with errno set to ENOMEM.
204 convert_string(const u8 * const in, const size_t in_nbytes,
205 u8 **out_ret, size_t *out_nbytes_ret,
207 decode_codepoint_fn decode_codepoint,
208 encode_codepoint_fn encode_codepoint)
210 const u8 * const in_end = in + in_nbytes;
213 size_t out_nbytes = 0;
215 u8 tmp[8]; /* assuming no codepoint requires > 8 bytes to encode */
218 /* Validate the input string and compute the output size. */
219 for (p_in = in; p_in != in_end; ) {
220 p_in += (*decode_codepoint)(p_in, in_end - p_in, true, &c);
221 if (unlikely(c == INVALID_CODEPOINT)) {
225 out_nbytes += (*encode_codepoint)(c, tmp);
228 /* Allocate the output string, including space for a null terminator. */
229 out = MALLOC(out_nbytes + (*encode_codepoint)(0, tmp));
231 return WIMLIB_ERR_NOMEM;
233 /* Do the conversion. */
234 for (p_in = in, p_out = out; p_in != in_end; ) {
235 p_in += (*decode_codepoint)(p_in, in_end - p_in, false, &c);
236 p_out += (*encode_codepoint)(c, p_out);
239 /* Add a null terminator. */
240 (*encode_codepoint)(0, p_out);
242 /* Return the output string and its size (by reference). */
245 *out_nbytes_ret = out_nbytes;
250 utf8_to_utf16le(const char *in, size_t in_nbytes,
251 utf16lechar **out_ret, size_t *out_nbytes_ret)
253 return convert_string((const u8 *)in, in_nbytes,
254 (u8 **)out_ret, out_nbytes_ret,
255 WIMLIB_ERR_INVALID_UTF8_STRING,
256 utf8_decode_codepoint, utf16le_encode_codepoint);
260 utf16le_to_utf8(const utf16lechar *in, size_t in_nbytes,
261 char **out_ret, size_t *out_nbytes_ret)
263 return convert_string((const u8 *)in, in_nbytes,
264 (u8 **)out_ret, out_nbytes_ret,
265 WIMLIB_ERR_INVALID_UTF16_STRING,
266 utf16le_decode_codepoint, utf8_encode_codepoint);
270 * A table that maps from UCS-2 characters to their upper case equivalents.
271 * Index and array values are both CPU endian.
272 * Note: this is only an *approximation* of real UTF-16 case folding.
279 /* This is the table used in NTFS volumes formatted by Windows 10.
280 * It was compressed by tools/compress_upcase_table.c. */
281 static const u16 upcase_compressed[] = {
282 0x0000, 0x0000, 0x0060, 0x0000, 0x0000, 0xffe0, 0x0019, 0x0061,
283 0x0061, 0x0000, 0x001b, 0x005d, 0x0008, 0x0060, 0x0000, 0x0079,
284 0x0000, 0x0000, 0x0000, 0xffff, 0x002f, 0x0100, 0x0002, 0x0000,
285 0x0007, 0x012b, 0x0011, 0x0121, 0x002f, 0x0103, 0x0006, 0x0101,
286 0x0000, 0x00c3, 0x0006, 0x0131, 0x0007, 0x012e, 0x0004, 0x0000,
287 0x0003, 0x012f, 0x0000, 0x0061, 0x0004, 0x0130, 0x0000, 0x00a3,
288 0x0003, 0x0000, 0x0000, 0x0082, 0x000b, 0x0131, 0x0006, 0x0189,
289 0x0008, 0x012f, 0x0007, 0x012e, 0x0000, 0x0038, 0x0006, 0x0000,
290 0x0000, 0xfffe, 0x0007, 0x01c4, 0x000f, 0x0101, 0x0000, 0xffb1,
291 0x0015, 0x011e, 0x0004, 0x01cc, 0x002a, 0x0149, 0x0014, 0x0149,
292 0x0007, 0x0000, 0x0009, 0x018c, 0x000b, 0x0138, 0x0000, 0x2a1f,
293 0x0000, 0x2a1c, 0x0000, 0x0000, 0x0000, 0xff2e, 0x0000, 0xff32,
294 0x0000, 0x0000, 0x0000, 0xff33, 0x0000, 0xff33, 0x0000, 0x0000,
295 0x0000, 0xff36, 0x0000, 0x0000, 0x0000, 0xff35, 0x0004, 0x0000,
296 0x0002, 0x0257, 0x0000, 0x0000, 0x0000, 0xff31, 0x0004, 0x0000,
297 0x0000, 0xff2f, 0x0000, 0xff2d, 0x0000, 0x0000, 0x0000, 0x29f7,
298 0x0003, 0x0000, 0x0002, 0x0269, 0x0000, 0x29fd, 0x0000, 0xff2b,
299 0x0002, 0x0000, 0x0000, 0xff2a, 0x0007, 0x0000, 0x0000, 0x29e7,
300 0x0002, 0x0000, 0x0000, 0xff26, 0x0005, 0x027e, 0x0003, 0x027e,
301 0x0000, 0xffbb, 0x0000, 0xff27, 0x0000, 0xff27, 0x0000, 0xffb9,
302 0x0005, 0x0000, 0x0000, 0xff25, 0x0065, 0x007b, 0x0079, 0x0293,
303 0x0008, 0x012d, 0x0003, 0x019c, 0x0002, 0x037b, 0x002e, 0x0000,
304 0x0000, 0xffda, 0x0000, 0xffdb, 0x0002, 0x03ad, 0x0012, 0x0060,
305 0x000a, 0x0060, 0x0000, 0xffc0, 0x0000, 0xffc1, 0x0000, 0xffc1,
306 0x0008, 0x0000, 0x0000, 0xfff8, 0x001a, 0x0118, 0x0000, 0x0007,
307 0x0008, 0x018d, 0x0009, 0x0233, 0x0046, 0x0035, 0x0006, 0x0061,
308 0x0000, 0xffb0, 0x000f, 0x0450, 0x0025, 0x010e, 0x000a, 0x036b,
309 0x0032, 0x048b, 0x000e, 0x0100, 0x0000, 0xfff1, 0x0037, 0x048a,
310 0x0026, 0x0465, 0x0034, 0x0000, 0x0000, 0xffd0, 0x0025, 0x0561,
311 0x00de, 0x0293, 0x1714, 0x0587, 0x0000, 0x8a04, 0x0003, 0x0000,
312 0x0000, 0x0ee6, 0x0087, 0x02ee, 0x0092, 0x1e01, 0x0069, 0x1df7,
313 0x0000, 0x0008, 0x0007, 0x1f00, 0x0008, 0x0000, 0x000e, 0x1f02,
314 0x0008, 0x1f0e, 0x0010, 0x1f06, 0x001a, 0x1f06, 0x0002, 0x1f0f,
315 0x0007, 0x1f50, 0x0017, 0x1f19, 0x0000, 0x004a, 0x0000, 0x004a,
316 0x0000, 0x0056, 0x0003, 0x1f72, 0x0000, 0x0064, 0x0000, 0x0064,
317 0x0000, 0x0080, 0x0000, 0x0080, 0x0000, 0x0070, 0x0000, 0x0070,
318 0x0000, 0x007e, 0x0000, 0x007e, 0x0028, 0x1f1e, 0x000c, 0x1f06,
319 0x0000, 0x0000, 0x0000, 0x0009, 0x000f, 0x0000, 0x000d, 0x1fb3,
320 0x000d, 0x1f44, 0x0008, 0x1fcd, 0x0006, 0x03f2, 0x0015, 0x1fbb,
321 0x014e, 0x0587, 0x0000, 0xffe4, 0x0021, 0x0000, 0x0000, 0xfff0,
322 0x000f, 0x2170, 0x000a, 0x0238, 0x0346, 0x0587, 0x0000, 0xffe6,
323 0x0019, 0x24d0, 0x0746, 0x0587, 0x0026, 0x0561, 0x000b, 0x057e,
324 0x0004, 0x012f, 0x0000, 0xd5d5, 0x0000, 0xd5d8, 0x000c, 0x022e,
325 0x000e, 0x03f8, 0x006e, 0x1e33, 0x0011, 0x0000, 0x0000, 0xe3a0,
326 0x0025, 0x2d00, 0x17f2, 0x0587, 0x6129, 0x2d26, 0x002e, 0x0201,
327 0x002a, 0x1def, 0x0098, 0xa5b7, 0x0040, 0x1dff, 0x000e, 0x0368,
328 0x000d, 0x022b, 0x034c, 0x2184, 0x5469, 0x2d26, 0x007f, 0x0061,
332 /* Simple LZ decoder */
333 const u16 *in_next = upcase_compressed;
334 for (u32 i = 0; i < ARRAY_LEN(upcase); ) {
335 u16 length = *in_next++;
336 u16 src_pos = *in_next++;
339 upcase[i++] = src_pos;
343 upcase[i++] = upcase[src_pos++];
349 for (u32 i = 0; i < ARRAY_LEN(upcase); i++)
354 * Compare UTF-16LE strings case-sensitively (%ignore_case == false) or
355 * case-insensitively (%ignore_case == true).
357 * This is implemented using the default upper-case table used by NTFS. It does
358 * not handle all possible cases allowed by UTF-16LE. For example, different
359 * normalizations of the same sequence of "characters" are not considered equal.
360 * It hopefully does the right thing most of the time though.
363 cmp_utf16le_strings(const utf16lechar *s1, size_t n1,
364 const utf16lechar *s2, size_t n2,
367 size_t n = min(n1, n2);
370 for (size_t i = 0; i < n; i++) {
371 u16 c1 = upcase[le16_to_cpu(s1[i])];
372 u16 c2 = upcase[le16_to_cpu(s2[i])];
374 return (c1 < c2) ? -1 : 1;
377 for (size_t i = 0; i < n; i++) {
378 u16 c1 = le16_to_cpu(s1[i]);
379 u16 c2 = le16_to_cpu(s2[i]);
381 return (c1 < c2) ? -1 : 1;
386 return (n1 < n2) ? -1 : 1;
389 /* Like cmp_utf16le_strings(), but assumes the strings are null terminated. */
391 cmp_utf16le_strings_z(const utf16lechar *s1, const utf16lechar *s2,
396 u16 c1 = upcase[le16_to_cpu(*s1)];
397 u16 c2 = upcase[le16_to_cpu(*s2)];
399 return (c1 < c2) ? -1 : 1;
405 while (*s1 && *s1 == *s2)
409 return (le16_to_cpu(*s1) < le16_to_cpu(*s2)) ? -1 : 1;
413 /* Duplicate a UTF-16 string. The input string might not be null terminated and
414 * might be misaligned, but the returned string is guaranteed to be null
415 * terminated and properly aligned. */
417 utf16le_dupz(const void *s, size_t size)
419 utf16lechar *dup = MALLOC(size + sizeof(utf16lechar));
421 memcpy(dup, s, size);
422 dup[size / sizeof(utf16lechar)] = 0;
427 /* Duplicate a null-terminated UTF-16 string. */
429 utf16le_dup(const utf16lechar *s)
431 return memdup(s, utf16le_len_bytes(s) + sizeof(utf16lechar));
434 /* Return the length, in bytes, of a null terminated UTF-16 string, excluding
435 * the null terminator. */
437 utf16le_len_bytes(const utf16lechar *s)
439 const utf16lechar *p = s;
442 return (p - s) * sizeof(utf16lechar);
445 /* Return the length, in UTF-16 coding units, of a null terminated UTF-16
446 * string, excluding the null terminator. */
448 utf16le_len_chars(const utf16lechar *s)
450 return utf16le_len_bytes(s) / sizeof(utf16lechar);