+/*
+ * encoding.c: Convert UTF-8 to UTF-16LE strings and vice versa
+ */
+
+/*
+ * Copyright (C) 2012 Eric Biggers
+ *
+ * This file is part of wimlib, a library for working with WIM files.
+ *
+ * wimlib is free software; you can redistribute it and/or modify it under the
+ * terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * wimlib is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+ * A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with wimlib; if not, see http://www.gnu.org/licenses/.
+ */
+
+#include "wimlib.h"
+#include "util.h"
+#include "endianness.h"
+
+#include <errno.h>
+
+#ifdef WITH_NTFS_3G
+#include <ntfs-3g/volume.h>
+#include <ntfs-3g/unistr.h>
+#else
+#include <iconv.h>
+#endif
+
+/*
+ * NOTE:
+ *
+ * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
+ * libntfs-3g/unistr.c in the NTFS-3g sources. (Modified slightly to remove
+ * unneeded functionality.)
+ */
+#ifndef WITH_NTFS_3G
+/*
+ * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
+ * null) to store a given UTF-16LE string.
+ *
+ * Return -1 with errno set if string has invalid byte sequence or too long.
+ */
+static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
+{
+ int i, ret = -1;
+ int count = 0;
+ bool surrog;
+
+ surrog = false;
+ for (i = 0; i < ins_len && ins[i]; i++) {
+ unsigned short c = le16_to_cpu(ins[i]);
+ if (surrog) {
+ if ((c >= 0xdc00) && (c < 0xe000)) {
+ surrog = false;
+ count += 4;
+ } else
+ goto fail;
+ } else
+ if (c < 0x80)
+ count++;
+ else if (c < 0x800)
+ count += 2;
+ else if (c < 0xd800)
+ count += 3;
+ else if (c < 0xdc00)
+ surrog = true;
+#if NOREVBOM
+ else if ((c >= 0xe000) && (c < 0xfffe))
+#else
+ else if (c >= 0xe000)
+#endif
+ count += 3;
+ else
+ goto fail;
+ }
+ if (surrog)
+ goto fail;
+
+ ret = count;
+out:
+ return ret;
+fail:
+ errno = EILSEQ;
+ goto out;
+}
+
+/*
+ * Return the amount of 16-bit elements in UTF-16LE needed
+ * (without the terminating null) to store given UTF-8 string.
+ *
+ * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
+ *
+ * Note: This does not check whether the input sequence is a valid utf8 string,
+ * and should be used only in context where such check is made!
+ */
+static int utf8_to_utf16_size(const char *s)
+{
+ unsigned int byte;
+ size_t count = 0;
+ while ((byte = *((const unsigned char *)s++))) {
+ count++;
+ if (byte >= 0xc0) {
+ if (byte >= 0xF5) {
+ errno = EILSEQ;
+ return -1;
+ }
+ if (!*s)
+ break;
+ if (byte >= 0xC0)
+ s++;
+ if (!*s)
+ break;
+ if (byte >= 0xE0)
+ s++;
+ if (!*s)
+ break;
+ if (byte >= 0xF0) {
+ s++;
+ count++;
+ }
+ }
+ }
+ return count;
+}
+#endif /* !WITH_NTFS_3G */
+
+/* Converts a string in the UTF-16LE encoding to a newly allocated string in the
+ * UTF-8 encoding.
+ *
+ * If available, do so by calling a similar function from libntfs-3g.
+ * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
+ */
+int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
+ char **utf8_str_ret, size_t *utf8_nbytes_ret)
+{
+ int ret;
+
+ if (utf16_nbytes == 0) {
+ *utf8_str_ret = NULL;
+ *utf8_nbytes_ret = 0;
+ return 0;
+ }
+
+ if (utf16_nbytes & 1) {
+ ERROR("UTF-16LE string is invalid (odd number of bytes)!");
+ return WIMLIB_ERR_INVALID_UTF16_STRING;
+ }
+#ifdef WITH_NTFS_3G
+ char *outs = NULL;
+ int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
+ utf16_nbytes / 2, &outs, 0);
+ if (outs_len >= 0) {
+ *utf8_str_ret = outs;
+ *utf8_nbytes_ret = outs_len;
+ ret = 0;
+ } else {
+ if (errno == ENOMEM)
+ ret = WIMLIB_ERR_NOMEM;
+ else
+ ret = WIMLIB_ERR_INVALID_UTF16_STRING;
+ }
+#else /* WITH_NTFS_3G */
+ static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
+ if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
+ cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
+ if (cd_utf16_to_utf8 == (iconv_t)-1) {
+ ERROR_WITH_ERRNO("Failed to get conversion descriptor "
+ "for converting UTF-16LE to UTF-8");
+ if (errno == ENOMEM)
+ return WIMLIB_ERR_NOMEM;
+ else
+ return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
+ }
+ }
+ ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
+ if (ret >= 0) {
+ size_t utf8_expected_nbytes;
+ char *utf8_str;
+ size_t utf8_bytes_left;
+ size_t utf16_bytes_left;
+ size_t num_chars_converted;
+ char *utf8_str_save;
+ const char *utf16_str_save;
+
+ utf8_expected_nbytes = ret;
+ utf8_str = MALLOC(utf8_expected_nbytes + 1);
+ if (utf8_str) {
+ utf8_bytes_left = utf8_expected_nbytes;
+ utf16_bytes_left = utf16_nbytes;
+ utf8_str_save = utf8_str;
+ utf16_str_save = utf16_str;
+ num_chars_converted = iconv(cd_utf16_to_utf8,
+ (char**)&utf16_str,
+ &utf16_bytes_left,
+ &utf8_str,
+ &utf8_bytes_left);
+ utf8_str = utf8_str_save;
+ utf16_str = utf16_str_save;
+ if (utf16_bytes_left == 0 &&
+ utf8_bytes_left == 0 &&
+ num_chars_converted != (size_t)(-1))
+ {
+ utf8_str[utf8_expected_nbytes] = '\0';
+ *utf8_str_ret = utf8_str;
+ *utf8_nbytes_ret = utf8_expected_nbytes;
+ ret = 0;
+ } else {
+ FREE(utf8_str);
+ ret = WIMLIB_ERR_INVALID_UTF16_STRING;
+ }
+ } else
+ ret = WIMLIB_ERR_NOMEM;
+ } else
+ ret = WIMLIB_ERR_INVALID_UTF16_STRING;
+#endif /* WITH_NTFS_3G */
+
+#ifdef ENABLE_ERROR_MESSAGES
+ if (ret != 0) {
+ ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
+ ERROR("The failing string was:");
+ print_string(utf16_str, utf16_nbytes);
+ putchar('\n');
+ }
+#endif /* ENABLE_ERROR_MESSAGES */
+ return ret;
+}
+
+
+/* Converts a string in the UTF-8 encoding to a newly allocated string in the
+ * UTF-16 encoding.
+ *
+ * If available, do so by calling a similar function from libntfs-3g.
+ * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
+ */
+int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
+ char **utf16_str_ret, size_t *utf16_nbytes_ret)
+{
+ int ret;
+ if (utf8_nbytes == 0) {
+ *utf16_str_ret = NULL;
+ *utf16_nbytes_ret = 0;
+ return 0;
+ }
+#ifdef WITH_NTFS_3G
+ char *outs = NULL;
+ int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
+ if (outs_nchars >= 0) {
+ *utf16_str_ret = outs;
+ *utf16_nbytes_ret = (size_t)outs_nchars * 2;
+ ret = 0;
+ } else {
+ if (errno == ENOMEM)
+ ret = WIMLIB_ERR_NOMEM;
+ else
+ ret = WIMLIB_ERR_INVALID_UTF8_STRING;
+ }
+#else /* WITH_NTFS_3G */
+ static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
+ if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
+ cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
+ if (cd_utf8_to_utf16 == (iconv_t)-1) {
+ ERROR_WITH_ERRNO("Failed to get conversion descriptor "
+ "for converting UTF-8 to UTF-16LE");
+ if (errno == ENOMEM)
+ return WIMLIB_ERR_NOMEM;
+ else
+ return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
+ }
+ }
+
+ ret = utf8_to_utf16_size(utf8_str);
+ if (ret >= 0) {
+ size_t utf16_expected_nbytes;
+ char *utf16_str;
+ size_t utf16_bytes_left;
+ size_t utf8_bytes_left;
+ size_t num_chars_converted;
+ const char *utf8_str_save;
+ char *utf16_str_save;
+
+ utf16_expected_nbytes = (size_t)ret * 2;
+ utf16_str = MALLOC(utf16_expected_nbytes + 2);
+ if (utf16_str) {
+ utf16_bytes_left = utf16_expected_nbytes;
+ utf8_bytes_left = utf8_nbytes;
+ utf8_str_save = utf8_str;
+ utf16_str_save = utf16_str;
+ num_chars_converted = iconv(cd_utf8_to_utf16,
+ (char**)&utf8_str,
+ &utf8_bytes_left,
+ &utf16_str,
+ &utf16_bytes_left);
+ utf8_str = utf8_str_save;
+ utf16_str = utf16_str_save;
+ if (utf16_bytes_left == 0 &&
+ utf8_bytes_left == 0 &&
+ num_chars_converted != (size_t)(-1))
+ {
+ utf16_str[utf16_expected_nbytes] = '\0';
+ utf16_str[utf16_expected_nbytes + 1] = '\0';
+ *utf16_str_ret = utf16_str;
+ *utf16_nbytes_ret = utf16_expected_nbytes;
+ ret = 0;
+ } else {
+ FREE(utf16_str);
+ ret = WIMLIB_ERR_INVALID_UTF8_STRING;
+ }
+ } else
+ ret = WIMLIB_ERR_NOMEM;
+ } else
+ ret = WIMLIB_ERR_INVALID_UTF8_STRING;
+#endif /* WITH_NTFS_3G */
+
+#ifdef ENABLE_ERROR_MESSAGES
+ if (ret != 0) {
+ ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
+ ERROR("The failing string was:");
+ print_string(utf8_str, utf8_nbytes);
+ putchar('\n');
+ ERROR("Length: %zu bytes", utf8_nbytes);
+ }
+#endif /* ENABLE_ERROR_MESSAGES */
+ return ret;
+}