X-Git-Url: https://wimlib.net/git/?p=wimlib;a=blobdiff_plain;f=src%2Fencoding.c;h=48bad187ff8e21c7df4333e35854aabf75ef6b5d;hp=30a8889edd63189ccd0d45f5cfbca984cfbc0077;hb=de12c346dc64404821d52d545e2e1b3d44230f2a;hpb=7bde3fc590afbdef8f71cd7f8ccbd24172bffc63

diff --git a/src/encoding.c b/src/encoding.c
index 30a8889e..48bad187 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -1,9 +1,11 @@
 /*
- * encoding.c:  Convert UTF-8 to UTF-16LE strings and vice versa
+ * encoding.c:  Convert "multibyte" strings (the locale-default encoding---
+ * generally, UTF-8 or something like ISO-8859-1) to UTF-16LE strings, and vice
+ * versa.  Also, convert UTF-8 strings to multibyte strings.
  */
 
 /*
- * Copyright (C) 2012 Eric Biggers
+ * Copyright (C) 2012, 2013 Eric Biggers
  *
  * This file is part of wimlib, a library for working with WIM files.
  *
@@ -21,337 +23,252 @@
  * along with wimlib; if not, see http://www.gnu.org/licenses/.
  */
 
-#include "wimlib.h"
-#include "util.h"
-#include "endianness.h"
+#include "wimlib_internal.h"
 
 #include <errno.h>
-
-#ifdef WITH_NTFS_3G
-#include <ntfs-3g/volume.h>
-#include <ntfs-3g/unistr.h>
-#else
 #include <iconv.h>
-#endif
+#include <pthread.h>
+#include <stdlib.h>
 
-/*
- * NOTE:
- *
- * utf16_to_utf8_size() and utf8_to_utf16_size() were taken from
- * libntfs-3g/unistr.c in the NTFS-3g sources.  (Modified slightly to remove
- * unneeded functionality.)
- */
-#ifndef WITH_NTFS_3G
-/*
- * Return the amount of 8-bit elements in UTF-8 needed (without the terminating
- * null) to store a given UTF-16LE string.
- *
- * Return -1 with errno set if string has invalid byte sequence or too long.
- */
-static int utf16_to_utf8_size(const u16 *ins, const int ins_len)
-{
-	int i, ret = -1;
-	int count = 0;
-	bool surrog;
+bool wimlib_mbs_is_utf8 = true;
 
-	surrog = false;
-	for (i = 0; i < ins_len && ins[i]; i++) {
-		unsigned short c = le16_to_cpu(ins[i]);
-		if (surrog) {
-			if ((c >= 0xdc00) && (c < 0xe000)) {
-				surrog = false;
-				count += 4;
-			} else
-				goto fail;
-		} else
-			if (c < 0x80)
-				count++;
-			else if (c < 0x800)
-				count += 2;
-			else if (c < 0xd800)
-				count += 3;
-			else if (c < 0xdc00)
-				surrog = true;
-#if NOREVBOM
-			else if ((c >= 0xe000) && (c < 0xfffe))
-#else
-			else if (c >= 0xe000)
-#endif
-				count += 3;
-			else
-				goto fail;
-	}
-	if (surrog)
-		goto fail;
+/* Pool of idle iconv_t conversion descriptors for one specific character
+ * conversion.  Sharing a single descriptor between threads is not thread-safe,
+ * while opening a new descriptor for every string is inefficient.  Keeping a
+ * list of idle descriptors solves both problems: a thread takes an existing
+ * descriptor off the list if one is available, and a new one is opened only
+ * when the list is empty. */
+struct iconv_list_head {
+	const char *from_encoding;	/* source encoding name for iconv_open() */
+	const char *to_encoding;	/* target encoding name for iconv_open() */
+	struct list_head list;		/* idle 'struct iconv_node's */
+	pthread_mutex_t mutex;		/* serializes get_iconv()/put_iconv() on 'list' */
+};
 
-	ret = count;
-out:
-	return ret;
-fail:
-	errno = EILSEQ;
-	goto out;
-}
+struct iconv_node {			/* one pooled conversion descriptor */
+	iconv_t cd;			/* the descriptor; users hold a pointer to this field */
+	struct list_head list;		/* link in head->list while the node is idle */
+	struct iconv_list_head *head;	/* pool that put_iconv() returns this node to */
+};
 
-/*
- * Return the amount of 16-bit elements in UTF-16LE needed
- * (without the terminating null) to store given UTF-8 string.
- *
- * Return -1 with errno set if it's longer than PATH_MAX or string is invalid.
- *
- * Note: This does not check whether the input sequence is a valid utf8 string,
- *	 and should be used only in context where such check is made!
- */
-static int utf8_to_utf16_size(const char *s)
-{
-	unsigned int byte;
-	size_t count = 0;
-	while ((byte = *((const unsigned char *)s++))) {
-		count++;
-		if (byte >= 0xc0) {
-			if (byte >= 0xF5) {
-				errno = EILSEQ;
-				return -1;
-			}
-			if (!*s)
-				break;
-			if (byte >= 0xC0)
-				s++;
-			if (!*s)
-				break;
-			if (byte >= 0xE0)
-				s++;
-			if (!*s)
-				break;
-			if (byte >= 0xF0) {
-				s++;
-				count++;
-			}
-		}
-	}
-	return count;
+#define ICONV_LIST(name, from, to)	/* statically initialized pool 'name' for from -> to */	\
+struct iconv_list_head name = {				\
+	.from_encoding = from,				\
+	.to_encoding = to,				\
+	.list = LIST_HEAD_INIT(name.list),		\
+	.mutex = PTHREAD_MUTEX_INITIALIZER,		\
 }
-#endif /* !WITH_NTFS_3G */
 
-#ifndef WITH_NTFS_3G
-static iconv_t cd_utf8_to_utf16 = (iconv_t)(-1);
-static iconv_t cd_utf16_to_utf8 = (iconv_t)(-1);
-
-int iconv_global_init()
+static iconv_t *	/* descriptor for the caller's exclusive use until put_iconv(), or NULL */
+get_iconv(struct iconv_list_head *head)
 {
-	if (cd_utf16_to_utf8 == (iconv_t)(-1)) {
-		cd_utf16_to_utf8 = iconv_open("UTF-8", "UTF-16LE");
-		if (cd_utf16_to_utf8 == (iconv_t)-1) {
-			ERROR_WITH_ERRNO("Failed to get conversion descriptor "
-					 "for converting UTF-16LE to UTF-8");
-			if (errno == ENOMEM)
-				return WIMLIB_ERR_NOMEM;
-			else
-				return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
-		}
-	}
+	iconv_t cd;
+	iconv_t *cd_p;
+	struct iconv_node *i;
 
-	if (cd_utf8_to_utf16 == (iconv_t)(-1)) {
-		cd_utf8_to_utf16 = iconv_open("UTF-16LE", "UTF-8");
-		if (cd_utf8_to_utf16 == (iconv_t)-1) {
-			ERROR_WITH_ERRNO("Failed to get conversion descriptor "
-					 "for converting UTF-8 to UTF-16LE");
-			if (errno == ENOMEM)
-				return WIMLIB_ERR_NOMEM;
-			else
-				return WIMLIB_ERR_ICONV_NOT_AVAILABLE;
+	pthread_mutex_lock(&head->mutex);
+	if (list_empty(&head->list)) {	/* nothing idle: open a new descriptor */
+		cd = iconv_open(head->to_encoding, head->from_encoding);
+		if (cd == (iconv_t)-1) {
+			ERROR_WITH_ERRNO("Failed to open iconv from %s to %s",
+					 head->from_encoding, head->to_encoding);
+			cd_p = NULL;
+		} else {
+			i = MALLOC(sizeof(struct iconv_node));
+			if (i) {
+				i->head = head;	/* lets put_iconv() find the pool again */
+				i->cd = cd;
+				cd_p = &i->cd;
+			} else {
+				iconv_close(cd);	/* no node to hold it: do not leak it */
+				cd_p = NULL;
+			}
 		}
+	} else {	/* reuse the first idle descriptor */
+		i = container_of(head->list.next, struct iconv_node, list);
+		list_del(head->list.next);	/* off the list until put_iconv() */
+		cd_p = &i->cd;
 	}
-	return 0;
+	pthread_mutex_unlock(&head->mutex);
+	return cd_p;	/* NULL if iconv_open() or MALLOC() failed */
 }
 
-void iconv_global_cleanup()
+static void
+put_iconv(iconv_t *cd)	/* @cd must be a pointer previously returned by get_iconv() */
 {
-	if (cd_utf8_to_utf16 != (iconv_t)(-1))
-		iconv_close(cd_utf8_to_utf16);
-	if (cd_utf16_to_utf8 != (iconv_t)(-1))
-		iconv_close(cd_utf16_to_utf8);
+	int errno_save = errno;	/* callers may still inspect errno from a failed iconv() */
+	struct iconv_node *i = container_of(cd, struct iconv_node, cd);
+	struct iconv_list_head *head = i->head;
+	/* Hand the node back to its pool so that another conversion can reuse it. */
+	pthread_mutex_lock(&head->mutex);
+	list_add(&i->list, &head->list);
+	pthread_mutex_unlock(&head->mutex);
+	errno = errno_save;
 }
-#endif
 
-/* Converts a string in the UTF-16LE encoding to a newly allocated string in the
- * UTF-8 encoding.
- *
- * If available, do so by calling a similar function from libntfs-3g.
- * Otherwise, use iconv() along with the helper function utf16_to_utf8_size().
- */
-int utf16_to_utf8(const char *utf16_str, size_t utf16_nbytes,
-		  char **utf8_str_ret, size_t *utf8_nbytes_ret)
-{
-	int ret;
+/* Set while a conversion-failure message is being printed, so that a second
+ * conversion failure that occurs while printing that message is not itself
+ * reported.  (This variable is shared by all threads rather than per-thread,
+ * but it doesn't matter too much since it only affects the error messages.) */
+static bool error_message_being_printed = false;
 
-	if (utf16_nbytes == 0) {
-		*utf8_str_ret = NULL;
-		*utf8_nbytes_ret = 0;
-		return 0;
-	}
-
-	if (utf16_nbytes & 1) {
-		ERROR("UTF-16LE string is invalid (odd number of bytes)!");
-		return WIMLIB_ERR_INVALID_UTF16_STRING;
-	}
-#ifdef WITH_NTFS_3G
-	char *outs = NULL;
-	int outs_len = ntfs_ucstombs((const ntfschar*)utf16_str,
-				     utf16_nbytes / 2, &outs, 0);
-	if (outs_len >= 0) {
-		*utf8_str_ret = outs;
-		*utf8_nbytes_ret = outs_len;
-		ret = 0;
-	} else {
-		if (errno == ENOMEM)
-			ret = WIMLIB_ERR_NOMEM;
-		else
-			ret = WIMLIB_ERR_INVALID_UTF16_STRING;
-	}
-#else /* !WITH_NTFS_3G */
+#define DEFINE_CHAR_CONVERSION_FUNCTIONS(varname1, longname1, chartype1,\
+					 varname2, longname2, chartype2,\
+					 worst_case_len_expr, /* max output elements */	\
+					 err_return,	/* code returned on failure */	\
+					 err_msg)	/* statement(s) run on failure */	\
+static ICONV_LIST(iconv_##varname1##_to_##varname2,			\
+		  longname1, longname2);				\
+/* <1>_to_<2>_nbytes(): trial conversion that only measures the output. */	\
+int									\
+varname1##_to_##varname2##_nbytes(const chartype1 *in, size_t in_nbytes,\
+				  size_t *out_nbytes_ret)		\
+{									\
+	iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);	\
+	if (cd == NULL)							\
+		return WIMLIB_ERR_ICONV_NOT_AVAILABLE;			\
+	/* Worst case length, plus one element so that the variable-	\
+	 * length array never has zero size (e.g. for empty input). */	\
+	chartype2 buf[(worst_case_len_expr) + 1];			\
+	char *inbuf = (char*)in;					\
+	size_t inbytesleft = in_nbytes;					\
+	char *outbuf = (char*)buf;					\
+	size_t outbytesleft = sizeof(buf);				\
+	size_t len;							\
+	int ret;							\
+									\
+	len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);	\
+	if (len == (size_t)-1) {					\
+		if (!error_message_being_printed) {			\
+			error_message_being_printed = true;		\
+			err_msg;					\
+			error_message_being_printed = false;		\
+		}							\
+		ret = err_return;					\
+	} else {							\
+		*out_nbytes_ret = sizeof(buf) - outbytesleft;		\
+		ret = 0;						\
+	}								\
+	put_iconv(cd);							\
+	return ret;							\
+}									\
+/* <1>_to_<2>_buf(): convert into the caller's buffer and terminate it. */	\
+int									\
+varname1##_to_##varname2##_buf(const chartype1 *in, size_t in_nbytes,	\
+			       chartype2 *out)				\
+{									\
+	iconv_t *cd = get_iconv(&iconv_##varname1##_to_##varname2);	\
+	if (cd == NULL)							\
+		return WIMLIB_ERR_ICONV_NOT_AVAILABLE;			\
+	/* 'out' must already be large enough: no real bound is passed. */	\
+	char *inbuf = (char*)in;					\
+	size_t inbytesleft = in_nbytes;					\
+	char *outbuf = (char*)out;					\
+	const size_t LARGE_NUMBER = 1000000000;				\
+	size_t outbytesleft = LARGE_NUMBER;				\
+	size_t len;							\
+	int ret;							\
+									\
+	len = iconv(*cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);	\
+	if (len == (size_t)-1) {					\
+		if (!error_message_being_printed) {			\
+			error_message_being_printed = true;		\
+			err_msg;					\
+			error_message_being_printed = false;		\
+		}							\
+		ret = err_return;					\
+	} else {							\
+		out[(LARGE_NUMBER-outbytesleft)/sizeof(chartype2)] = 0;	\
+		ret = 0;						\
+	}								\
+	put_iconv(cd);							\
+	return ret;							\
+}									\
+/* <1>_to_<2>(): measure, allocate, convert; *out_ret is set on success. */	\
+int									\
+varname1##_to_##varname2(const chartype1 *in, size_t in_nbytes,		\
+			 chartype2 **out_ret,				\
+			 size_t *out_nbytes_ret)			\
+{									\
+	int ret;							\
+	chartype2 *out;							\
+	size_t out_nbytes;						\
+									\
+	ret = varname1##_to_##varname2##_nbytes(in, in_nbytes,		\
+						&out_nbytes);		\
+	if (ret)							\
+		return ret;						\
+									\
+	out = MALLOC(out_nbytes + sizeof(chartype2));			\
+	if (!out)							\
+		return WIMLIB_ERR_NOMEM;				\
+									\
+	ret = varname1##_to_##varname2##_buf(in, in_nbytes, out);	\
+	if (ret) {							\
+		int errno_save = errno;					\
+		FREE(out);						\
+		errno = errno_save;					\
+	} else {							\
+		*out_ret = out;						\
+		*out_nbytes_ret = out_nbytes;				\
+	}								\
+	return ret;							\
+}
 
-	ret = iconv_global_init();
-	if (ret != 0)
-		return ret;
+DEFINE_CHAR_CONVERSION_FUNCTIONS(utf16le, "UTF-16LE", utf16lechar,	/* defines utf16le_to_mbs{,_nbytes,_buf}() */
+				 mbs, "", mbchar,	/* "" is the encoding name handed to iconv_open() for multibyte strings */
+				 in_nbytes / 2 * MB_CUR_MAX,	/* MB_CUR_MAX bytes allowed per 2-byte input unit */
+				 WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
+				 ERROR("Failed to convert UTF-16LE string "
+				       "to multibyte string!");
+				 ERROR("This may be because the UTF-16LE data "
+				       "could not be represented in your "
+				       "locale's character encoding."))
 
-	ret = utf16_to_utf8_size((const u16*)utf16_str, utf16_nbytes / 2);
-	if (ret >= 0) {
-		size_t utf8_expected_nbytes;
-		char  *utf8_str;
-		size_t utf8_bytes_left;
-		size_t utf16_bytes_left;
-		size_t num_chars_converted;
-		char  *utf8_str_save;
-		const char *utf16_str_save;
+DEFINE_CHAR_CONVERSION_FUNCTIONS(mbs, "", mbchar,	/* defines mbs_to_utf16le{,_nbytes,_buf}() */
+				 utf16le, "UTF-16LE", utf16lechar,
+				 in_nbytes * 2,	/* scratch elements allowed per input byte */
+				 WIMLIB_ERR_INVALID_MULTIBYTE_STRING,
+				 ERROR_WITH_ERRNO("Failed to convert multibyte "	/* %.*s: 'in' need not be NUL-terminated */
+						  "string \"%.*s\" to UTF-16LE string!", (int)in_nbytes, in);
+				 ERROR("If the data you provided was UTF-8, please make sure "
+				       "the character encoding of your current locale is UTF-8."))
 
-		utf8_expected_nbytes = ret;
- 		utf8_str = MALLOC(utf8_expected_nbytes + 1);
-		if (utf8_str) {
-			utf8_bytes_left = utf8_expected_nbytes;
-			utf16_bytes_left = utf16_nbytes;
-			utf8_str_save = utf8_str;
-			utf16_str_save = utf16_str;
-			num_chars_converted = iconv(cd_utf16_to_utf8,
-						    (char**)&utf16_str,
-						    &utf16_bytes_left,
-						    &utf8_str,
-						    &utf8_bytes_left);
-			utf8_str = utf8_str_save;
-			utf16_str = utf16_str_save;
-			if (utf16_bytes_left == 0 &&
-			    utf8_bytes_left == 0 &&
-			    num_chars_converted != (size_t)(-1))
-			{
-				utf8_str[utf8_expected_nbytes] = '\0';
-				*utf8_str_ret = utf8_str;
-				*utf8_nbytes_ret = utf8_expected_nbytes;
-				ret = 0;
-			} else {
-				FREE(utf8_str);
-				ret = WIMLIB_ERR_INVALID_UTF16_STRING;
-			}
-		} else
-			ret = WIMLIB_ERR_NOMEM;
-	} else
-		ret = WIMLIB_ERR_INVALID_UTF16_STRING;
-#endif /* WITH_NTFS_3G */
+DEFINE_CHAR_CONVERSION_FUNCTIONS(utf8, "UTF-8", utf8char,	/* defines utf8_to_mbs{,_nbytes,_buf}() */
+				 mbs, "", mbchar,
+				 in_nbytes,	/* NOTE(review): assumes the multibyte form is never longer than the UTF-8 form -- confirm */
+				 WIMLIB_ERR_UNICODE_STRING_NOT_REPRESENTABLE,
+				 ERROR("Failed to convert UTF-8 string to multibyte string!");
+				 ERROR("This may be because the UTF-8 data could not be represented "
+				       "in your locale's character encoding."))
 
-#ifdef ENABLE_ERROR_MESSAGES
-	if (ret != 0) {
-		ERROR_WITH_ERRNO("Error converting UTF-16LE string to UTF-8");
-		ERROR("The failing string was:");
-		print_string(utf16_str, utf16_nbytes);
-		putchar('\n');
+static void
+iconv_cleanup(struct iconv_list_head *head)	/* close and free every idle descriptor in @head */
+{
+	pthread_mutex_destroy(&head->mutex);	/* NOTE(review): pool presumably must not be used again afterwards -- confirm */
+	while (!list_empty(&head->list)) {
+		struct iconv_node *i;
+		/* Detach the first node, then release its descriptor and memory. */
+		i = container_of(head->list.next, struct iconv_node, list);
+		list_del(&i->list);
+		iconv_close(i->cd);
+		FREE(i);
 	}
-#endif /* ENABLE_ERROR_MESSAGES */
-	return ret;
 }
 
-
-/* Converts a string in the UTF-8 encoding to a newly allocated string in the
- * UTF-16 encoding.
- *
- * If available, do so by calling a similar function from libntfs-3g.
- * Otherwise, use iconv() along with the helper function utf8_to_utf16_size().
- */
-int utf8_to_utf16(const char *utf8_str, size_t utf8_nbytes,
-		  char **utf16_str_ret, size_t *utf16_nbytes_ret)
+void
+iconv_global_cleanup()
 {
-	int ret;
-	if (utf8_nbytes == 0) {
-		*utf16_str_ret = NULL;
-		*utf16_nbytes_ret = 0;
-		return 0;
-	}
-#ifdef WITH_NTFS_3G
-	char *outs = NULL;
-	int outs_nchars = ntfs_mbstoucs(utf8_str, (ntfschar**)&outs);
-	if (outs_nchars >= 0) {
-		*utf16_str_ret = outs;
-		*utf16_nbytes_ret = (size_t)outs_nchars * 2;
-		ret = 0;
-	} else {
-		if (errno == ENOMEM)
-			ret = WIMLIB_ERR_NOMEM;
-		else
-			ret = WIMLIB_ERR_INVALID_UTF8_STRING;
-	}
-#else /* !WITH_NTFS_3G */
-
-	ret = iconv_global_init();
-	if (ret != 0)
-		return ret;
-	ret = utf8_to_utf16_size(utf8_str);
-	if (ret >= 0) {
-		size_t utf16_expected_nbytes;
-		char  *utf16_str;
-		size_t utf16_bytes_left;
-		size_t utf8_bytes_left;
-		size_t num_chars_converted;
-		const char *utf8_str_save;
-		char  *utf16_str_save;
-
-		utf16_expected_nbytes = (size_t)ret * 2;
- 		utf16_str = MALLOC(utf16_expected_nbytes + 2);
-		if (utf16_str) {
-			utf16_bytes_left = utf16_expected_nbytes;
-			utf8_bytes_left = utf8_nbytes;
-			utf8_str_save = utf8_str;
-			utf16_str_save = utf16_str;
-			num_chars_converted = iconv(cd_utf8_to_utf16,
-						    (char**)&utf8_str,
-						    &utf8_bytes_left,
-						    &utf16_str,
-						    &utf16_bytes_left);
-			utf8_str = utf8_str_save;
-			utf16_str = utf16_str_save;
-			if (utf16_bytes_left == 0 &&
-			    utf8_bytes_left == 0 &&
-			    num_chars_converted != (size_t)(-1))
-			{
-				utf16_str[utf16_expected_nbytes] = '\0';
-				utf16_str[utf16_expected_nbytes + 1] = '\0';
-				*utf16_str_ret = utf16_str;
-				*utf16_nbytes_ret = utf16_expected_nbytes;
-				ret = 0;
-			} else {
-				FREE(utf16_str);
-				ret = WIMLIB_ERR_INVALID_UTF8_STRING;
-			}
-		} else
-			ret = WIMLIB_ERR_NOMEM;
-	} else
-		ret = WIMLIB_ERR_INVALID_UTF8_STRING;
-#endif /* WITH_NTFS_3G */
+	iconv_cleanup(&iconv_utf16le_to_mbs);
+	iconv_cleanup(&iconv_mbs_to_utf16le);
+	iconv_cleanup(&iconv_utf8_to_mbs);
+}
 
-#ifdef ENABLE_ERROR_MESSAGES
-	if (ret != 0) {
-		ERROR_WITH_ERRNO("Error converting UTF-8 string to UTF-16LE");
-		ERROR("The failing string was:");
-		print_string(utf8_str, utf8_nbytes);
-		putchar('\n');
-		ERROR("Length: %zu bytes", utf8_nbytes);
-	}
-#endif /* ENABLE_ERROR_MESSAGES */
-	return ret;
+bool	/* true iff the null-terminated string @utf8_str contains a byte above 127 */
+utf8_str_contains_nonascii_chars(const utf8char *utf8_str)
+{
+	for (; *utf8_str; utf8_str++) {	/* stops AT the terminator, so "" is never over-read */
+		if ((unsigned char)*utf8_str > 127)
+			return true;
+	}
+	return false;
 }