From 1c1c12926f4de39cb35d8d4c5a5280ab0d6ba931 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 10 Nov 2014 18:10:41 -0600 Subject: [PATCH] LZX, LZMS: Annotate unaligned memory accesses in x86 filtering --- Makefile.am | 1 + NEWS | 2 ++ README | 5 ++--- include/wimlib/unaligned.h | 38 ++++++++++++++++++++++++++++++++++++++ src/lzms-common.c | 17 +++++++++-------- src/lzx-common.c | 23 +++++++++++------------ 6 files changed, 63 insertions(+), 23 deletions(-) create mode 100644 include/wimlib/unaligned.h diff --git a/Makefile.am b/Makefile.am index 612de3ec..2c94bdcb 100644 --- a/Makefile.am +++ b/Makefile.am @@ -126,6 +126,7 @@ libwim_la_SOURCES = \ include/wimlib/textfile.h \ include/wimlib/timestamp.h \ include/wimlib/types.h \ + include/wimlib/unaligned.h \ include/wimlib/unix_data.h \ include/wimlib/util.h \ include/wimlib/version.h \ diff --git a/NEWS b/NEWS index 7ade11fd..159a0ed9 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,8 @@ Version 1.7.3-BETA: Fix for very slow export from solid WIM / ESD files. + Fix for LZX and LZMS algorithms on non-x86 architectures, such as ARM. + New progress message: WIMLIB_PROGRESS_MSG_HANDLE_ERROR. Applications may use this to treat some types of errors as non-fatal. diff --git a/README b/README index b64d6e88..e8f55233 100644 --- a/README +++ b/README @@ -316,9 +316,8 @@ extraction are implemented separately for Windows, UNIX, and UNIX (NTFS-3g mode), to ensure a fast and feature-rich implementation of each platform/mode. wimlib is mainly used on x86 and x86_64 CPUs, but it should also work on a -number of other GCC-supported 32-bit or 64-bit architectures. No assumptions -are made about endianness, but some code assumes that unaligned memory accesses -are supported and relatively efficient. +number of other GCC-supported 32-bit or 64-bit architectures. It has been +tested on the ARM architecture. Currently, gcc and clang are the only supported compilers. A few nonstandard extensions are used in the code. diff --git a/include/wimlib/unaligned.h b/include/wimlib/unaligned.h new file mode 100644 index 00000000..d30c9f20 --- /dev/null +++ b/include/wimlib/unaligned.h @@ -0,0 +1,38 @@ +/* + * unaligned.h + * + * Inline functions for unaligned memory accesses. + * + * The author dedicates this file to the public domain. + * You can do whatever you want with this file. 
+ */ + +#ifndef _WIMLIB_UNALIGNED_H +#define _WIMLIB_UNALIGNED_H + +#include "compiler.h" +#include "endianness.h" +#include "types.h" + +#define DEFINE_UNALIGNED_TYPE(type) \ +struct type##_unaligned { \ + type v; \ +} _packed_attribute; \ + \ +static inline type \ +load_##type##_unaligned(const void *p) \ +{ \ + return ((const struct type##_unaligned *)p)->v; \ +} \ + \ +static inline void \ +store_##type##_unaligned(type val, void *p) \ +{ \ + ((struct type##_unaligned *)p)->v = val; \ +} + +DEFINE_UNALIGNED_TYPE(le16); +DEFINE_UNALIGNED_TYPE(le32); +DEFINE_UNALIGNED_TYPE(le64); + +#endif /* _WIMLIB_UNALIGNED_H */ diff --git a/src/lzms-common.c b/src/lzms-common.c index b5d4fcd5..95682d53 100644 --- a/src/lzms-common.c +++ b/src/lzms-common.c @@ -28,6 +28,7 @@ #include "wimlib/endianness.h" #include "wimlib/lzms.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #include @@ -157,20 +158,20 @@ lzms_maybe_do_x86_translation(u8 data[restrict], s32 i, s32 num_op_bytes, if (i - *closest_target_usage_p <= max_trans_offset) { LZMS_DEBUG("Undid x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); - le32 *p32 = (le32*)&data[i + num_op_bytes]; - u32 n = le32_to_cpu(*p32); - *p32 = cpu_to_le32(n - i); + void *p32 = &data[i + num_op_bytes]; + u32 n = le32_to_cpu(load_le32_unaligned(p32)); + store_le32_unaligned(cpu_to_le32(n - i), p32); } - pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]); + pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); } else { - pos = i + le16_to_cpu(*(const le16*)&data[i + num_op_bytes]); + pos = i + le16_to_cpu(load_le16_unaligned(&data[i + num_op_bytes])); if (i - *closest_target_usage_p <= max_trans_offset) { LZMS_DEBUG("Did x86 translation at position %d " "(opcode 0x%02x)", i, data[i]); - le32 *p32 = (le32*)&data[i + num_op_bytes]; - u32 n = le32_to_cpu(*p32); - *p32 = cpu_to_le32(n + i); + void *p32 = &data[i + num_op_bytes]; + u32 n = le32_to_cpu(load_le32_unaligned(p32)); + store_le32_unaligned(cpu_to_le32(n + i), p32); } } diff --git a/src/lzx-common.c b/src/lzx-common.c index dac43bad..9f55f171 100644 --- a/src/lzx-common.c +++ b/src/lzx-common.c @@ -25,6 +25,7 @@ #include "wimlib/endianness.h" #include "wimlib/lzx.h" +#include "wimlib/unaligned.h" #include "wimlib/util.h" #ifdef __SSE2__ @@ -120,12 +121,11 @@ lzx_get_num_main_syms(unsigned window_order) } static void -do_translate_target(sle32 *target, s32 input_pos) +do_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - /* XXX: This assumes unaligned memory accesses are okay. */ - rel_offset = le32_to_cpu(*target); + rel_offset = le32_to_cpu(load_le32_unaligned(target)); if (rel_offset >= -input_pos && rel_offset < LZX_WIM_MAGIC_FILESIZE) { if (rel_offset < LZX_WIM_MAGIC_FILESIZE - input_pos) { /* "good translation" */ @@ -134,30 +134,29 @@ do_translate_target(sle32 *target, s32 input_pos) /* "compensating translation" */ abs_offset = rel_offset - LZX_WIM_MAGIC_FILESIZE; } - *target = cpu_to_le32(abs_offset); + store_le32_unaligned(cpu_to_le32(abs_offset), target); } } static void -undo_translate_target(sle32 *target, s32 input_pos) +undo_translate_target(void *target, s32 input_pos) { s32 abs_offset, rel_offset; - /* XXX: This assumes unaligned memory accesses are okay. 
*/ - abs_offset = le32_to_cpu(*target); + abs_offset = le32_to_cpu(load_le32_unaligned(target)); if (abs_offset >= 0) { if (abs_offset < LZX_WIM_MAGIC_FILESIZE) { /* "good translation" */ rel_offset = abs_offset - input_pos; - *target = cpu_to_le32(rel_offset); + store_le32_unaligned(cpu_to_le32(rel_offset), target); } } else { if (abs_offset >= -input_pos) { /* "compensating translation" */ rel_offset = abs_offset + LZX_WIM_MAGIC_FILESIZE; - *target = cpu_to_le32(rel_offset); + store_le32_unaligned(cpu_to_le32(rel_offset), target); } } } @@ -194,7 +193,7 @@ inline /* Although inlining the 'process_target' function still speeds up the SSE2 case, it bloats the binary more. */ #endif void -lzx_e8_filter(u8 *data, u32 size, void (*process_target)(sle32 *, s32)) +lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32)) { #ifdef __SSE2__ /* SSE2 vectorized implementation for x86_64. This speeds up LZX @@ -249,7 +248,7 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(sle32 *, s32)) /* Do (or undo) the e8 translation. */ u8 *p8 = (u8 *)p128 + bit; - (*process_target)((sle32 *)(p8 + 1), + (*process_target)(p8 + 1, p8 - data); /* Don't start an e8 translation in the @@ -279,7 +278,7 @@ lzx_e8_filter(u8 *data, u32 size, void (*process_target)(sle32 *, s32)) u8 *p8_end = data + size - 10; do { if (*p8 == 0xe8) { - (*process_target)((sle32 *)(p8 + 1), p8 - data); + (*process_target)(p8 + 1, p8 - data); p8 += 5; } else { p8++; -- 2.43.0
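
Note on the technique: DEFINE_UNALIGNED_TYPE relies on the packed-struct
idiom for unaligned access.  The standalone sketch below shows the same
pattern with plain uint32_t instead of wimlib's le16/le32/le64 types;
the names u32_unaligned, load_u32_unaligned, and store_u32_unaligned are
illustrative only, not part of wimlib's API.  Declaring the wrapper
struct with __attribute__((packed)) (wimlib's _packed_attribute) gives
it an alignment of 1, so GCC and Clang know the pointer may be
misaligned and emit byte-wise accesses on strict-alignment targets such
as older ARM cores, while still generating a single word access on x86,
where unaligned loads are cheap.  By contrast, the casts this patch
removes, such as (le32*)&data[i + num_op_bytes], dereference a pointer
whose alignment the compiler is entitled to assume, which is undefined
behavior at a misaligned address and can trap (SIGBUS) on
strict-alignment hardware.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Wrapper whose packed attribute forces an alignment of 1, telling the
 * compiler that accesses through it may be misaligned. */
struct u32_unaligned {
	uint32_t v;
} __attribute__((packed));

static inline uint32_t
load_u32_unaligned(const void *p)
{
	return ((const struct u32_unaligned *)p)->v;
}

static inline void
store_u32_unaligned(uint32_t val, void *p)
{
	((struct u32_unaligned *)p)->v = val;
}

int
main(void)
{
	uint8_t buf[8] = { 0 };

	/* buf + 1 is not 4-byte aligned; a direct cast to uint32_t *
	 * and dereference here would be undefined behavior and can
	 * trap on strict-alignment CPUs. */
	store_u32_unaligned(0xdeadbeef, buf + 1);
	printf("0x%08" PRIx32 "\n", load_u32_unaligned(buf + 1));
	return 0;
}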
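
The e8 filtering itself can be sanity-checked in isolation.  The sketch
below round-trips one call-instruction target through the same
relative-to-absolute and absolute-to-relative logic as
do_translate_target() and undo_translate_target(), assuming a
little-endian host for brevity (the real code converts through
le32_to_cpu()/cpu_to_le32(), so it is endian-safe).  The constant
12000000 mirrors LZX_WIM_MAGIC_FILESIZE, the fixed file size that the
WIM variant of LZX hardcodes for this preprocessing step; the helper
names are again illustrative.  memcpy() is shown here as the other
common portable idiom for unaligned access; GCC and Clang compile these
fixed-size copies down to single loads and stores wherever that is safe.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define MAGIC_FILESIZE 12000000	/* mirrors LZX_WIM_MAGIC_FILESIZE */

/* Portable unaligned load/store via fixed-size memcpy(). */
static uint32_t
load_u32(const void *p)
{
	uint32_t v;
	memcpy(&v, p, sizeof(v));
	return v;
}

static void
store_u32(uint32_t v, void *p)
{
	memcpy(p, &v, sizeof(v));
}

/* Relative => absolute, as in do_translate_target() (compression). */
static void
translate(void *target, int32_t input_pos)
{
	int32_t rel = (int32_t)load_u32(target);

	if (rel >= -input_pos && rel < MAGIC_FILESIZE) {
		int32_t abs_off = (rel < MAGIC_FILESIZE - input_pos)
				  ? rel + input_pos	   /* "good" */
				  : rel - MAGIC_FILESIZE;  /* "compensating" */
		store_u32((uint32_t)abs_off, target);
	}
}

/* Absolute => relative, as in undo_translate_target() (decompression). */
static void
untranslate(void *target, int32_t input_pos)
{
	int32_t abs_off = (int32_t)load_u32(target);

	if (abs_off >= 0) {
		if (abs_off < MAGIC_FILESIZE)
			store_u32((uint32_t)(abs_off - input_pos), target);
	} else {
		if (abs_off >= -input_pos)
			store_u32((uint32_t)(abs_off + MAGIC_FILESIZE), target);
	}
}

int
main(void)
{
	/* The 0xe8 opcode sits at offset 2, so input_pos = 2 and the
	 * 32-bit displacement (0x100) starts at buf + 3, which need not
	 * be 4-byte aligned. */
	uint8_t buf[8] = { 0x90, 0x90, 0xe8, 0x00, 0x01, 0x00, 0x00, 0x90 };
	int32_t input_pos = 2;

	translate(buf + 3, input_pos);
	untranslate(buf + 3, input_pos);
	assert(load_u32(buf + 3) == 0x100);	/* round-trips exactly */
	return 0;
}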